Unverified commit 8f5f6496 authored by Chao Liu, committed by GitHub

backward data (#7)

* enabled atomic add in tensor copy
* added gridwise GEMM
* added backward data conv using GEMM + atomic
* added backward data conv using GEMM, no atomic
parent 31ded4ac
@@ -33,7 +33,15 @@ namespace ck {
 enum AddressSpace
 {
     generic,
-    global = generic
+    global,
+    lds,
+    vgpr
+};
+
+enum InMemoryDataOperation
+{
+    none,
+    atomic_add
 };

 #if CK_UNSIGNED_INDEX_TYPE
......
@@ -64,9 +64,8 @@ struct static_if<true>
     }

     template <typename F>
-    __host__ __device__ static constexpr auto Else(F)
+    __host__ __device__ static void Else(F)
     {
-        return Type{};
     }
 };
@@ -82,14 +81,13 @@ struct static_if<false>
     }

     template <typename F>
-    __host__ __device__ static constexpr auto Else(F f)
+    __host__ __device__ static void Else(F f)
     {
         // This is a trick for compiler:
         // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
         // this will make "f" a generic lambda, so that "f" won't be compiled until being
         // instantiated here
         f(forwarder{});
-        return Type{};
     }
 };
......
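A minimal usage sketch of the pattern above, for orientation only: it chains the call operator with .Else() exactly as the new in-memory-operation header below does. The function name, the assumption that static_if and AddressSpace live in namespace ck here, and the assumption that forwarder simply passes its argument through (suggested by the fwd(false) usage in that header) are mine, not part of the commit.

template <ck::AddressSpace Space, typename T>
__device__ void store_example(T* p_dst, ck::index_t offset, T value)
{
    using namespace ck;

    static_if<Space == AddressSpace::global>{}([&](auto) {
        // this branch is instantiated only when Space == global
        p_dst[offset] = value;
    }).Else([&](auto fwd) {
        // "fwd" keeps this body dependent on a template parameter, so it is
        // not compiled unless the false branch is actually selected
        p_dst[fwd(offset)] = value;
    });
}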
#ifndef CK_IN_MEMORY_OPERATION_AMD_HPP
#define CK_IN_MEMORY_OPERATION_AMD_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
#if CK_USE_AMD_BUFFER_ADDRESSING
// TODO: use static_if::ElseIf, instead of nested static_if
static_if<SrcAddressSpace == AddressSpace::global && DstAddressSpace == vgpr>{}([&](auto) {
// buffer_load requires:
// 1) p_src must be in global memory space, p_dst must be in vgpr
// 2) p_src must be a block-invariant pointer.
// It is the user's responsibility to make sure both conditions hold.
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}).Else([&](auto) {
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == global>{}([&](auto) {
// buffer_store requires:
// 1) p_src must be in vgpr space, p_dst must be in global memory
// 2) p_dst must be a block-invariant pointer.
// It is the user's responsibility to make sure both conditions hold.
amd_intrinsic_buffer_store<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
}).Else([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
#else
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
#endif
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
})
.Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::none ||
DstInMemOp == InMemoryDataOperation::atomic_add,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
} // namespace ck
#endif
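A hedged sketch of how the new move_data entry point might be used from device code to accumulate a register value into global memory. Only move_data, AddressSpace and InMemoryDataOperation come from the header above; the helper name, the float element type and the offsets are illustrative assumptions.

__device__ void accumulate_one(float* p_global, ck::index_t dst_offset, float value)
{
    using namespace ck;

    float r[1] = {value}; // the source value lives in registers (vgpr)

    // InMemoryDataOperation::atomic_add selects atomic_add_data, which is what
    // the "GEMM + atomic" backward-data path in this commit relies on when
    // several threads contribute to the same input pixel
    move_data<float, 1, AddressSpace::vgpr, AddressSpace::global,
              InMemoryDataOperation::atomic_add>(r, 0, p_global, dst_offset);
}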
#ifndef CK_IN_MEMORY_OPERATION_NVIDIA_HPP
#define CK_IN_MEMORY_OPERATION_NVIDIA_HPP
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
})
.Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::none ||
DstInMemOp == InMemoryDataOperation::atomic_add,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
} // namespace ck
#endif
@@ -97,12 +97,57 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
     return x < y ? x : y;
 }

-// this is WRONG
-// TODO: implement least common multiple properly, instead of calling max()
-template <class T, class... Ts>
-__host__ __device__ constexpr T lcm(T x, Ts... xs)
+// highest common factor
+template <typename T>
+__host__ __device__ constexpr T hcf(T x, T y)
+{
+    if(x == 0)
+    {
+        return y;
+    }
+
+    if(y == 0)
+    {
+        return x;
+    }
+
+    if(x == y)
+    {
+        return x;
+    }
+
+    if(x > y)
+    {
+        return hcf(x - y, y);
+    }
+
+    return hcf(x, y - x);
+}
+
+template <index_t X, index_t Y>
+__host__ __device__ constexpr auto hcf(Number<X>, Number<Y>)
+{
+    constexpr auto result = hcf(X, Y);
+    return Number<result>{};
+}
+
+template <typename X, typename... Ys>
+__host__ __device__ constexpr auto hcf(X x, Ys... ys)
+{
+    return hcf(x, ys...);
+}
+
+// least common multiple
+template <typename T>
+__host__ __device__ constexpr T lcm(T x, T y)
+{
+    return (x * y) / hcf(x, y);
+}
+
+template <typename X, typename Y, typename... Zs>
+__host__ __device__ constexpr auto lcm(X x, Y y, Zs... zs)
 {
-    return max(x, xs...);
+    return lcm(x, lcm(y, zs...));
 }

 template <class T>
......
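A quick illustration of the new helpers (the values and the static_asserts are mine, not part of the commit): hcf is the subtraction-based GCD, so hcf(6, 4) -> hcf(2, 4) -> hcf(2, 2) -> 2, and lcm(6, 4) = (6 * 4) / hcf(6, 4) = 12. The backward-data driver later in this commit calls hcf(ConvStrideH, ConvDilationH) in exactly this way.

// illustrative compile-time check, in any TU that includes the math header
static_assert(ck::math::hcf(6, 4) == 2, "subtraction-based GCD");
static_assert(ck::math::lcm(6, 4) == 12, "lcm computed via hcf");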
@@ -15,10 +15,18 @@ install(TARGETS host LIBRARY DESTINATION lib)
 if(DEVICE_BACKEND STREQUAL "AMD")
-    set(DRIVER_SOURCE src/driver.cpp)
+    set(CONV_SOURCE src/conv_driver.cpp)
+    set(COL2IM_SOURCE src/col2im_driver.cpp)
+    set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
-    set(DRIVER_SOURCE src/driver.cu)
+    set(CONV_SOURCE src/conv_driver.cu)
+    set(COL2IM_SOURCE src/col2im_driver.cu)
+    set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
 endif()

-add_executable(driver ${DRIVER_SOURCE})
-target_link_libraries(driver PRIVATE host)
+add_executable(conv ${CONV_SOURCE})
+add_executable(col2im ${COL2IM_SOURCE})
+add_executable(conv_bwd_data ${CONV_BWD_DATA_SOURCE})
+
+target_link_libraries(conv PRIVATE host)
+target_link_libraries(col2im PRIVATE host)
+target_link_libraries(conv_bwd_data PRIVATE host)
@@ -2,10 +2,16 @@
 #define CONV_COMMON_HPP

 #include "ConstantTensorDescriptor_deprecated.hpp"
+#include "tensor_descriptor.hpp"

-// this is ugly, only for 4d
-template <class InDesc, class WeiDesc>
-constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
+template <class InDesc,
+          class WeiDesc,
+          class ConvStrides,
+          class ConvDilations,
+          class LowerPads,
+          class UpperPads>
+constexpr auto get_convolution_output_default_4d_tensor_descriptor_deprecated(
+    InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
 {
     using namespace ck;
@@ -22,18 +28,27 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe
     static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
                   "input & weight dimension not consistent");

-    constexpr auto N  = in_desc.GetLength(I0);
-    constexpr auto HI = in_desc.GetLength(I2);
-    constexpr auto WI = in_desc.GetLength(I3);
+    constexpr index_t N  = in_desc.GetLength(I0);
+    constexpr index_t Hi = in_desc.GetLength(I2);
+    constexpr index_t Wi = in_desc.GetLength(I3);

-    constexpr auto K = wei_desc.GetLength(I0);
-    constexpr auto Y = wei_desc.GetLength(I2);
-    constexpr auto X = wei_desc.GetLength(I3);
+    constexpr index_t K = wei_desc.GetLength(I0);
+    constexpr index_t Y = wei_desc.GetLength(I2);
+    constexpr index_t X = wei_desc.GetLength(I3);
+
+    constexpr index_t HPadLow = LowerPads{}.Get(I0);
+    constexpr index_t WPadLow = LowerPads{}.Get(I1);
+
+    constexpr index_t HPadUp = UpperPads{}.Get(I0);
+    constexpr index_t WPadUp = UpperPads{}.Get(I1);
+
+    constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1;
+    constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1;

-    constexpr auto HO = HI + 1 - Y;
-    constexpr auto WO = WI + 1 - X;
+    constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
+    constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;

-    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
 }

 template <class InDesc,
@@ -42,7 +57,7 @@ template <class InDesc,
           class ConvDilations,
           class LowerPads,
           class UpperPads>
-constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(
     InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
 {
     using namespace ck;
@@ -80,7 +95,7 @@ constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
     constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
     constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;

-    return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
+    return make_native_tensor_descriptor_packed(Sequence<N, K, Ho, Wo>{});
 }

 template <class InDesc, class WeiDesc, class OutDesc>
......
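A sanity check of the padded/dilated output-size formula above, with illustrative numbers that are not taken from the commit: Hi = 28, Y = 3, dilation 2, stride 2, and padding 1 on both sides.

// YEff = (3 - 1) * 2 + 1 = 5
// Ho   = (28 + 1 + 1 - 5) / 2 + 1 = 13   (integer division)
constexpr int YEff_example = (3 - 1) * 2 + 1;
constexpr int Ho_example   = (28 + 1 + 1 - 5) / 2 + 1;
static_assert(YEff_example == 5 && Ho_example == 13, "worked example of the output-size formula");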
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"
template <typename T,
typename ColDesc,
typename ImgDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_col2im_eb_nchw(ColDesc,
const Tensor<T>& col_eb,
ImgDesc,
Tensor<T>& img_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr auto col_eb_desc = ColDesc{};
constexpr auto img_nchw_desc = ImgDesc{};
constexpr index_t N = img_nchw_desc.GetLengths()[0];
constexpr index_t C = img_nchw_desc.GetLengths()[1];
constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
constexpr index_t Wi = img_nchw_desc.GetLengths()[3];
constexpr index_t E = col_eb_desc.GetLengths()[0];
constexpr index_t B = col_eb_desc.GetLengths()[1];
std::size_t data_sz = sizeof(T);
DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());
col_eb_device_buf.ToDevice(col_eb.mData.data());
img_nchw_device_buf.ToDevice(img_nchw.mData.data());
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t EPerBlock = 128;
constexpr index_t BPerBlock = 128;
using BlockCopySubLengths_E_B = Sequence<8, 8>;
using BlockCopyClusterLengths_E_B = Sequence<16, 16>;
using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using BlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using BlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw<GridSize,
BlockSize,
T,
ColDesc,
ImgDesc,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
BlockCopyDataPerAccess_B>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
const T* const __restrict__,
T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_col2im,
const_cast<const T* const __restrict__>(
static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
const_cast<T* const __restrict__>(
static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
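For orientation, an illustrative sizing of this col2im driver (the numbers are assumptions, not from the commit), using the E = C * Y * X and B = N * Ho * Wo layout that the host col2im reference later in this commit implies:

// N = 64, C = 256, 3x3 filter, 14x14 output (illustrative values only)
constexpr int E_example = 256 * 3 * 3;   // 2304 rows of the column matrix
constexpr int B_example = 64 * 14 * 14;  // 12544 columns of the column matrix
constexpr int Grid_example =
    ((E_example + 127) / 128) * ((B_example + 127) / 128); // 18 * 98 = 1764 workgroups of 256 threads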
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc in_nchw_desc,
Tensor<T>& in_nchw,
WeiDesc wei_kcyx_desc,
const Tensor<T>& wei_kcyx,
OutDesc out_nkhw_desc,
const Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr index_t N = out_nkhw_desc.GetLengths()[0];
constexpr index_t K = out_nkhw_desc.GetLengths()[1];
constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];
constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
constexpr index_t X = wei_kcyx_desc.GetLengths()[3];
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopySubLengths = Sequence<1, 4>; // Gemm-K, Gemm-M
using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M
constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M
using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N
using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N
constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N
constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#endif
constexpr index_t GemmM = C * Y * X;
constexpr index_t GemmN = N * Ho * Wo;
constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / GemmMPerBlock) *
((GemmN + GemmNPerBlock - 1) / GemmNPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmThreadGemmDataPerReadM,
GemmThreadGemmDataPerReadN,
GemmABlockCopySubLengths,
GemmABlockCopyClusterLengths,
GemmABlockCopyDataPerAccess,
GemmBBlockCopySubLengths,
GemmBBlockCopyClusterLengths,
GemmBBlockCopyDataPerAccess,
GemmCThreadCopyDataPerAccess>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_conv,
const_cast<T* const __restrict__>(
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
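The GEMM dimensions defined just above are GemmM = C * Y * X and GemmN = N * Ho * Wo. A hedged numeric example (sizes chosen for illustration, not taken from the commit):

// C = 128, Y = X = 3, N = 32, Ho = Wo = 14:
//   GemmM    = 128 * 9  = 1152
//   GemmN    = 32 * 196 = 6272
//   GridSize = ceil(1152 / 128) * ceil(6272 / 128) = 9 * 49 = 441 workgroups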
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp"
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc in_nchw_desc,
Tensor<T>& in_nchw,
WeiDesc wei_kcyx_desc,
const Tensor<T>& wei_kcyx,
OutDesc out_nkhw_desc,
const Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr index_t N = out_nkhw_desc.GetLengths()[0];
constexpr index_t K = out_nkhw_desc.GetLengths()[1];
constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];
constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
constexpr index_t X = wei_kcyx_desc.GetLengths()[3];
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 32;
constexpr index_t EPerBlock = 32;
constexpr index_t KPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using OutBlockCopySubLengths_K_B_N0 = Sequence<1, 1, 4>;
using OutBlockCopyClusterLengths_K_B_N0 = Sequence<8, 32, 1>;
constexpr index_t OutBlockCopySrcDataPerRead_B = 1;
constexpr index_t OutBlockCopyDstDataPerWrite_N0 = 4;
using WeiBlockCopySubLengths_K_E_C0 = Sequence<1, 4, 1>;
using WeiBlockCopyClusterLengths_K_E_C0 = Sequence<8, 8, 4>;
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_C0 = 1;
constexpr index_t InThreadCopyDstDataPerWrite_B = 1;
#endif
constexpr index_t C0 = GemmMPerThreadSubC;
constexpr index_t N0 = GemmNPerThreadSubC;
constexpr index_t C1 = C / C0;
constexpr index_t N1 = N / N0;
constexpr index_t E = C1 * Y * X;
constexpr index_t B = (N1 * Ho * Wo);
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
KPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
OutBlockCopySubLengths_K_B_N0,
OutBlockCopyClusterLengths_K_B_N0,
OutBlockCopySrcDataPerRead_B,
OutBlockCopyDstDataPerWrite_N0,
WeiBlockCopySubLengths_K_E_C0,
WeiBlockCopyClusterLengths_K_E_C0,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_C0,
InThreadCopyDstDataPerWrite_B>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_conv,
const_cast<T* const __restrict__>(
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
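In this v1r2 variant the GEMM dimensions are pre-split by the per-thread sub-tile sizes (C0 = GemmMPerThreadSubC, N0 = GemmNPerThreadSubC), which appears to assume C and N are divisible by those factors. A hedged numeric example with illustrative values:

// C = 256, N = 64, Y = X = 3, Ho = Wo = 14, C0 = N0 = 4:
//   C1 = C / C0 = 64,  N1 = N / N0 = 16
//   E  = C1 * Y * X   = 64 * 9   = 576
//   B  = N1 * Ho * Wo = 16 * 196 = 3136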
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc in_nchw_desc,
Tensor<T>& in_nchw,
WeiDesc wei_kcyx_desc,
const Tensor<T>& wei_kcyx,
OutDesc out_nkhw_desc,
const Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr index_t N = out_nkhw_desc.GetLengths()[0];
constexpr index_t K = out_nkhw_desc.GetLengths()[1];
constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];
constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
constexpr index_t X = wei_kcyx_desc.GetLengths()[3];
constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-M
using GemmABlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-M
constexpr index_t GemmABlockCopyDataPerAccess = 1; // Gemm-M
using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N
using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N
constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N
constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#elif 0
// BlockSize = 256, each thread holds 64 values
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadGemmDataPerReadM = 4;
constexpr index_t GemmThreadGemmDataPerReadN = 4;
using GemmABlockCopySubLengths = Sequence<1, 4>; // Gemm-K, Gemm-M
using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M
constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M
using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N
using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N
constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N
constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#endif
// TODO: this algorithm supports arbitrary stride and dilation, but for now keep them
// fixed at 1 for simplicity
constexpr index_t hcf_stride_dilation_h = math::hcf(ConvStrideH, ConvDilationH);
constexpr index_t hcf_stride_dilation_w = math::hcf(ConvStrideW, ConvDilationW);
constexpr index_t Ytilda = ConvStrideH / hcf_stride_dilation_h; // may be wrong
constexpr index_t Xtilda = ConvStrideW / hcf_stride_dilation_w; // may be wrong
constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);
constexpr index_t right_pad_ho = (ConvDilationH / hcf_stride_dilation_h) * (Y - Ytilda);
constexpr index_t right_pad_wo = (ConvDilationW / hcf_stride_dilation_w) * (X - Xtilda);
constexpr index_t Htilda = Ho + right_pad_ho;
constexpr index_t Wtilda = Wo + right_pad_wo;
constexpr index_t GemmK = K * Ydot * Xdot;
constexpr index_t GemmM = C * Ytilda * Xtilda;
constexpr index_t GemmN = N * Htilda * Wtilda;
constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / GemmMPerBlock) *
((GemmN + GemmNPerBlock - 1) / GemmNPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmThreadGemmDataPerReadM,
GemmThreadGemmDataPerReadN,
GemmABlockCopySubLengths,
GemmABlockCopyClusterLengths,
GemmABlockCopyDataPerAccess,
GemmBBlockCopySubLengths,
GemmBBlockCopyClusterLengths,
GemmBBlockCopyDataPerAccess,
GemmCThreadCopyDataPerAccess>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
T* const __restrict__,
const T* const __restrict__,
const T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_conv,
const_cast<T* const __restrict__>(
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
const_cast<const T* const __restrict__>(
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
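The tilda/dot splitting above divides the filter by the stride/dilation ratio before forming the GEMM. A worked example with assumed values (not from the commit):

// ConvStrideH = 2, ConvDilationH = 1, Y = 3, Ho = 28:
//   hcf(2, 1) = 1, so Ytilda = 2 / 1 = 2
//   Ydot = ceil(Y / Ytilda) = ceil(3 / 2) = 2
//   right_pad_ho = (ConvDilationH / 1) * (Y - Ytilda) = 1 * (3 - 2) = 1
//   Htilda = Ho + right_pad_ho = 29
// GemmK then picks up a factor Ydot (= 2) and GemmM a factor Ytilda (= 2).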
@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "convolution_common.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
@@ -54,8 +54,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
-    // BlockSize = 256, each thread hold 64 data
+#if 0
+    // BlockSize = 256, EperBlock = 8, each thread hold 64 data
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 16;
@@ -89,6 +89,43 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // BlockSize = 256, EPerBlock = 16, each thread hold 64 data
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 16;
+
+    constexpr index_t GemmNRepeat = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA = 4;
+    constexpr index_t GemmDataPerReadB = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<16, 1, 16, 1>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
+    using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K = Sequence<4, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
     constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
@@ -221,13 +258,20 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
+                                                          const T* const __restrict__,
+                                                          const T* const __restrict__,
+                                                          T* const __restrict__>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
                                    0,
-                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+                                   gridwise_conv,
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
+                                   const_cast<T* const __restrict__>(
+                                       static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

         printf("Elapsed time : %f ms, %f TFlop/s\n",
                time,
......
@@ -46,7 +46,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;
@@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

-#elif 1
+#elif 0
     // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data
     constexpr index_t BlockSize = 256;
@@ -157,6 +157,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

+#elif 1
+    constexpr index_t BlockSize = 64;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 32;
+    constexpr index_t EPerBlock = 4;
+
+    constexpr index_t GemmNRepeat = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 1;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA = 4;
+    constexpr index_t GemmDataPerReadB = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<4, 1, 16, 1>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
+    using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K = Sequence<1, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 16>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
+
 #endif

     constexpr index_t N1 = GemmNRepeat;
......
@@ -51,6 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

 #if 1
+    // BlockSize = 256, EPerBlock = 8
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
@@ -85,7 +86,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

     constexpr index_t OutThreadCopyDataPerAccess_B = 1;
-#elif 1
+#elif 0
+    // BlockSize = 256, EPerBlock = 8
     // 1x1 filter, 8x8 image
     constexpr index_t BlockSize = 256;
@@ -122,6 +124,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t OutThreadCopyDataPerAccess_B = 4;
 #elif 0
+    // BlockSize = 256, EPerBlock = 16
+    // 1x1 filter, 8x8 image
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 16;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA = 4;
+    constexpr index_t GemmDataPerReadB = 4;
+
+    using InBlockCopySubLengths_E_B = Sequence<2, 4>;
+    using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 4;
+
+    using WeiBlockCopySubLengths_E_K = Sequence<4, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t BlockSize = 256;
@@ -167,47 +206,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

     constexpr auto gridwise_conv =
-#if 0
-        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
-#else
-        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
-#endif
-        <GridSize,
-         BlockSize,
-         T,
-         decltype(in_nchw_desc),
-         decltype(wei_kcyx_desc),
-         decltype(out_nkhw_desc),
-         ConvStrides,
-         ConvDilations,
-         LeftPads,
-         RightPads,
-         BPerBlock,
-         KPerBlock,
-         EPerBlock,
-         GemmMPerThreadSubC,
-         GemmNPerThreadSubC,
-         GemmMLevel0Cluster,
-         GemmNLevel0Cluster,
-         GemmMLevel1Cluster,
-         GemmNLevel1Cluster,
-         GemmKPerThreadLoop,
-         GemmDataPerReadA,
-         GemmDataPerReadB,
-         InBlockCopySubLengths_E_B,
-         InBlockCopyClusterLengths_E_B,
-         InBlockCopyThreadClusterArrangeOrder,
-         InBlockCopySrcAccessOrder,
-         InBlockCopyDstAccessOrder,
-         InBlockCopyDataPerAccess_B,
-         WeiBlockCopySubLengths_E_K,
-         WeiBlockCopyClusterLengths_E_K,
-         WeiBlockCopyThreadClusterArrangeOrder,
-         WeiBlockCopySrcAccessOrder,
-         WeiBlockCopyDstAccessOrder,
-         WeiBlockCopySrcDataPerRead_E,
-         WeiBlockCopyDstDataPerWrite_K,
-         OutThreadCopyDataPerAccess_B>{};
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer<
+            GridSize,
+            BlockSize,
+            T,
+            decltype(in_nchw_desc),
+            decltype(wei_kcyx_desc),
+            decltype(out_nkhw_desc),
+            ConvStrides,
+            ConvDilations,
+            LeftPads,
+            RightPads,
+            BPerBlock,
+            KPerBlock,
+            EPerBlock,
+            GemmMPerThreadSubC,
+            GemmNPerThreadSubC,
+            GemmMLevel0Cluster,
+            GemmNLevel0Cluster,
+            GemmMLevel1Cluster,
+            GemmNLevel1Cluster,
+            GemmKPerThreadLoop,
+            GemmDataPerReadA,
+            GemmDataPerReadB,
+            InBlockCopySubLengths_E_B,
+            InBlockCopyClusterLengths_E_B,
+            InBlockCopyThreadClusterArrangeOrder,
+            InBlockCopySrcAccessOrder,
+            InBlockCopyDstAccessOrder,
+            InBlockCopyDataPerAccess_B,
+            WeiBlockCopySubLengths_E_K,
+            WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder,
+            WeiBlockCopySrcAccessOrder,
+            WeiBlockCopyDstAccessOrder,
+            WeiBlockCopySrcDataPerRead_E,
+            WeiBlockCopyDstDataPerWrite_K,
+            OutThreadCopyDataPerAccess_B>{};

     for(index_t i = 0; i < nrepeat; ++i)
     {
......
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp"
template <typename ConstTensorDesc, std::size_t... Is>
auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence<std::size_t, Is...>)
{
std::initializer_list<std::size_t> lengths = {ConstTensorDesc::GetLengths()[Is]...};
std::initializer_list<std::size_t> strides = {ConstTensorDesc::GetStrides()[Is]...};
return TensorDescriptor(lengths, strides);
}
template <typename ConstTensorDesc>
auto make_TensorDescriptor(ConstTensorDesc)
{
return make_TensorDescriptor_impl(
ConstTensorDesc{},
std::make_integer_sequence<std::size_t, ConstTensorDesc::GetNumOfDimension()>{});
}
template <typename ConstTensorDesc>
void ostream_ConstantTensorDescriptor(ConstTensorDesc, std::ostream& os = std::cout)
{
ostream_TensorDescriptor(make_TensorDescriptor(ConstTensorDesc{}), os);
}
#pragma once
#include "tensor.hpp"
template <typename T,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void host_col2im(const Tensor<T>& in_eb,
Tensor<T>& in_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads)
{
using namespace ck;
int N = in_nchw.mDesc.GetLengths()[0];
int C = in_nchw.mDesc.GetLengths()[1];
int HI = in_nchw.mDesc.GetLengths()[2];
int WI = in_nchw.mDesc.GetLengths()[3];
int Y = FilterSizes{}[0];
int X = FilterSizes{}[1];
int HO = OutputSizes{}[0];
int WO = OutputSizes{}[1];
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
{
int ho = h_tmp / ConvStrides{}[0];
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
{
int wo = w_tmp / ConvStrides{}[1];
int e = c * (Y * X) + y * X + x;
int b = n * (HO * WO) + ho * WO + wo;
v += in_eb(e, b);
}
}
}
}
in_nchw(n, c, hi, wi) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
in_nchw.mDesc.GetLengths()[0],
in_nchw.mDesc.GetLengths()[1],
in_nchw.mDesc.GetLengths()[2],
in_nchw.mDesc.GetLengths()[3]);
f_par(std::thread::hardware_concurrency());
}
 #pragma once
 #include "tensor.hpp"
-#include "common_header.hpp"
-#include "ConstantTensorDescriptor_deprecated.hpp"
-
-// this is ugly, only for 4d
-template <class TConstTensorDesc>
-void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
-{
-    using namespace ck;
-
-    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto desc = TConstTensorDesc{};
-
-    os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
-       << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
-       << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
-       << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
-}
-
-// this is ugly, only for 4d
-template <class TConstTensorDesc>
-auto make_TensorDescriptor(TConstTensorDesc)
-{
-    using namespace ck;
-
-    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto desc = TConstTensorDesc{};
-
-    std::initializer_list<index_t> lengths = {
-        desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
-    std::initializer_list<index_t> strides = {
-        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};
-
-    return TensorDescriptor(lengths, strides);
-}
-
 template <class TIn,
           class TWei,
@@ -331,25 +287,3 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
     make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
     make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
 }
-
-template <class T>
-void check_error(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float error = 0;
-    float max_diff = -1;
-    float ref_value = 0, result_value = 0;
-    for(int i = 0; i < ref.mData.size(); ++i)
-    {
-        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
-        {
-            max_diff = diff;
-            ref_value = ref.mData[i];
-            result_value = result.mData[i];
-        }
-    }
-
-    std::cout << "error: " << error << std::endl;
-    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
-}
#pragma once
#include "tensor.hpp"
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void host_direct_convolution_backward_data(Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
const Tensor<TOut>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads)
{
using namespace ck;
int N = in_nchw.mDesc.GetLengths()[0];
int C = in_nchw.mDesc.GetLengths()[1];
int HI = in_nchw.mDesc.GetLengths()[2];
int WI = in_nchw.mDesc.GetLengths()[3];
std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
if(h_tmp % ConvStrides{}[0] == 0)
{
int ho = h_tmp / ConvStrides{}[0];
if(ho >= 0 && ho < HO)
{
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
if(w_tmp % ConvStrides{}[1] == 0)
{
int wo = w_tmp / ConvStrides{}[1];
if(wo >= 0 && wo < WO)
{
for(int k = 0; k < K; ++k)
{
v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x);
}
}
}
}
}
}
}
in_nchw(n, c, hi, wi) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
in_nchw.mDesc.GetLengths()[0],
in_nchw.mDesc.GetLengths()[1],
in_nchw.mDesc.GetLengths()[2],
in_nchw.mDesc.GetLengths()[3]);
f_par(std::thread::hardware_concurrency());
}
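To make the gather condition in the reference above concrete, a worked example with illustrative values (stride 2, dilation 1, left pad 1, a 5-tap filter, input row hi = 4), where h_tmp = hi + pad - y * dilation:

// y = 0 -> h_tmp = 5 (odd, skipped)     y = 1 -> h_tmp = 4 -> ho = 2
// y = 2 -> h_tmp = 3 (odd, skipped)     y = 3 -> h_tmp = 2 -> ho = 1
// y = 4 -> h_tmp = 1 (odd, skipped)
// so in_nchw(n, c, 4, wi) accumulates over k only at (ho, y) = (2, 1) and (1, 3).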
@@ -68,10 +68,12 @@ auto construct_f_unpack_args(F, T args)
 struct TensorDescriptor
 {
     TensorDescriptor() = delete;
-    TensorDescriptor(std::initializer_list<std::size_t> lens);
-    TensorDescriptor(std::initializer_list<std::size_t> lens,
-                     std::initializer_list<std::size_t> strides);
-    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);
+
+    template <typename X>
+    TensorDescriptor(std::vector<X> lens);
+
+    template <typename X, typename Y>
+    TensorDescriptor(std::vector<X> lens, std::vector<Y> strides);

     void CalculateStrides();
@@ -269,4 +271,39 @@ struct Tensor
     std::vector<T> mData;
 };

+void ostream_TensorDescriptor(const TensorDescriptor& desc, std::ostream& os = std::cout)
+{
+    os << "dim " << desc.GetNumOfDimension() << ", ";
+
+    os << "lengths {";
+    LogRange(os, desc.GetLengths(), ", ");
+    os << "}, ";
+
+    os << "strides {";
+    LogRange(os, desc.GetStrides(), ", ");
+    os << "}" << std::endl;
+}
+
+template <class T>
+void check_error(const Tensor<T>& ref, const Tensor<T>& result)
+{
+    float error = 0;
+    float max_diff = -1;
+    float ref_value = 0, result_value = 0;
+    for(int i = 0; i < ref.mData.size(); ++i)
+    {
+        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        if(max_diff < diff)
+        {
+            max_diff = diff;
+            ref_value = ref.mData[i];
+            result_value = result.mData[i];
+        }
+    }
+
+    std::cout << "error: " << error << std::endl;
+    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
+}
+
 #endif
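A hedged usage sketch for the helpers moved into tensor.hpp above; the variable names are illustrative, not from the commit. After launching a kernel and computing a host reference, a driver would typically do something like:

// in_nchw_host:   host reference (e.g. from host_direct_convolution_backward_data)
// in_nchw_device: result copied back with FromDevice()
ostream_TensorDescriptor(make_TensorDescriptor(in_nchw_desc));
check_error(in_nchw_host, in_nchw_device);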
#ifndef TENSOR_GENERATOR_HPP
#define TENSOR_GENERATOR_HPP
#include "config.hpp"
struct GeneratorTensor_1
{
int value = 1;
template <class... Is>
double operator()(Is... is)
{
return value;
}
};
struct GeneratorTensor_2
{
int min_value = 0;
int max_value = 1;
template <class... Is>
double operator()(Is...)
{
return (std::rand() % (max_value - min_value)) + min_value;
}
};
struct GeneratorTensor_3
{
template <class... Is>
double operator()(Is... is)
{
std::array<ck::index_t, sizeof...(Is)> dims = {{static_cast<ck::index_t>(is)...}};
auto f_acc = [](auto a, auto b) { return 10 * a + b; };
return std::accumulate(dims.begin(), dims.end(), ck::index_t(0), f_acc);
}
};
struct GeneratorTensor_Checkboard
{
template <class... Ts>
double operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{Xs...}};
return std::accumulate(dims.begin(),
dims.end(),
true,
[](bool init, ck::index_t x) -> int { return init != (x % 2); })
? 1
: -1;
}
};
#endif
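For reference, the behaviour of the generators above, derived directly from their definitions (the argument values are illustrative):

// GeneratorTensor_3 concatenates the indices as decimal digits:
//   GeneratorTensor_3{}(1, 2, 3) == 123
// GeneratorTensor_2{-5, 5} draws integers in [-5, 5), and
// GeneratorTensor_Checkboard alternates +1 / -1 with the parity of the index sum.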