"docs/git@developer.sourcefind.cn:OpenDAS/torchani.git" did not exist on "c059adbf671e8b2426f380ba309fa559ab44515d"
Commit 20fa988f authored by Chao Liu

add fwd-v4r4-nhwc, change vector_type

parent 1c62b47b
@@ -168,6 +168,51 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
#endif
}
template <>
__device__ float8_t amd_buffer_load_v2<float, 8>(const float* p_src_wave,
                                                 index_t src_thread_data_offset,
                                                 bool src_thread_data_valid,
                                                 index_t src_data_range)
{
    BufferResourceConstant<float> src_wave_buffer_resource;

    // wavewise base address (64 bit)
    src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_buffer_resource.config[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
    // invalid threads get an offset far past the range recorded in the buffer
    // resource, so the loads below return 0 in hardware instead of branching
    uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

    // the 8-wide load is issued as two 4-wide raw buffer loads
    vector_type<float, 8> vector;

    vector.Set(Number<4>{}, Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);

    vector.Set(Number<4>{}, Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data,
        src_addr_shift + src_thread_addr_offset + 4 * sizeof(float),
        0,
        0);

    return vector.Get(Number<8>{}, Number<0>{});
#else
    vector_type<float, 8> vector;

    vector.Set(Number<4>{}, Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0);

    vector.Set(Number<4>{}, Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
        src_wave_buffer_resource.data, src_thread_addr_offset + 4 * sizeof(float), 0, 0);

    return src_thread_data_valid ? vector.Get(Number<8>{}, Number<0>{}) : float8_t(0);
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
                                              float* p_dst_wave,
...
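For context, a hypothetical caller of the new 8-wide load could look like the sketch below (illustrative only; it assumes the surrounding CK headers and is not part of the commit). The point of the OOB-offset trick is that out-of-range threads still issue the load and simply receive zeros, so the load path needs no divergent branch.

__global__ void copy_f32x8(const float* p_src, float* p_dst, index_t n)
{
    const index_t i = 8 * (blockIdx.x * blockDim.x + threadIdx.x);
    const bool valid = (i + 8 <= n);

    // invalid threads read zeros instead of branching around the load
    float8_t v = amd_buffer_load_v2<float, 8>(p_src, i, valid, n);

    if(valid)
        *reinterpret_cast<float8_t*>(p_dst + i) = v;
}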
@@ -4,19 +4,20 @@
 namespace ck {

 // For some reason, HIP compiler need this definition to generate optimal ISA
-// float
+// fp32
 typedef float float2_t __attribute__((ext_vector_type(2)));
 typedef float float4_t __attribute__((ext_vector_type(4)));
+typedef float float8_t __attribute__((ext_vector_type(8)));
 typedef float float16_t __attribute__((ext_vector_type(16)));
 typedef float float32_t __attribute__((ext_vector_type(32)));

-// float16
+// fp16
 typedef _Float16 half_t;
 typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
 typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
 typedef _Float16 half8_t __attribute__((ext_vector_type(8)));

-// bfloat16
+// bfp16
 typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
 typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
 typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
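These typedefs rely on clang's ext_vector_type extension (available under hip-clang), which gives elementwise operators, scalar splat, subscripting, and packed storage. A standalone illustration, independent of the CK headers:

#include <cstdio>

typedef float float4_t __attribute__((ext_vector_type(4)));

int main()
{
    float4_t a = {1, 2, 3, 4};
    float4_t b = a * 2.0f + 1.0f; // elementwise with scalar splat: {3, 5, 7, 9}

    // four floats packed contiguously, so sizeof(float4_t) == 16
    std::printf("%g %g %g %g (%zu bytes)\n", b[0], b[1], b[2], b[3], sizeof(float4_t));
}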
@@ -168,23 +169,17 @@ struct c_vec4_1_t
     }
 };

-template <class T, index_t N>
-struct vector_type
-{
-    typedef struct
-    {
-        T scalar[N];
-    } MemoryType;
-};
+template <typename T, index_t N>
+struct vector_type;

-template <>
-struct vector_type<float, 1>
+template <typename T>
+struct vector_type<T, 1>
 {
-    using MemoryType = float;
+    using MemoryType = T;

-    float data_;
+    T data_;

-    __host__ __device__ constexpr vector_type() : data_{0} {}
+    __host__ __device__ constexpr vector_type() : data_{T{0}} {}

     __host__ __device__ static constexpr index_t Size() { return 1; }
@@ -192,6 +187,22 @@ struct vector_type<float, 1>
     __host__ __device__ constexpr auto& Vector() { return data_; }

+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I>)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_;
+    }
+
     template <index_t I>
     __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
@@ -209,31 +220,66 @@ struct vector_type<float, 1>
     }
 };

-template <>
-struct vector_type<float, 2>
+template <typename T>
+struct vector_type<T, 2>
 {
-    using MemoryType = float2_t;
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+
+    using MemoryType = d2_t;

     union
     {
-        float2_t vector_;
-        StaticallyIndexedArray<float, 2> scalars_;
+        d2_t d2_;
+        StaticallyIndexedArray<d1_t, 2> d1x2_;
     } data_;

-    __host__ __device__ constexpr vector_type() : data_{MemoryType{0}} {}
+    __host__ __device__ constexpr vector_type() : data_{d2_t{0}} {}

     __host__ __device__ static constexpr index_t Size() { return 2; }

-    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }
+    __host__ __device__ constexpr const auto& Vector() const { return data_.d2_; }

-    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }
+    __host__ __device__ constexpr auto& Vector() { return data_.d2_; }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d1x2_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<2>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d2_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d1x2_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<2>, Number<I>)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d2_;
+    }

     template <index_t I>
     __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
         static_assert(I >= 0 && I < 2, "wrong!");
-        return data_.scalars_[Number<I>{}];
+        return data_.d1x2_[Number<I>{}];
     }

     template <index_t I>
@@ -241,219 +287,203 @@ struct vector_type<float, 2>
     {
         static_assert(I >= 0 && I < 2, "wrong!");
-        return data_.scalars_(Number<I>{});
+        return data_.d1x2_(Number<I>{});
     }
 };

-template <>
-struct vector_type<float, 4>
-{
-    using MemoryType = float4_t;
-
-    union
-    {
-        float4_t vector_;
-        StaticallyIndexedArray<float, 4> scalars_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{MemoryType{0}} {}
-
-    __host__ __device__ static constexpr index_t Size() { return 4; }
-
-    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }
-
-    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }
-
-    template <index_t I>
-    __host__ __device__ constexpr const auto& operator[](Number<I>) const
-    {
-        static_assert(I >= 0 && I < 4, "wrong!");
-        return data_.scalars_[Number<I>{}];
-    }
-
-    template <index_t I>
-    __host__ __device__ constexpr auto& operator()(Number<I>)
-    {
-        static_assert(I >= 0 && I < 4, "wrong!");
-        return data_.scalars_(Number<I>{});
-    }
-};
-
-template <>
-struct vector_type<half_t, 1>
-{
-    using MemoryType = half_t;
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 1, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-};
-
-template <>
-struct vector_type<half_t, 2>
-{
-    using MemoryType = half2_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        half_t scalar[2];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 2, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(half_t s0, half_t s1)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<half_t, 4>
-{
-    using MemoryType = half4_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        half_t scalar[4];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 4, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(half_t s0, half_t s1, half_t s2, half_t s3)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        data.scalar[2] = s2;
-        data.scalar[3] = s3;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<half_t, 8>
-{
-    using MemoryType = half8_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        half_t scalar[8];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
-    {
-        static_assert(I < 8, "wrong");
-        *(reinterpret_cast<half_t*>(&v) + I) = s;
-    }
-};
-
-template <>
-struct vector_type<ushort, 1>
-{
-    using MemoryType = ushort;
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 1, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-};
-
-template <>
-struct vector_type<ushort, 2>
-{
-    using MemoryType = ushort2_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        ushort scalar[2];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 2, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<ushort, 4>
-{
-    using MemoryType = ushort4_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        ushort scalar[4];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 4, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-
-    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
-    {
-        DataType data;
-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-        data.scalar[2] = s2;
-        data.scalar[3] = s3;
-        return data.vector;
-    }
-};
-
-template <>
-struct vector_type<ushort, 8>
-{
-    using MemoryType = ushort8_t;
-
-    union DataType
-    {
-        MemoryType vector;
-        ushort scalar[8];
-    };
-
-    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
-    {
-        static_assert(I < 8, "wrong");
-        *(reinterpret_cast<ushort*>(&v) + I) = s;
-    }
-};
+template <typename T>
+struct vector_type<T, 4>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+
+    using MemoryType = d4_t;
+
+    union
+    {
+        d4_t d4_;
+        StaticallyIndexedArray<d1_t, 4> d1x4_;
+        StaticallyIndexedArray<d2_t, 2> d2x2_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{d4_t{0}} {}
+
+    __host__ __device__ static constexpr index_t Size() { return 4; }
+
+    __host__ __device__ constexpr const auto& Vector() const { return data_.d4_; }
+
+    __host__ __device__ constexpr auto& Vector() { return data_.d4_; }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<2>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d2x2_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<4>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d4_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d2x2_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<4>, Number<I>)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d4_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& operator[](Number<I>) const
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_[Number<I>{}];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& operator()(Number<I>)
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d1x4_(Number<I>{});
+    }
+};
+
+template <typename T>
+struct vector_type<T, 8>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+
+    using MemoryType = d8_t;
+
+    union
+    {
+        d8_t d8_;
+        StaticallyIndexedArray<d1_t, 8> d1x8_;
+        StaticallyIndexedArray<d2_t, 4> d2x4_;
+        StaticallyIndexedArray<d4_t, 2> d4x2_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{d8_t{0}} {}
+
+    __host__ __device__ static constexpr index_t Size() { return 8; }
+
+    __host__ __device__ constexpr const auto& Vector() const { return data_.d8_; }
+
+    __host__ __device__ constexpr auto& Vector() { return data_.d8_; }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<2>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d2x4_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<4>, Number<I> i) const
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d4x2_[i];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& Get(Number<8>, Number<I>) const
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d8_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 4, "wrong!");
+        return data_.d2x4_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<4>, Number<I> i)
+    {
+        static_assert(I >= 0 && I < 2, "wrong!");
+        return data_.d4x2_(i);
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& Set(Number<8>, Number<I>)
+    {
+        static_assert(I == 0, "wrong!");
+        return data_.d8_;
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr const auto& operator[](Number<I>) const
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_[Number<I>{}];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr auto& operator()(Number<I>)
+    {
+        static_assert(I >= 0 && I < 8, "wrong!");
+        return data_.d1x8_(Number<I>{});
+    }
+};
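The rewrite above collapses the per-type float/half_t/ushort specializations into one generic family: each width keeps a union of the whole vector and StaticallyIndexedArray views of its sub-vectors, and Get/Set select a sub-vector by (sub-vector width, index). That is what lets amd_buffer_load_v2<float, 8> assemble a float8_t from two fp32x4 loads. A standalone sketch of the same union-of-views technique (plain clang C++; the view names are illustrative, not CK's, and type punning through a union is a compiler extension here, as in the CK code itself):

#include <cstdio>

typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float8_t __attribute__((ext_vector_type(8)));

union float8_view
{
    float8_t d8;      // the whole 8-wide vector
    float4_t d4x2[2]; // same bytes viewed as two 4-wide halves
    float d1x8[8];    // same bytes viewed as eight scalars
};

int main()
{
    float8_view v;
    v.d4x2[0] = float4_t{0, 1, 2, 3}; // like Set(Number<4>{}, Number<0>{})
    v.d4x2[1] = float4_t{4, 5, 6, 7}; // like Set(Number<4>{}, Number<1>{})

    // like Get(Number<8>{}, Number<0>{}): read the assembled vector back whole
    for(int i = 0; i < 8; ++i)
        std::printf("%g ", v.d1x8[i]);
    std::printf("\n");
}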
...
@@ -23,6 +23,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
                                                                           InRightPads,
                                                                           ck::index_t nrepeat)
 {
+    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"
+              << std::endl;
+
     using namespace ck;

     using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
...
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk"
<< std::endl;
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto N = OutDesc::GetLengths()[I0];
constexpr auto K = OutDesc::GetLengths()[I1];
constexpr auto C = WeiDesc::GetLengths()[I1];
constexpr auto Hi = InDesc::GetLengths()[I2];
constexpr auto Wi = InDesc::GetLengths()[I3];
constexpr auto Ho = OutDesc::GetLengths()[I2];
constexpr auto Wo = OutDesc::GetLengths()[I3];
constexpr auto Y = WeiDesc::GetLengths()[I2];
constexpr auto X = WeiDesc::GetLengths()[I3];
#if 1
// run-time variables
constexpr auto in_n_hi_wi_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C));
constexpr auto wei_k_y_x_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(K, Y, X, C));
constexpr auto out_n_ho_wo_k_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Ho, Wo, K));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
#else
// compile-time variables
constexpr auto in_n_hi_wi_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, Hi, Wi, C));
constexpr auto wei_k_y_x_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y, X, C));
constexpr auto out_n_ho_wo_k_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, Ho, Wo, K));
const auto conv_strides = sequence_to_tuple_of_number(ConvStrides{});
const auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
const auto in_left_pads = sequence_to_tuple_of_number(InLeftPads{});
const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif
// transposed host copies in the layouts the kernel expects; typed as T so the
// raw ToDevice/FromDevice byte copies below (sized with sizeof(T)) stay correct
Tensor<T> in_nhwc(
make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<N, Hi, Wi, C>{})));
Tensor<T> wei_kyxc(
make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<K, Y, X, C>{})));
Tensor<T> out_nhwk(
make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<N, Ho, Wo, K>{})));
auto f_nchw2nhwc = [&](auto n, auto hi, auto wi, auto c) {
in_nhwc(n, hi, wi, c) = in_nchw(n, c, hi, wi);
};
auto f_kcyx2kyxc = [&](auto k, auto y, auto x, auto c) {
wei_kyxc(k, y, x, c) = wei_kcyx(k, c, y, x);
};
auto f_nkhw2nhwk = [&](auto n, auto ho, auto wo, auto k) {
out_nhwk(n, ho, wo, k) = out_nkhw(n, k, ho, wo);
};
make_ParallelTensorFunctor(f_nchw2nhwc, N, Hi, Wi, C)(std::thread::hardware_concurrency());
make_ParallelTensorFunctor(f_kcyx2kyxc, K, Y, X, C)(std::thread::hardware_concurrency());
make_ParallelTensorFunctor(f_nkhw2nhwk, N, Ho, Wo, K)(std::thread::hardware_concurrency());
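// note: the three functors above only permute indices; the layout change comes
// from each Tensor's own linearization of a packed tensor:
//   offset_nchw(n, c, hi, wi) = ((n*C + c)*Hi + hi)*Wi + wi
//   offset_nhwc(n, hi, wi, c) = ((n*Hi + hi)*Wi + wi)*C + c
// e.g. with N=2, C=3, Hi=4, Wi=5, element (n,c,hi,wi) = (0,1,2,3) sits at
// offset ((0*3+1)*4+2)*5+3 = 33 in NCHW but at ((0*4+2)*5+3)*3+1 = 40 in NHWC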
std::size_t data_sz = sizeof(T);
DeviceMem in_nhwc_device_buf(data_sz * in_nhwc.mDesc.GetElementSpace());
DeviceMem wei_kyxc_device_buf(data_sz * wei_kyxc.mDesc.GetElementSpace());
DeviceMem out_nhwk_device_buf(data_sz * out_nhwk.mDesc.GetElementSpace());
in_nhwc_device_buf.ToDevice(in_nhwc.mData.data());
wei_kyxc_device_buf.ToDevice(wei_kyxc.mData.data());
out_nhwk_device_buf.ToDevice(out_nhwk.mData.data());
#if 0
// cdata = 64, BlockSize = 128, 32x256x8
constexpr index_t BlockSize = 128;
constexpr index_t GemmMPerBlock = 32;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 16;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<2, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 32>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<8, 2>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmM1 = 1;
#elif 0
// cdata = 64, BlockSize = 256, 128x128x8
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmM1 = 4;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 2>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 64>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 2;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<8, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmM1 = 4;
#endif
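// sanity arithmetic for the active 128x128x16 configuration:
//   thread cluster: (GemmMLevel0Cluster*GemmMLevel1Cluster) x
//                   (GemmNLevel0Cluster*GemmNLevel1Cluster) = (2*8) x (2*8)
//                   = 16 x 16 = 256 threads = BlockSize
//   A copy: 4x2 slice on a 4x64 cluster -> (4*4) x (2*64) = 16 x 128
//           = GemmKPerBlock x GemmMPerBlock, using 4*64 = 256 threads
//   B copy: 8x1 slice on a 2x128 cluster -> (8*2) x (1*128) = 16 x 128
//           = GemmKPerBlock x GemmNPerBlock, using 2*128 = 256 threads
//   C per thread: 128*128/256 = 64 accumulators, i.e. "cdata = 64"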
constexpr auto conv_driver =
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
#elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_no_pad
#elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
#endif
<BlockSize,
TDevice,
TDevice,
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerThread,
GemmNPerThread,
GemmKPerThread,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
GemmABlockTransferSrcScalarPerVector_GemmK,
GemmABlockTransferDstScalarPerVector_GemmM,
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
GemmBBlockTransferSrcScalarPerVector_GemmK,
GemmBBlockTransferDstScalarPerVector_GemmN,
GemmCThreadTransferDstScalarPerVector_GemmM1>{};
conv_driver.Run(wei_k_y_x_c_desc,
in_n_hi_wi_c_desc,
out_n_ho_wo_k_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<TDevice*>(wei_kyxc_device_buf.GetDeviceBuffer()),
static_cast<TDevice*>(in_nhwc_device_buf.GetDeviceBuffer()),
static_cast<TDevice*>(out_nhwk_device_buf.GetDeviceBuffer()));
out_nhwk_device_buf.FromDevice(out_nhwk.mData.data());
auto f_nhwk2nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_nkhw(n, k, ho, wo) = out_nhwk(n, ho, wo, k);
};
make_ParallelTensorFunctor(f_nhwk2nkhw, N, K, Ho, Wo)(std::thread::hardware_concurrency());
}
@@ -14,6 +14,7 @@
 #include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"

 int main(int argc, char* argv[])
 {
@@ -615,7 +616,7 @@ int main(int argc, char* argv[])
                                                  LeftPads{},
                                                  RightPads{},
                                                  nrepeat);
-#elif 1
+#elif 0
     device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                          in_nchw,
                                                                          wei_kcyx_desc,
@@ -627,6 +628,18 @@ int main(int argc, char* argv[])
                                                  LeftPads{},
                                                  RightPads{},
                                                  nrepeat);
+#elif 1
+    device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(in_nchw_desc,
+                                                                         in_nchw,
+                                                                         wei_kcyx_desc,
+                                                                         wei_kcyx,
+                                                                         out_nkhw_desc,
+                                                                         out_nkhw_device,
+                                                                         ConvStrides{},
+                                                                         ConvDilations{},
+                                                                         LeftPads{},
+                                                                         RightPads{},
+                                                                         nrepeat);
 #endif

     if(do_verification)
...