Commit 3dd0cc31 authored by Jing Zhang

merge

parents c21521a1 198593d5
@@ -181,7 +181,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
            Sequence<0, 0, 0, 0, 0>{},
            Sequence<0, 0, 0, 0, 0>{}));
-#if 1
        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
            BlockSize,
@@ -372,7 +371,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
-#endif
    }
};
} // namespace ck
...
@@ -133,12 +133,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        constexpr auto EPerBlock = a_block_mtx.GetLength(I0);

        constexpr auto KPerThreadSubC = 4;
-       constexpr auto HPerThreadSubC = 2;
-       constexpr auto WPerThreadSubC = 2;
+       constexpr auto HoPerThreadSubC = 2;
+       constexpr auto WoPerThreadSubC = 2;

        static_assert(KPerThread % KPerThreadSubC == 0, "");
-       static_assert(HPerThread % HPerThreadSubC == 0, "");
-       static_assert(WPerThread % WPerThreadSubC == 0, "");
+       static_assert(HPerThread % HoPerThreadSubC == 0, "");
+       static_assert(WPerThread % WoPerThreadSubC == 0, "");

        // thread A, B for GEMM
        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
@@ -161,8 +162,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx),
                                                                    decltype(b_thread_mtx),
                                                                    decltype(c_thread_mtx),
-                                                                   HPerThreadSubC,
-                                                                   WPerThreadSubC>{};
+                                                                   HoPerThreadSubC,
+                                                                   WoPerThreadSubC>{};

        // loop over k
#pragma unroll
        for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop)
@@ -176,10 +177,10 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
                                      p_a_thread);

#pragma unroll
-           for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HPerThreadSubC)
+           for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HoPerThreadSubC)
            {
#pragma unroll
-               for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WPerThreadSubC)
+               for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WoPerThreadSubC)
                {
                    threadwise_gemm.Run(p_a_thread,
                                        p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
...
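As a side note, the renamed HoPerThreadSubC/WoPerThreadSubC constants define the output sub-tile that one threadwise GEMM call covers, and the static_asserts guarantee the per-thread tile divides evenly into such sub-tiles. Below is a tiny standalone sketch of that sweep; the HoPerThread/WoPerThread sizes are illustrative assumptions, only the sub-tile constants match the diff.

// Standalone sketch of the sub-tile sweep above; HoPerThread/WoPerThread values are
// assumed for illustration, HoPerThreadSubC/WoPerThreadSubC match the constants in the diff.
#include <cstdio>

constexpr int HoPerThread     = 4; // assumed per-thread output tile height
constexpr int WoPerThread     = 4; // assumed per-thread output tile width
constexpr int HoPerThreadSubC = 2;
constexpr int WoPerThreadSubC = 2;

static_assert(HoPerThread % HoPerThreadSubC == 0, "tile must divide into sub-tiles");
static_assert(WoPerThread % WoPerThreadSubC == 0, "tile must divide into sub-tiles");

int main()
{
    // each (h_begin, w_begin) pair corresponds to one threadwise_gemm.Run call on a
    // HoPerThreadSubC x WoPerThreadSubC patch of the per-thread output tile
    for(int h_begin = 0; h_begin < HoPerThread; h_begin += HoPerThreadSubC)
    {
        for(int w_begin = 0; w_begin < WoPerThread; w_begin += WoPerThreadSubC)
        {
            std::printf("threadwise GEMM on sub-tile (h=%d..%d, w=%d..%d)\n",
                        h_begin, h_begin + HoPerThreadSubC - 1,
                        w_begin, w_begin + WoPerThreadSubC - 1);
        }
    }
    return 0;
}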
@@ -11,6 +11,47 @@
namespace ck {

+#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
+// pass tensor descriptors by __CONSTANT__ void pointer
+// __CONSTANT__ is needed to inform the compiler that the void pointers in the kernel signature
+// point to the non-modifiable kernel-parameter address space, so the compiler can enable the
+// corresponding optimizations
+template <typename GridwiseGemm,
+          typename AGlobalDesc,
+          typename FloatA,
+          typename BGlobalDesc,
+          typename FloatB,
+          typename CGlobalDesc,
+          typename FloatC,
+          bool HasMainKBlockLoop,
+          bool HasDoubleTailKBlockLoop>
+__global__ void run_gridwise_dynamic_gemm_v1(const void __CONSTANT__* p_a_k_m_global_desc,
+                                             const FloatA* __restrict__ p_a_global,
+                                             const void __CONSTANT__* p_b_k_n_global_desc,
+                                             const FloatB* __restrict__ p_b_global,
+                                             const void __CONSTANT__* p_c_m0_m1_n0_n1_global_desc,
+                                             FloatC* __restrict__ p_c_global)
+{
+    // first cast void __CONSTANT__* to void*, then cast void* to Desc*;
+    // the copy constructor of the tensor descriptor doesn't take address_space(4)
+    const auto a_k_m_global_desc =
+        *reinterpret_cast<const AGlobalDesc*>((const void*)p_a_k_m_global_desc);
+    const auto b_k_n_global_desc =
+        *reinterpret_cast<const BGlobalDesc*>((const void*)p_b_k_n_global_desc);
+    const auto c_m0_m1_n0_n1_global_desc =
+        *reinterpret_cast<const CGlobalDesc*>((const void*)p_c_m0_m1_n0_n1_global_desc);
+
+    GridwiseGemm{}.Run(a_k_m_global_desc,
+                       p_a_global,
+                       b_k_n_global_desc,
+                       p_b_global,
+                       c_m0_m1_n0_n1_global_desc,
+                       p_c_global,
+                       integral_constant<bool, HasMainKBlockLoop>{},
+                       integral_constant<bool, HasDoubleTailKBlockLoop>{});
+}
+#endif
+
template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
@@ -427,7 +468,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
        }
    }

-   // pass tensor descriptor by reference
    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
    __device__ void Run(const AGlobalDesc& a_k_m_global_desc,
                        const FloatAB* __restrict__ p_a_global,
@@ -452,57 +492,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
            integral_constant<bool, HasMainKBlockLoop>{},
            integral_constant<bool, HasDoubleTailKBlockLoop>{});
    }
-
-   // pass tensor descriptors by pointers
-   template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-   __device__ void Run(const AGlobalDesc* p_a_k_m_global_desc,
-                       const FloatAB* __restrict__ p_a_global,
-                       const BGlobalDesc* p_b_k_n_global_desc,
-                       const FloatAB* __restrict__ p_b_global,
-                       const CGlobalDesc* p_c_m0_m1_n0_n1_global_desc,
-                       FloatC* __restrict__ p_c_global,
-                       integral_constant<bool, HasMainKBlockLoop>,
-                       integral_constant<bool, HasDoubleTailKBlockLoop>) const
-   {
-       const auto a_k_m_global_desc         = *p_a_k_m_global_desc;
-       const auto b_k_n_global_desc         = *p_b_k_n_global_desc;
-       const auto c_m0_m1_n0_n1_global_desc = *p_c_m0_m1_n0_n1_global_desc;
-
-       Run(a_k_m_global_desc,
-           p_a_global,
-           b_k_n_global_desc,
-           p_b_global,
-           c_m0_m1_n0_n1_global_desc,
-           p_c_global,
-           integral_constant<bool, HasMainKBlockLoop>{},
-           integral_constant<bool, HasDoubleTailKBlockLoop>{});
-   }
-
-   // pass tensor descriptors by void*
-   template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-   __device__ void Run(const void* p_a_k_m_global_desc,
-                       const FloatAB* __restrict__ p_a_global,
-                       const void* p_b_k_n_global_desc,
-                       const FloatAB* __restrict__ p_b_global,
-                       const void* p_c_m0_m1_n0_n1_global_desc,
-                       FloatC* __restrict__ p_c_global,
-                       integral_constant<bool, HasMainKBlockLoop>,
-                       integral_constant<bool, HasDoubleTailKBlockLoop>) const
-   {
-       const auto a_k_m_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_k_m_global_desc);
-       const auto b_k_n_global_desc = *reinterpret_cast<const BGlobalDesc*>(p_b_k_n_global_desc);
-       const auto c_m0_m1_n0_n1_global_desc =
-           *reinterpret_cast<const CGlobalDesc*>(p_c_m0_m1_n0_n1_global_desc);
-
-       Run(a_k_m_global_desc,
-           p_a_global,
-           b_k_n_global_desc,
-           p_b_global,
-           c_m0_m1_n0_n1_global_desc,
-           p_c_global,
-           integral_constant<bool, HasMainKBlockLoop>{},
-           integral_constant<bool, HasDoubleTailKBlockLoop>{});
-   }
};

} // namespace ck
...
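For orientation, here is a rough host-side sketch of how a kernel such as `run_gridwise_dynamic_gemm_v1` can be driven when descriptors are passed as `__CONSTANT__` void pointers: the descriptor objects are copied into ordinary device memory and the resulting device pointers are reinterpreted into the constant address space at the launch site. Everything below (the helper name, the float element types, the hard-coded loop flags) is an illustrative assumption rather than code from this commit, and error handling is omitted.

// Hypothetical host-side launcher; assumes the kernel template above is visible and that the
// descriptor types are trivially copyable. Not part of the commit.
#include <hip/hip_runtime.h>

#ifndef __CONSTANT__
#define __CONSTANT__ __attribute__((address_space(4)))
#endif

template <typename GridwiseGemm, typename ADesc, typename BDesc, typename CDesc>
void launch_gemm_with_constant_descriptors(const ADesc& a_desc,
                                           const BDesc& b_desc,
                                           const CDesc& c_desc,
                                           const float* p_a_dev,
                                           const float* p_b_dev,
                                           float* p_c_dev,
                                           dim3 grid_dim,
                                           dim3 block_dim)
{
    // copy the descriptor objects into plain device memory
    void *p_a_desc_dev = nullptr, *p_b_desc_dev = nullptr, *p_c_desc_dev = nullptr;
    hipMalloc(&p_a_desc_dev, sizeof(ADesc));
    hipMalloc(&p_b_desc_dev, sizeof(BDesc));
    hipMalloc(&p_c_desc_dev, sizeof(CDesc));
    hipMemcpy(p_a_desc_dev, &a_desc, sizeof(ADesc), hipMemcpyHostToDevice);
    hipMemcpy(p_b_desc_dev, &b_desc, sizeof(BDesc), hipMemcpyHostToDevice);
    hipMemcpy(p_c_desc_dev, &c_desc, sizeof(CDesc), hipMemcpyHostToDevice);

    // launch the kernel, viewing the descriptor buffers as __CONSTANT__ void pointers;
    // a C-style cast is used because only the address-space qualifier of the type changes
    hipLaunchKernelGGL(
        (run_gridwise_dynamic_gemm_v1<GridwiseGemm, ADesc, float, BDesc, float, CDesc, float,
                                      true /*HasMainKBlockLoop*/, false /*HasDoubleTailKBlockLoop*/>),
        grid_dim, block_dim, 0, 0,
        (const void __CONSTANT__*)p_a_desc_dev, p_a_dev,
        (const void __CONSTANT__*)p_b_desc_dev, p_b_dev,
        (const void __CONSTANT__*)p_c_desc_dev, p_c_dev);

    hipDeviceSynchronize();
    hipFree(p_a_desc_dev);
    hipFree(p_b_desc_dev);
    hipFree(p_c_desc_dev);
}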
@@ -272,21 +272,28 @@ __device__ void amd_assembly_outer_product_1x4(int8x8_t a,
                                               int32_t& c2,
                                               int32_t& c3)
{
-   amd_assembly_outer_product_1x4(a.Vectors(Number<4>{})[Number<0>{}],
-                                  b0.Vectors(Number<4>{})[Number<0>{}],
-                                  b1.Vectors(Number<4>{})[Number<0>{}],
-                                  b2.Vectors(Number<4>{})[Number<0>{}],
-                                  b3.Vectors(Number<4>{})[Number<0>{}],
+   const int8x4_t* p_a_int8x4_t  = reinterpret_cast<const int8x4_t*>(&a);
+   const int8x4_t* p_b0_int8x4_t = reinterpret_cast<const int8x4_t*>(&b0);
+   const int8x4_t* p_b1_int8x4_t = reinterpret_cast<const int8x4_t*>(&b1);
+   const int8x4_t* p_b2_int8x4_t = reinterpret_cast<const int8x4_t*>(&b2);
+   const int8x4_t* p_b3_int8x4_t = reinterpret_cast<const int8x4_t*>(&b3);
+
+   amd_assembly_outer_product_1x4(p_a_int8x4_t[0],
+                                  p_b0_int8x4_t[0],
+                                  p_b1_int8x4_t[0],
+                                  p_b2_int8x4_t[0],
+                                  p_b3_int8x4_t[0],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

-   amd_assembly_outer_product_1x4(a.Vectors(Number<4>{})[Number<1>{}],
-                                  b0.Vectors(Number<4>{})[Number<1>{}],
-                                  b1.Vectors(Number<4>{})[Number<1>{}],
-                                  b2.Vectors(Number<4>{})[Number<1>{}],
-                                  b3.Vectors(Number<4>{})[Number<1>{}],
+   amd_assembly_outer_product_1x4(p_a_int8x4_t[1],
+                                  p_b0_int8x4_t[1],
+                                  p_b1_int8x4_t[1],
+                                  p_b2_int8x4_t[1],
+                                  p_b3_int8x4_t[1],
                                   c0,
                                   c1,
                                   c2,
@@ -302,22 +309,30 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
                                               int32_t& c1,
                                               int32_t& c2,
                                               int32_t& c3)
{
-   amd_assembly_outer_product_1x4(a.Vectors(Number<8>{})[Number<0>{}],
-                                  b0.Vectors(Number<8>{})[Number<0>{}],
-                                  b1.Vectors(Number<8>{})[Number<0>{}],
-                                  b2.Vectors(Number<8>{})[Number<0>{}],
-                                  b3.Vectors(Number<8>{})[Number<0>{}],
+   const int8x8_t* p_a_int8x8_t  = reinterpret_cast<const int8x8_t*>(&a);
+   const int8x8_t* p_b0_int8x8_t = reinterpret_cast<const int8x8_t*>(&b0);
+   const int8x8_t* p_b1_int8x8_t = reinterpret_cast<const int8x8_t*>(&b1);
+   const int8x8_t* p_b2_int8x8_t = reinterpret_cast<const int8x8_t*>(&b2);
+   const int8x8_t* p_b3_int8x8_t = reinterpret_cast<const int8x8_t*>(&b3);
+
+   amd_assembly_outer_product_1x4(p_a_int8x8_t[0],
+                                  p_b0_int8x8_t[0],
+                                  p_b1_int8x8_t[0],
+                                  p_b2_int8x8_t[0],
+                                  p_b3_int8x8_t[0],
                                   c0,
                                   c1,
                                   c2,
                                   c3);

-   amd_assembly_outer_product_1x4(a.Vectors(Number<8>{})[Number<1>{}],
-                                  b0.Vectors(Number<8>{})[Number<1>{}],
-                                  b1.Vectors(Number<8>{})[Number<1>{}],
-                                  b2.Vectors(Number<8>{})[Number<1>{}],
-                                  b3.Vectors(Number<8>{})[Number<1>{}],
+   amd_assembly_outer_product_1x4(p_a_int8x8_t[1],
+                                  p_b0_int8x8_t[1],
+                                  p_b1_int8x8_t[1],
+                                  p_b2_int8x8_t[1],
+                                  p_b3_int8x8_t[1],
                                   c0,
                                   c1,
                                   c2,
...
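This rewrite relies on int8x8_t and int8x16_t now being the native vector types (see the data_type change further down), so an int8x8_t can be viewed as two packed int8x4_t words through reinterpret_cast. A small standalone illustration of that reinterpretation, using stand-in typedefs rather than the library's own:

// Standalone illustration (clang/HIP vector extensions); the typedefs mimic, but are not,
// the library's int8x4_t/int8x8_t.
#include <cstdint>
#include <cstdio>

typedef int32_t my_int8x4_t;                                     // 4 packed int8 lanes in an int32
typedef int8_t  my_int8x8_t __attribute__((ext_vector_type(8))); // native 8-lane int8 vector

int main()
{
    my_int8x8_t v = {1, 2, 3, 4, 5, 6, 7, 8};

    // view the 8-byte vector as two 4-byte packed halves, as the device code above does
    // (the device code likewise aliases the vector through int32_t-based int8x4_t)
    const my_int8x4_t* p_halves = reinterpret_cast<const my_int8x4_t*>(&v);

    std::printf("low half = 0x%08x, high half = 0x%08x\n",
                static_cast<unsigned>(p_halves[0]),
                static_cast<unsigned>(p_halves[1]));
    return 0;
}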
@@ -7,13 +7,20 @@
#endif
#include "bfloat16_dev.hpp"

+// address space for kernel parameters
+#define __CONSTANT__ __attribute__((address_space(4)))
+
// device backend
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID
-#define CK_AMD_GPU_GFX906 0
-#define CK_AMD_GPU_GFX908 0
+#if 0
+#define CK_AMD_GPU_GFX906 1
+#elif 0
+#define CK_AMD_GPU_GFX908 1
+#elif 1
#define CK_AMD_GPU_GFX1030 1
+#endif

// HIP version
#ifndef CK_HIP_VERSION_FLAT
@@ -29,9 +36,9 @@
#endif

// buffer resource
-#if CK_AMD_GPU_GFX906 || CK_AMD_GPU_GFX908
+#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
-#elif CK_AMD_GPU_GFX1030
+#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif
@@ -104,9 +111,8 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

-// pass tensor descriptor by value, pointer or void*
+// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
-#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0

// hack: has underlying assumptions that need to be satisfied, otherwise it's a bug
...
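With the new GPU-ID scheme exactly one CK_AMD_GPU_GFX* macro is defined, which is why the buffer-resource block switches from value checks to `defined()` checks, and CK_BUFFER_RESOURCE_3RD_DWORD then picks the value for the selected target. As a rough, hedged illustration of how such a constant is typically consumed, it fills the last dword of the 128-bit buffer resource descriptor handed to buffer load/store intrinsics; the struct layout and helper below are a sketch, not the library's actual implementation.

// Hedged sketch of a 4-dword buffer resource descriptor; illustrative only.
#include <cstdint>

#ifndef CK_BUFFER_RESOURCE_3RD_DWORD
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 // gfx906/gfx908 value from the diff
#endif

struct BufferResource
{
    uint64_t base_address;   // dwords 0-1: base address of the buffer
    uint32_t range_in_bytes; // dword 2: number of addressable bytes
    uint32_t config;         // dword 3: format/stride/swizzle configuration (target-specific)
};

inline BufferResource make_buffer_resource(const void* p, uint32_t size_in_bytes)
{
    // dword 3 comes from the per-target constant selected in config.hpp
    return BufferResource{reinterpret_cast<uint64_t>(p),
                          size_in_bytes,
                          CK_BUFFER_RESOURCE_3RD_DWORD};
}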
@@ -224,7 +224,6 @@ struct vector_type<T, 16>
    __host__ __device__ constexpr auto& Vectors(Number<16>) { return data_.d16x1_; }
};

-// fp32
using float2_t = typename vector_type<float, 2>::type;
using float4_t = typename vector_type<float, 4>::type;
@@ -439,8 +438,8 @@ struct vector_type<int8_t, 16>
// hack for int8x4_t, because compiler does not have native support for int8x4_t
// int8x4_t is defined as int32_t
using int8x4_t = typename vector_type<int8_t, 4>::type;
-using int8x8_t  = vector_type<int8_t, 8>;
-using int8x16_t = vector_type<int8_t, 16>;
+using int8x8_t  = typename vector_type<int8_t, 8>::type;
+using int8x16_t = typename vector_type<int8_t, 16>::type;

// data type conversion
template <typename T>
...
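The int8x8_t/int8x16_t aliases now refer to `vector_type<...>::type` (the underlying native vector type) rather than the `vector_type` wrapper struct itself, which is what makes them consistent with int8x4_t and usable in the reinterpret_cast-based splitting shown in the inline-assembly change above. A small standalone analogue of the distinction, with illustrative names only:

// Illustrative analogue: "int8x8_wrapper" mimics the role of vector_type<int8_t, 8>
// (a struct wrapping the data), while its nested ::type is the raw native vector.
#include <type_traits>

struct int8x8_wrapper
{
    typedef signed char type __attribute__((ext_vector_type(8))); // native 8-lane int8 vector
    type data_;
};

// old-style alias: the wrapper struct itself
using my_int8x8_old = int8x8_wrapper;
// new-style alias: the underlying native vector type (what int8x4_t already did)
using my_int8x8_new = int8x8_wrapper::type;

static_assert(!std::is_same<my_int8x8_old, my_int8x8_new>::value,
              "the wrapper and its ::type are different types");
static_assert(sizeof(my_int8x8_new) == 8, "8 packed int8 lanes");

int main() { return 0; }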
@@ -175,26 +175,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
#endif

    constexpr auto conv_driver =
-       // DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
+#if 0
+       DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
+#else
        DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
+#endif
            BlockSize,
-           typename vector_type<TInWei, InWeiVectorSize>::type,
-           TAcc,
-           TOut,
-           KPerBlock,
-           HoPerBlock,
-           WoPerBlock,
-           EPerBlock,
-           KPerThread,
-           HoPerThread,
-           WoPerThread,
-           EPerThread,
-           ABlockTransferThreadSliceLengths_E_K,
-           ABlockTransferThreadClusterLengths_E_K,
-           ABlockTransferSrcScalarPerVector_E,
-           ABlockTransferDstScalarPerVector_K,
-           BThreadTransferSrcScalarPerVector_W,
-           CThreadTransferDstScalarPerVector_W>{};
+           typename vector_type<TInWei, InWeiVectorSize>::type, TAcc, TOut, KPerBlock,
+           HoPerBlock, WoPerBlock, EPerBlock, KPerThread, HoPerThread, WoPerThread,
+           EPerThread, ABlockTransferThreadSliceLengths_E_K,
+           ABlockTransferThreadClusterLengths_E_K, ABlockTransferSrcScalarPerVector_E,
+           ABlockTransferDstScalarPerVector_K, BThreadTransferSrcScalarPerVector_W,
+           CThreadTransferDstScalarPerVector_W>{};

    conv_driver.Run(wei_k_c0_y_x_desc,
                    in_n_c0_hi_wi_desc,
...