"docs/vscode:/vscode.git/clone" did not exist on "56e2835eb4dc13aadae86ee2de8ecb8415579424"
Commit 961e6810 authored by root

perform eval

parent 4b456610
@@ -225,6 +225,33 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
        for(index_t j = 0; j < nrepeat; ++j)
        {
            {
                const auto kernel =
                    run_gridwise_operation<gridwise_gemm,
                                           decltype(wei_gemmk_gemmm_global_desc),
                                           const Float*,
                                           decltype(in_gemmk_n_ho_wo_global_desc),
                                           const Float*,
                                           decltype(out_gemmm_n_ho_wo_global_desc),
                                           Float*,
                                           integral_constant<bool, true>,
                                           integral_constant<bool, false>>;
                launch_kernel(kernel,
                              dim3(GridSize),
                              dim3(BlockSize),
                              0,
                              0,
                              wei_gemmk_gemmm_global_desc,
                              p_wei_global,
                              in_gemmk_n_ho_wo_global_desc,
                              p_in_global,
                              out_gemmm_n_ho_wo_global_desc,
                              p_out_global,
                              integral_constant<bool, true>{},
                              integral_constant<bool, false>{});
            }
#if 0
            if(has_main_k_block_loop && has_double_tail_k_block_loop)
            {
                const auto kernel =
@@ -333,6 +360,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
                              integral_constant<bool, false>{},
                              integral_constant<bool, false>{});
            }
#endif
        }
        timer.End();
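
The hunk above instantiates the gridwise GEMM kernel for the concrete descriptor types and hands it to launch_kernel. The launch_kernel helper itself is not part of this diff; below is a minimal sketch of what such a helper could look like on HIP, assuming it simply forwards the grid/block dimensions, dynamic LDS size and stream to hipLaunchKernelGGL. This is a hypothetical stand-in for illustration, not the repository's actual definition.

#include <cstddef>
#include <hip/hip_runtime.h>

// Hypothetical launch_kernel sketch: forward the launch configuration and the
// kernel arguments to hipLaunchKernelGGL. The real helper in this repository
// may differ (e.g. add error checking or timing).
template <typename Kernel, typename... Args>
void launch_kernel(Kernel kernel,
                   dim3 grid_dim,
                   dim3 block_dim,
                   std::size_t lds_byte,
                   hipStream_t stream,
                   Args... args)
{
    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream, args...);
}

The call site above passes 0 for both the dynamic LDS size and the stream, i.e. no dynamically allocated LDS and the default stream.
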
@@ -4,7 +4,7 @@
template <typename GridwiseOp, typename... Xs>
__global__ void
#if 1
__launch_bounds__(64, 2)
__launch_bounds__(256, 2)
#endif
run_gridwise_operation(Xs... xs)
{
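
__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) promises the compiler that the kernel will never be launched with more threads per block than the first argument; the change from 64 to 256 here tracks the BlockSize change from 64 to 256 further down. A minimal illustration of the constraint (a generic HIP snippet, not code from this repository):

#include <hip/hip_runtime.h>

// With the old bound of 64, launching with a 256-thread block would violate
// the promise made to the compiler; the bound has to move together with the
// block size used at the launch site.
__global__ void __launch_bounds__(256, 2) bounded_kernel() {}

void launch_example()
{
    // 256 threads per block is within the declared bound of 256.
    hipLaunchKernelGGL(bounded_kernel, dim3(1), dim3(256), 0, 0);
}
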
@@ -76,90 +76,10 @@ struct ThreadwiseGemm_km_kn_mn_v3
            });
        }
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
    template <typename FloatA, typename FloatB, typename FloatC>
    __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
    {
        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
                          CDesc::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto M = CDesc{}.GetLength(I0);
        constexpr auto N = CDesc{}.GetLength(I1);
        constexpr auto K = ADesc{}.GetLength(I0);
        static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
        static_for<0, K, 1>{}([&](auto k) {
            static_for<0, M, 1>{}([&](auto m) {
                constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
                if constexpr(N == 2)
                {
                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
                    amd_assembly_outer_product_1x2(p_a[a_offset],
                                                   p_b[b_offset_0],
                                                   p_b[b_offset_1],
                                                   p_c[c_offset_0],
                                                   p_c[c_offset_1]);
                }
                else if constexpr(N == 4)
                {
                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
                    constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
                    constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
                    constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
                    constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
                    amd_assembly_outer_product_1x4(p_a[a_offset],
                                                   p_b[b_offset_0],
                                                   p_b[b_offset_1],
                                                   p_b[b_offset_2],
                                                   p_b[b_offset_3],
                                                   p_c[c_offset_0],
                                                   p_c[c_offset_1],
                                                   p_c[c_offset_2],
                                                   p_c[c_offset_3]);
                }
            });
        });
    }
#endif
    template <typename FloatA, typename FloatB, typename FloatC>
    __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
    {
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
        constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
                                     ((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
                                      (is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
                                      (is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
        if constexpr(has_amd_asm)
        {
            Run_amd_asm(p_a, p_b, p_c);
        }
        else
        {
            Run_source(p_a, p_b, p_c);
        }
#else
        Run_source(p_a, p_b, p_c);
#endif
    }
};
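
The removed Run_amd_asm path expresses each inner step as a 1xN outer-product accumulate via amd_assembly_outer_product_1x2/1x4. As a reference for what those primitives compute, here is a plain C++ equivalent of the 1x4 form, assuming the usual multiply-accumulate semantics implied by the call signature above (an interpretation of the interface, not the removed asm body itself):

#include <hip/hip_runtime.h>

// c[i] += a * b[i] for i = 0..3: a scalar reference of the 1x4 outer-product
// accumulate step used by the removed inline-asm path (hedged sketch).
__device__ inline void outer_product_1x4_ref(
    float a,
    float b0, float b1, float b2, float b3,
    float& c0, float& c1, float& c2, float& c3)
{
    c0 += a * b0;
    c1 += a * b1;
    c2 += a * b2;
    c3 += a * b3;
}
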
@@ -68,14 +68,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
#endif
    // cdata = 16, BlockSize = 64, 16x64x4
    constexpr index_t BlockSize = 64;
    constexpr index_t BlockSize = 256;
    constexpr index_t KPerBlock = 16;
    constexpr index_t HPerBlock = 16;
    constexpr index_t WPerBlock = 16;
    constexpr index_t CYXPerBlock = 4;
    constexpr index_t KPerThread = 16;
    constexpr index_t KPerThread = 4;
    constexpr index_t HPerThread = 2;
    constexpr index_t WPerThread = 2;
    constexpr index_t CYXPerThread = 4;
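
Assuming one thread per KPerThread x HPerThread x WPerThread sub-tile (an assumption about this driver's decomposition, not stated in the diff), the per-block thread count implied by these tile sizes is (KPerBlock/KPerThread) * (HPerBlock/HPerThread) * (WPerBlock/WPerThread). With the old KPerThread = 16 that is 1 * 8 * 8 = 64, matching the old BlockSize = 64; with KPerThread = 4 it becomes 4 * 8 * 8 = 256, consistent with the new BlockSize = 256 and the __launch_bounds__(256, 2) change above. A compile-time check of that arithmetic:

// Hedged sketch: verify that the new tile sizes imply 256 threads per block
// under the one-thread-per-sub-tile assumption described above.
constexpr int KPerBlock = 16, HPerBlock = 16, WPerBlock = 16;
constexpr int KPerThread = 4, HPerThread = 2, WPerThread = 2;

constexpr int ThreadsPerBlock =
    (KPerBlock / KPerThread) * (HPerBlock / HPerThread) * (WPerBlock / WPerThread);

static_assert(ThreadsPerBlock == 256, "tile sizes and BlockSize are out of sync");
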
@@ -631,7 +631,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", to_multi_index(ConvStrides{}));
    print_array("ConvDilations", to_multi_index(ConvDilations{}));
#if 0
#if 1
    using in_data_t = float;
    constexpr index_t in_vector_size = 1;
    using out_data_t = float;
@@ -756,22 +756,6 @@ int main(int argc, char* argv[])
        LeftPads{},
        RightPads{},
        nrepeat);
#elif 1
    device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk<in_data_t,
                                                                         in_vector_size,
                                                                         acc_data_t,
                                                                         out_data_t>(
        in_nchw_desc,
        in_nchw,
        wei_kcyx_desc,
        wei_kcyx,
        out_nkhw_desc,
        out_nkhw_device,
        ConvStrides{},
        ConvDilations{},
        LeftPads{},
        RightPads{},
        nrepeat);
#endif
    if(do_verification)