"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "d754ce5f3b9d012131f147bb5ddc261402b62adf"
Commit 33d1e0e2 authored by Chao Liu's avatar Chao Liu
Browse files

refactoring for miopen

parent b1cb48a0
@@ -264,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #if 1
                 blockwise_batch_gemm.Run
 #else
-                blockwise_batch_gemm.Run_asm
+                blockwise_batch_gemm.Run_amd_asm
 #endif
                 (p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                  p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
@@ -191,7 +191,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #if 0
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
@@ -206,7 +206,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #if 1
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
@@ -201,7 +201,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 #if 1
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
@@ -203,7 +203,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
 #if 1
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
-            return blockwise_batch_gemm.Run_asm(Xs...);
+            return blockwise_batch_gemm.Run_amd_asm(Xs...);
 #else
             return blockwise_batch_gemm.Run_asm_v2(Xs...);
 #endif
@@ -238,7 +238,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
 #elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 1
-                blockwise_gemm.Run_asm
+                blockwise_gemm.Run_amd_asm
 #endif
                 (p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
                  p_in_block + y * Wi + x,
@@ -271,7 +271,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 0
-                blockwise_gemm.Run_asm
+                blockwise_gemm.Run_amd_asm
 #endif
                 (p_wei_block_now +
                      wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
@@ -313,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 0
-                blockwise_gemm.Run_asm
+                blockwise_gemm.Run_amd_asm
 #endif
                 (p_wei_block_double +
                      wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
@@ -339,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 #elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #elif 0
-                blockwise_gemm.Run_asm
+                blockwise_gemm.Run_amd_asm
 #endif
                 (p_wei_block_double + wei_block_space +
                      wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
@@ -217,7 +217,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
 #if 1
             return blockwise_gemm.Run(Xs...);
 #else
-            return blockwise_gemm.Run_asm(Xs...);
+            return blockwise_gemm.Run_amd_asm(Xs...);
 #endif
         };
@@ -214,7 +214,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
 #if 1
             return blockwise_gemm.Run(Xs...);
 #else
-            return blockwise_gemm.Run_asm(Xs...);
+            return blockwise_gemm.Run_amd_asm(Xs...);
 #endif
         };
@@ -247,7 +247,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 #if 1
             return blockwise_gemm.Run(Xs...);
 #else
-            return blockwise_gemm.Run_asm(Xs...);
+            return blockwise_gemm.Run_amd_asm(Xs...);
 #endif
         };
@@ -9,10 +9,6 @@
 #include "blockwise_gemm.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
 
-#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
-#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
-#endif
-
 namespace ck {
 
 // define B = merge(N0, Ho, Wo)
@@ -239,15 +235,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB>{};
 
-        // choose GEMM implementation here
-        const auto run_blockwise_gemm = [&](auto... Xs) {
-#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
-            return blockwise_gemm.Run_asm(Xs...);
-#else
-            return blockwise_gemm.Run(Xs...);
-#endif
-        };
-
         // LDS allocation for input and weight: be careful of alignment
         constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
                                                 WeiBlockCopyDstDataPerWrite_K,
@@ -255,9 +242,11 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                                                 GemmDataPerReadB);
 
         constexpr index_t in_block_space =
-            in_e_n1_b_n2_block_desc.GetElementSpace(Number<max_align>{});
+            math::integer_divide_ceil(in_e_n1_b_n2_block_desc.GetElementSpace(), max_align) *
+            max_align;
 
-        constexpr index_t wei_block_space = wei_e_k_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t wei_block_space =
+            math::integer_divide_ceil(wei_e_k_block_desc.GetElementSpace(), max_align) * max_align;
 
         __shared__ Float p_in_block_double[2 * in_block_space];
         __shared__ Float p_wei_block_double[2 * wei_block_space];
@@ -309,7 +298,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                                                             p_wei_register_clipboard);
 
                 // LDS double buffer: GEMM on current data
-                run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
+                blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 
                 // LDS double buffer: store next data to LDS
                 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -336,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
                                                         p_wei_register_clipboard);
 
             // LDS double buffer: GEMM on current data
-            run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
+            blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -348,7 +337,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
             __syncthreads();
 
             // LDS double buffer: GEMM on current data
-            run_blockwise_gemm(p_wei_block_double + wei_block_space,
+            blockwise_gemm.Run(p_wei_block_double + wei_block_space,
                                p_in_block_double + in_block_space,
                                p_out_thread);
         }
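The new LDS sizing above replaces the descriptor-side alignment argument with an explicit round-up at the call site: take the unaligned element space and pad it to the next multiple of max_align. A minimal standalone sketch of that arithmetic, where integer_divide_ceil_like is a hypothetical stand-in for the repository's math::integer_divide_ceil:

#include <cstddef>

using index_t = std::size_t;

// stand-in for math::integer_divide_ceil: ceiling of a / b for positive integers
constexpr index_t integer_divide_ceil_like(index_t a, index_t b) { return (a + b - 1) / b; }

// round an unaligned element count up to a multiple of max_align,
// as done for in_block_space / wei_block_space above
constexpr index_t aligned_block_space(index_t element_space_unaligned, index_t max_align)
{
    return integer_divide_ceil_like(element_space_unaligned, max_align) * max_align;
}

static_assert(aligned_block_space(130, 8) == 136, "130 elements padded to the next multiple of 8");
static_assert(aligned_block_space(128, 8) == 128, "already-aligned counts are unchanged");

Padding each buffer size to a multiple of max_align also keeps the second half of the double buffer (offset by in_block_space / wei_block_space) aligned the same way as the first half.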
@@ -107,15 +107,12 @@ struct ConstantTensorDescriptor
         return accumulate_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{});
     }
 
-    template <class Align = Number<1>>
-    __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{})
+    __host__ __device__ static constexpr index_t GetElementSpace()
     {
-        // This is WRONG! align shouldbe applied to the last memory rank, not the last tensor
-        // dimension
         constexpr index_t element_space_unaligned = accumulate_on_sequence(
             (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
 
-        return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
+        return element_space_unaligned;
     }
 
     // emulate constexpr lambda
@@ -234,7 +231,7 @@ struct ConstantTensorDescriptor
         // do carry check in reversed order, starting from lowest dimension
         // don't check the highest dimension
         static_for<0, nDim, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+            constexpr index_t idim = nDim - 1 - IDimReverse;
             constexpr auto IDim = Number<idim>{};
 
             if(carry)
@@ -260,7 +257,7 @@ struct ConstantTensorDescriptor
         // do borrow check in reversed order, starting from lowest dimension
        // don't check the highest dimension
         static_for<0, nDim, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+            constexpr index_t idim = nDim - 1 - IDimReverse;
             constexpr auto IDim = Number<idim>{};
 
             if(borrow)
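With the alignment handling moved out, GetElementSpace reduces to the span of a strided layout: the largest reachable flat offset plus one, i.e. the sum over dimensions of (length - 1) * stride, plus 1. A small runtime sketch of the same formula, using plain arrays instead of the compile-time Sequence machinery:

#include <cassert>
#include <cstddef>

// span of a strided layout: highest reachable flat offset plus one,
// i.e. sum_i (length_i - 1) * stride_i + 1 (the accumulate_on_sequence above)
std::size_t element_space(const std::size_t* lengths, const std::size_t* strides, std::size_t ndim)
{
    std::size_t space = 1;
    for(std::size_t i = 0; i < ndim; ++i)
        space += (lengths[i] - 1) * strides[i];
    return space;
}

int main()
{
    // a packed 2x3x4 tensor: strides {12, 4, 1} -> element space 24
    const std::size_t lengths[] = {2, 3, 4};
    const std::size_t strides[] = {12, 4, 1};
    assert(element_space(lengths, strides, 3) == 24);
    return 0;
}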
@@ -286,7 +286,7 @@ struct Blockwise2dTensorCopy2
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         using Float4 = float4;
         using Float2 = float2;
@@ -565,7 +565,7 @@ struct Blockwise2dTensorCopy3
     __device__ constexpr index_t GetRegisterClipboardSize() const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -714,7 +714,7 @@ struct Blockwise2dTensorCopy3
                 *(reinterpret_cast<const vector_t*>(&p_src[mSrcMyThreadOffset +
                                                            iloop * src_loop_stride]));
 #else
-            static_assert(is_same<float, Float>::value && DataPerRead == 4,
+            static_assert(is_same<float, Float>{} && DataPerRead == 4,
                           "global_load is only for float4");
 
             global_load(reinterpret_cast<vector_t&>(p_clipboard[iloop * DataPerRead]),
@@ -773,7 +773,7 @@ struct Blockwise2dTensorCopy3
             *(reinterpret_cast<vector_t*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]) =
                 *(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]);
 #else
-            static_assert(is_same<float, Float>::value && DataPerRead == 4,
+            static_assert(is_same<float, Float>{} && DataPerRead == 4,
                           "ds_write_b128 is only for float4");
 
             ds_write_b128(reinterpret_cast<const vector_t&>(p_clipboard[iloop * DataPerRead]),
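The recurring `is_same<...>::value` to `is_same<...>{}` change works because the trait is an integral_constant-style type whose instances convert to bool in a constant expression, so a value-initialized trait object can feed static_assert directly. A tiny sketch with the standard trait, assuming ck::is_same behaves like std::is_same in this respect:

#include <type_traits>

// std::is_same<T, U> derives from integral_constant<bool, ...>; a value-initialized
// instance converts to bool at compile time, so both spellings are equivalent.
static_assert(std::is_same<float, float>::value, "explicit ::value");
static_assert(std::is_same<float, float>{}, "implicit conversion of the trait object");

int main() { return 0; }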
@@ -239,7 +239,7 @@ struct Blockwise3dTensorCopy3
     __device__ static constexpr index_t GetRegisterClipboardSize()
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -598,7 +598,7 @@ struct Blockwise4dTensorCopy3
     __device__ constexpr index_t GetRegisterClipboardSize() const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
 
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -295,9 +295,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 #if CK_USE_AMD_INLINE_ASM
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
-                            const FloatB* __restrict__ p_b_block,
-                            FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
+                                const FloatB* __restrict__ p_b_block,
+                                FloatC* __restrict__ p_c_thread) const
     {
         constexpr auto a_block_mtx = BlockMatrixA{};
         constexpr auto b_block_mtx = BlockMatrixB{};
@@ -330,19 +330,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
         // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float\n");
 
         static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                           MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet\n");
 
-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");
 
-        static_assert(
-            BlockMatrixStrideA == 0 && BatchPerThread == 1,
-            "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
+        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
+                      "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
+                      "1 for now\n");
 
         using Float4 = vector_type<float, 4>::MemoryType;
@@ -421,19 +421,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
         // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float\n");
 
         static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                           MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet\n");
 
-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");
 
-        static_assert(
-            BlockMatrixStrideA == 0 && BatchPerThread == 1,
-            "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
+        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
+                      "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
+                      "1 for now\n");
 
         using Float4 = vector_type<float, 4>::MemoryType;
@@ -5,6 +5,10 @@
 #include "ConstantMatrixDescriptor.hpp"
 #include "threadwise_gemm.hpp"
 
+#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
+#endif
+
 namespace ck {
 
 // if following number are power of 2, index calculation shall be greatly reduced:
@@ -51,7 +55,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                           N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
                       "wrong! Cannot evenly divide work among\n");
 
-        static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
+        static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()),
+                                   decltype(GetThreadMatrixCLengths())>{},
                       "wrong! ThreadMatrixC lengths is wrong");
 
         auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -115,11 +120,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     }
 
 #if CK_USE_AMD_INLINE_ASM
-    // TODO: this is not working correctly
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
-                            const FloatB* __restrict__ p_b_block,
-                            FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
+                                const FloatB* __restrict__ p_b_block,
+                                FloatC* __restrict__ p_c_thread) const
     {
         constexpr auto True = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
@@ -156,15 +160,15 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
         // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float");
 
         static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                           MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet");
 
-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read");
 
         using Float4 = vector_type<float, 4>::MemoryType;
@@ -200,9 +204,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 #endif
 
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run(const FloatA* const __restrict__ p_a_block,
-                        const FloatB* const __restrict__ p_b_block,
-                        FloatC* const __restrict__ p_c_thread) const
+    __device__ void Run_source(const FloatA* const __restrict__ p_a_block,
+                               const FloatB* const __restrict__ p_b_block,
+                               FloatC* const __restrict__ p_c_thread) const
     {
         constexpr auto True = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
@@ -291,9 +295,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     }
 
     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_RegisterDoubleBuffer(FloatA* const p_a_block,
-                                             FloatB* const p_b_block,
-                                             FloatC* p_c_thread) const
+    __device__ void RunRegisterDoubleBuffer_source(FloatA* const p_a_block,
+                                                   FloatB* const p_b_block,
+                                                   FloatC* p_c_thread) const
     {
         constexpr auto True = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
@@ -427,6 +431,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                               p_c_thread);
             }
         }
+
+    template <class FloatA, class FloatB, class FloatC>
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread) const
+    {
+#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+        Run_amd_asm(p_a_block, p_b_block, p_c_thread);
+#else
+        Run_source(p_a_block, p_b_block, p_c_thread);
+#endif
+    }
 };
 
 } // namespace ck
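The net effect of this file's changes: the inline-asm and plain C++ GEMM bodies are now named Run_amd_asm and Run_source, and a new Run() wrapper chooses between them at compile time via CK_USE_AMD_INLINE_ASM and the newly introduced, default-on CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM, so the gridwise kernels above can simply call Run(). A stripped-down, hypothetical sketch of that dispatch shape (the two bodies are placeholders, not the real GEMM paths):

#include <cstdio>

#ifndef USE_ASM_GEMM // stand-in for CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define USE_ASM_GEMM 1
#endif

struct BlockwiseGemmSketch
{
    // placeholder for the hand-written inline-asm path
    void Run_amd_asm(const float*, const float*, float*) const { std::puts("asm path"); }

    // placeholder for the plain C++ ("source") path
    void Run_source(const float*, const float*, float*) const { std::puts("source path"); }

    // callers only ever see Run(); the macro picks the implementation once, at compile time
    void Run(const float* a, const float* b, float* c) const
    {
#if USE_ASM_GEMM
        Run_amd_asm(a, b, c);
#else
        Run_source(a, b, c);
#endif
    }
};

int main()
{
    float a[1] = {0}, b[1] = {0}, c[1] = {0};
    BlockwiseGemmSketch{}.Run(a, b, c);
    return 0;
}

Wrapping the default in an #ifndef block lets a build flip the choice with a single -D flag without editing the header.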
@@ -6,6 +6,10 @@
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
+#endif
+
 namespace ck {
 
 // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
@@ -91,7 +95,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
 
-        // for now, only support SubLengths.Get() == 1 on a merged dimension that constains
+        // for now, only support SubLengths == 1 on a merged dimension that constains
         // multiple original dimensions
         static_for<0, nDim, 1>{}([&](auto IDim_) {
             constexpr auto IDim = decltype(IDim_){};
@@ -121,7 +125,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         // partial offset on each dimension
         static_for<0, nDim, 1>{}([&](auto IDim_) {
             constexpr auto IDim = decltype(IDim_){};
-            constexpr index_t idim = IDim.Get();
+            constexpr index_t idim = IDim;
 
             constexpr auto src_partial_original_dims =
                 SrcDesc::GetContainedOriginalDimensions(IDim);
@@ -135,7 +139,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         static_for<0, nDim, 1>{}([&](auto IDim_) {
             constexpr auto IDim = decltype(IDim_){};
-            constexpr index_t idim = IDim.Get();
+            constexpr index_t idim = IDim;
 
             constexpr auto dst_partial_original_dims =
                 DstDesc::GetContainedOriginalDimensions(IDim);
@@ -153,38 +157,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
         mThreadDstOffset = accumulate_on_array(
             mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
 
-#if 0
-        if(get_block_1d_id() == 0)
-        {
-            printf("id %5u %5u: "
-                   "src_block_data_multi_id_begin: %u %u %u %u, "
-                   "thread_cluster_multi_id: %u %u %u %u, "
-                   "data_cluster_multi_id: %u %u %u %u, "
-                   "thread_data_multi_id_begin: %u %u %u %u, "
-                   "mThreadSrcOffset %u, mThreadDstOffset %u \n",
-                   get_block_1d_id(),
-                   get_thread_local_1d_id(),
-                   src_block_data_multi_id_begin[0],
-                   src_block_data_multi_id_begin[1],
-                   src_block_data_multi_id_begin[2],
-                   src_block_data_multi_id_begin[3],
-                   thread_cluster_multi_id[0],
-                   thread_cluster_multi_id[1],
-                   thread_cluster_multi_id[2],
-                   thread_cluster_multi_id[3],
-                   data_cluster_multi_id[0],
-                   data_cluster_multi_id[1],
-                   data_cluster_multi_id[2],
-                   data_cluster_multi_id[3],
-                   thread_data_multi_id_begin[0],
-                   thread_data_multi_id_begin[1],
-                   thread_data_multi_id_begin[2],
-                   thread_data_multi_id_begin[3],
-                   mThreadSrcOffset,
-                   mThreadDstOffset);
-        }
-#endif
     }
 
     __device__ static constexpr index_t GetRegisterClipboardSize()
@@ -210,19 +182,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-#if 0
-            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
-
-            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-
-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
-
-            const index_t src_offset =
-                SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
-
-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
-#else // HIP compiler performs better with these codes
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
 
             constexpr auto src_thread_data_multi_id_begin =
@@ -236,6 +196,18 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr index_t clipboard_offset =
                 thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+#else
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
+
+            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+
+            const index_t src_offset =
+                SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
+
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
 #endif
 
             threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
@@ -263,18 +235,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-#if 0
-            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
-
-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
-
-            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-
-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
-
-            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
-#else // HIP compiler performs better with these codes
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
 
             constexpr auto clipboard_data_multi_id_begin =
@@ -287,6 +248,17 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr index_t dst_offset =
                 DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
+#else
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
+
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+
+            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+
+            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #endif
 
             threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
@@ -310,7 +282,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     }
 
     // When moving the slicing windows along a merged dimension, if the strides of the
-    // contained (by the merged dimension) original dimensions are in descending order,
+    // contained (by the merged dimension) original dimensions are not in descending order,
     // then there is no guarantee that the new offset will be larger than the old offset
     // for movement in positive direction (vice versue for movement in negative direction).
     // As a result, there is the possiblity that the offset calculation may result in
@@ -323,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
     {
         constexpr auto IDim = Number<IDim_>{};
-        constexpr index_t idim = IDim.Get();
+        constexpr index_t idim = IDim;
 
         static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
             // logic for a merged dimension, also works for non-merged dimension, but its logic may
@@ -350,8 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 constexpr auto I = decltype(I_){};
                 constexpr index_t idim_original = src_partial_original_dims.Get(I);
 
-                mThreadSrcOriginalMultiId(idim_original) =
-                    new_src_partial_original_multi_id[I.Get()];
+                mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I];
             });
 
             // calculate new partial offset on this merged dimension
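Both branches guarded by CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 end in GetOffsetFromMultiIndex; they differ only in whether the multi-index is a compile-time constant or a runtime value. For a plain (non-merged) descriptor the flattening is just a dot product of index and strides (an assumption about the descriptor internals), sketched below as an ordinary function rather than the descriptor's API:

#include <cassert>
#include <cstddef>

// flatten a multi-index against strides: offset = sum_i idx[i] * stride[i]
std::size_t offset_from_multi_index(const std::size_t* idx,
                                    const std::size_t* strides,
                                    std::size_t ndim)
{
    std::size_t offset = 0;
    for(std::size_t i = 0; i < ndim; ++i)
        offset += idx[i] * strides[i];
    return offset;
}

int main()
{
    // packed 2x3x4 layout, strides {12, 4, 1}: index (1, 2, 3) -> 12 + 8 + 3 = 23
    const std::size_t idx[]     = {1, 2, 3};
    const std::size_t strides[] = {12, 4, 1};
    assert(offset_from_multi_index(idx, strides, 3) == 23);
    return 0;
}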
@@ -49,7 +49,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
             make_ConstantTensorDescriptor_packed(thread_cluster_lengths);
 
         // sanity check: data type
-        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float for now!\n");
 
         // sanity check: nDim
         static_assert(SrcDesc::GetNumOfDimension() == nDim &&
@@ -121,12 +121,11 @@ struct BlockwiseTensorSliceReorderCopy_v3
             reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);
 
         static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr auto I = decltype(IDim){};
-            constexpr index_t i = I.Get();
+            constexpr index_t idim = IDim;
 
             // compiler: will it really compute index here, or be merged with
             // GetOffsetFromMultiIndex and
             // optimized away???
-            src_data_multi_id(i) *= src_sub_lengths.Get(I);
+            src_data_multi_id(idim) *= src_sub_lengths.Get(IDim);
         });
 
         // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
@@ -26,16 +26,16 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
     constexpr index_t nshift = NShift::mValue;
 
     constexpr index_t did0_end =
-        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
+        is_same<decltype(I0), IDim>{} ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
 
     constexpr index_t did1_end =
-        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
+        is_same<decltype(I1), IDim>{} ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
 
     constexpr index_t did2_end =
-        is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
+        is_same<decltype(I2), IDim>{} ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
 
     constexpr index_t did3_end =
-        is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
+        is_same<decltype(I3), IDim>{} ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
 
     for(index_t did0 = 0; did0 < did0_end; ++did0)
     {