Commit 88b77181 authored by Chao Liu

rename files, added header guard, added namespace

parent 05e04665
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_tensor_slice_copy.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"
namespace ck {
template <index_t GridSize,
index_t BlockSize,
class Float,
@@ -78,10 +82,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
"wrong! cannot evenly divide work for workgroup ");
constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock);
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock);
constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock);
constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
@@ -103,7 +107,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
// LDS tensor view
// be careful of alignment
constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
constexpr index_t max_align = math::lcm(InBlockReorderDataPerWrite_N,
WeiBlockCopyDataPerRead_K,
GemmDataPerReadA,
GemmDataPerReadB);
@@ -119,7 +123,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -230,7 +234,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#endif
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
threadwise_generic_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
#if 0
const Float* p_in_global_block_offset =
@@ -436,8 +440,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
wo_block_data_begin + wo_thread_data_begin),
make_zero_array<index_t, 10>(),
out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread),
arithmetic_sequence_gen<0, 10, 1>::SeqType{});
arithmetic_sequence_gen<0, 10, 1>::SeqType{},
Number<1>{});
#endif
});
}
};
} // namespace ck
#endif
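The hunks above (and the analogous ones in the files below) replace the old mod_conv:: helpers with math:: equivalents. A minimal sketch of what these constexpr utilities presumably look like; only the names and call sites come from the diff, the bodies are assumed:

namespace ck {
namespace math {

// round-up integer division, e.g. integer_divide_ceil(10, 4) == 3
template <class T>
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
{
    return (a + b - 1) / b;
}

template <class T>
__host__ __device__ constexpr T gcd(T a, T b)
{
    return b == 0 ? a : gcd(b, a % b);
}

template <class T>
__host__ __device__ constexpr T lcm(T a, T b)
{
    return (a / gcd(a, b)) * b;
}

// variadic form used by the max_align computations, e.g.
// lcm(InBlockReorderDataPerWrite_N, WeiBlockCopyDataPerRead_K, ...)
template <class T, class... Ts>
__host__ __device__ constexpr T lcm(T a, T b, Ts... rest)
{
    return lcm(lcm(a, b), rest...);
}

} // namespace math
} // namespace ck

The lcm is what makes max_align safe: an LDS pitch that is a multiple of lcm(a, b, ...) satisfies every vectorized access width (GemmDataPerReadA, WeiBlockCopyDataPerRead_K, and so on) at once.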
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_tensor_slice_copy.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"
namespace ck {
template <index_t GridSize,
index_t BlockSize,
class Float,
@@ -40,7 +44,7 @@ template <index_t GridSize,
class WeiBlockCopyClusterLengths_CK, // not used
index_t WeiBlockCopyDataPerRead_K,
index_t OutThreadCopyDataPerWrite_W>
struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
{
__device__ void Run(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
@@ -81,10 +85,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
"wrong! cannot evenly divide work for workgroup ");
constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock);
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock);
constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock);
constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
@@ -105,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
// LDS tensor view
// be careful of alignment
constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
constexpr index_t max_align = math::lcm(InBlockReorderDataPerWrite_N,
WeiBlockCopyDataPerRead_K,
GemmDataPerReadA,
GemmDataPerReadB);
@@ -121,7 +125,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -233,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
#endif
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
threadwise_generic_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
for(index_t y = 0; y < Y; ++y)
{
@@ -487,8 +491,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
wo_block_data_begin + wo_thread_data_begin),
make_zero_array<index_t, 10>(),
out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread),
arithmetic_sequence_gen<0, 10, 1>::SeqType{});
arithmetic_sequence_gen<0, 10, 1>::SeqType{},
Number<1>{});
#endif
});
}
};
} // namespace ck
#endif
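The *_lds_double_buffer kernels in this commit differ from the plain variants by ping-ponging two LDS buffers, so the global-to-LDS copy of slice i+1 overlaps the blockwise GEMM on slice i. A generic sketch of the pattern, with illustrative names that are not from the diff:

// Skeleton of LDS double buffering: LoadFn stages one slice into LDS,
// ComputeFn consumes a staged slice. Everything here is hypothetical.
template <class Float, index_t BlockSliceSize, class LoadFn, class ComputeFn>
__device__ void run_lds_double_buffered(index_t num_slices, LoadFn load, ComputeFn compute)
{
    __shared__ Float p_lds[2][BlockSliceSize];

    load(p_lds[0], 0); // prologue: stage slice 0

    for(index_t i = 0; i + 1 < num_slices; ++i)
    {
        __syncthreads();
        load(p_lds[(i + 1) % 2], i + 1); // prefetch the next slice into the idle buffer
        compute(p_lds[i % 2]);           // GEMM on the staged slice overlaps the copy
    }

    __syncthreads();
    compute(p_lds[(num_slices - 1) % 2]); // epilogue: last slice
}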
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
@@ -6,6 +8,8 @@
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_gemm.hpp"
namespace ck {
// define B = flatten(N, Hi, Wi)
template <index_t GridSize,
index_t BlockSize,
@@ -181,7 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
// LDS: be careful of alignment
constexpr index_t max_align =
mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
math::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
@@ -275,3 +279,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
}
}
};
} // namespace ck
#endif
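The v2 kernels treat B = flatten(N, Hi, Wi) as a single GEMM dimension, per the comment above. A small sketch of the index arithmetic that convention implies (variable names assumed, row-major flatten order):

// recover (n, hi, wi) from a flattened b index; inverse: b == (n * Hi + hi) * Wi + wi
__device__ void decompose_b(index_t b, index_t Hi, index_t Wi,
                            index_t& n, index_t& hi, index_t& wi)
{
    n  = b / (Hi * Wi);
    hi = b % (Hi * Wi) / Wi;
    wi = b % Wi;
}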
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
namespace ck {
// define B = flatten(N, Hi, Wi)
template <index_t GridSize,
index_t BlockSize,
@@ -185,7 +189,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// LDS: be careful of alignment
constexpr index_t max_align =
mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
math::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
@@ -404,3 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
};
} // namespace ck
#endif
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
namespace ck {
// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
index_t BlockSize,
@@ -146,7 +150,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// be careful of LDS alignment
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
Number<math::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
@@ -218,7 +222,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
};
// LDS allocation for input and weight: be careful of alignment
constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDataPerAccess_K,
GemmDataPerReadA,
GemmDataPerReadB);
@@ -368,3 +372,6 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
}
}
};
} // namespace ck
#endif
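v3 switches to B = merge(N0, Ho, Wo) via ConstantMergedTensorDescriptor. Unlike a plain flatten, the merged dimensions need not be contiguous in memory, so stepping along B involves carry propagation across the underlying indices. An illustrative sketch, not the descriptor's actual code:

// advance a merged (n0, ho, wo) coordinate by one step along B
__device__ void step_merged_b(index_t& n0, index_t& ho, index_t& wo,
                              index_t Ho, index_t Wo)
{
    if(++wo == Wo) // carry out of the fastest dimension
    {
        wo = 0;
        if(++ho == Ho)
        {
            ho = 0;
            ++n0;
        }
    }
}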
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
namespace ck {
// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
index_t BlockSize,
@@ -34,7 +38,7 @@ template <index_t GridSize,
class WeiBlockCopySubLengths_C_K,
class WeiBlockCopyClusterLengths_C_K,
index_t WeiBlockCopyDataPerAccess_K>
struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
{
__device__ void Run(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
@@ -143,7 +147,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
// be careful of LDS alignment
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
Number<math::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
@@ -215,7 +219,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
};
// LDS allocation for input and weight: be careful of alignment
constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDataPerAccess_K,
GemmDataPerReadA,
GemmDataPerReadB);
@@ -395,3 +399,6 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
}
}
};
} // namespace ck
#endif
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_op.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
namespace ck {
// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
@@ -176,7 +180,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// be careful of LDS alignment
constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<EPerBlock, KPerBlock>{},
Number<mod_conv::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
@@ -248,7 +252,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
};
// LDS allocation for input and weight: be careful of alignment
constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDstDataPerWrite_K,
GemmDataPerReadA,
GemmDataPerReadB);
@@ -345,3 +349,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
}
}
};
} // namespace ck
#endif
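v4 reads KCYX weights through a wei_e_k view: in the usual implicit-GEMM formulation the reduction dimension E merges C, Y and X, while B = merge(N0, Ho, Wo) indexes output pixels. A sketch of the coordinate split, with names assumed from the descriptors above:

// split a GEMM reduction index e back into weight coordinates (c, y, x);
// inverse: e == (c * Y + y) * X + x
__device__ void decompose_e(index_t e, index_t Y, index_t X,
                            index_t& c, index_t& y, index_t& x)
{
    c = e / (Y * X);
    y = e % (Y * X) / X;
    x = e % X;
}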
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_op.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
namespace ck {
// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
@@ -42,7 +46,7 @@ template <index_t GridSize,
class WeiBlockCopyDstAccessOrder,
index_t WeiBlockCopySrcDataPerRead_E,
index_t WeiBlockCopyDstDataPerWrite_K>
struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
{
__device__ void Run(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
@@ -165,7 +169,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// be careful of LDS alignment
constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<EPerBlock, KPerBlock>{},
Number<mod_conv::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
@@ -237,7 +241,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
};
// LDS allocation for input and weight: be careful of alignment
constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDstDataPerWrite_K,
GemmDataPerReadA,
GemmDataPerReadB);
@@ -410,3 +414,6 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
}
}
};
} // namespace ck
#endif
#ifndef CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER
#define CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER
namespace ck {
template <class GridwiseConvolution, class T>
__global__ void run_gridwise_convolution_kernel(const T* const __restrict__ p_in_global,
const T* const __restrict__ p_wei_global,
T* const __restrict__ p_out_global)
{
GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global);
}
} // namespace ck
#endif
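A minimal sketch of how the wrapper above might be launched from the host (HIP); the gridwise type, launch sizes and pointers are hypothetical placeholders, not values from this commit:

#include <hip/hip_runtime.h>

template <class GridwiseConv>
void launch_conv(const float* p_in, const float* p_wei, float* p_out,
                 unsigned grid_size, unsigned block_size)
{
    // grid_size/block_size must match the GridSize/BlockSize template
    // parameters baked into GridwiseConv
    ck::run_gridwise_convolution_kernel<GridwiseConv, float>
        <<<dim3(grid_size), dim3(block_size), 0, 0>>>(p_in, p_wei, p_out);
}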
#pragma once
template <class GridwiseConvolution, class T>
__global__ void run_gridwise_convolution(const T* const __restrict__ p_in_global,
const T* const __restrict__ p_wei_global,
T* const __restrict__ p_out_global)
{
GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global);
}
@@ -7,6 +7,8 @@
#include "threadwise_4d_tensor_op.hpp"
#include "threadwise_direct_convolution.hpp"
namespace ck {
template <class TInWei,
class TOut,
class TAccum,
@@ -253,3 +255,5 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
wo_block_data_begin + wo_thread_data_begin),
out_nkhw_thread_desc.GetLengths());
}
} // namespace ck
@@ -7,6 +7,8 @@
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_gemm.hpp"
namespace ck {
template <index_t GridSize,
index_t BlockSize,
class Float,
@@ -292,3 +294,5 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}
} // namespace ck
#pragma once
#ifndef CK_INTEGRAL_CONSTANT_HPP
#define CK_INTEGRAL_CONSTANT_HPP
namespace ck {
template <class T, T N>
struct integral_constant
@@ -16,3 +19,6 @@ __host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_c
template <index_t N>
using Number = integral_constant<index_t, N>;
} // namespace ck
#endif
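Number<N> is the compile-time index type threaded through all the kernels above (e.g. the Number<math::lcm(...)>{} alignment arguments). A small usage sketch:

// compile-time arithmetic via the operator+ declared above
constexpr auto two  = ck::Number<2>{};
constexpr auto four = ck::Number<4>{};
constexpr auto six  = two + four; // has type ck::Number<6>
static_assert(six.value == 6, "folded at compile time");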
#pragma once
#ifndef CK_TENSOR_HPP
#define CK_TENSOR_HPP
#include <thread>
#include <vector>
#include <numeric>
@@ -266,3 +268,5 @@ struct Tensor
TensorDescriptor mDesc;
std::vector<T> mData;
};
#endif
#pragma once
#ifndef CK_THREADWISE_4D_TENSOR_OP_HPP
#define CK_THREADWISE_4D_TENSOR_OP_HPP
#include "ConstantTensorDescriptor.hpp"
namespace ck {
template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
@@ -50,3 +54,6 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
}
}
}
} // namespace ck
#endif
#pragma once
#ifndef CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#define CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#include "ConstantTensorDescriptor.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
// optimized for the scenario where p_in, p_wei and p_out are in registers
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
@@ -218,3 +222,6 @@ __device__ void threadwise_direct_convolution_3(InDesc,
}
#endif
}
} // namespace ck
#endif
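The threadwise_direct_convolution_* routines above compute a small convolution entirely out of registers. A plain-loop sketch of what they implement (stride 1, no padding; the descriptor-based offset calls are illustrative):

// out[n][k][ho][wo] += wei[k][c][y][x] * in[n][c][ho + y][wo + x]
for(index_t n = 0; n < N; ++n)
    for(index_t k = 0; k < K; ++k)
        for(index_t ho = 0; ho < Ho; ++ho)
            for(index_t wo = 0; wo < Wo; ++wo)
                for(index_t c = 0; c < C; ++c)
                    for(index_t y = 0; y < Y; ++y)
                        for(index_t x = 0; x < X; ++x)
                            p_out[out_desc.GetOffsetFromMultiIndex(n, k, ho, wo)] +=
                                p_wei[wei_desc.GetOffsetFromMultiIndex(k, c, y, x)] *
                                p_in[in_desc.GetOffsetFromMultiIndex(n, c, ho + y, wo + x)];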
#pragma once
#ifndef CK_THREADWISE_GEMM_HPP
#define CK_THREADWISE_GEMM_HPP
#include "common.hpp"
#include "ConstantMatrixDescriptor.hpp"
namespace ck {
template <class Float, class Matrix>
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{
@@ -114,3 +118,6 @@ __device__ void threadwise_gemm(MatrixA,
assert(false);
}
}
} // namespace ck
#endif
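threadwise_gemm (elided above) multiplies small register-resident tiles. In this codebase the A operand is conventionally consumed transposed, so a scalar sketch of the computation would be as below; the row-major layouts are an assumption, not taken from the diff:

// c[m][n] += a[k][m] * b[k][n], i.e. C += A^T * B on register tiles
for(index_t k = 0; k < K; ++k)
    for(index_t m = 0; m < M; ++m)
        for(index_t n = 0; n < N; ++n)
            p_c[m * N + n] += p_a[k * M + m] * p_b[k * N + n];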
#ifndef CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
namespace ck {
template <class Float, class TDesc>
__device__ void threadwise_generic_tensor_set_zero(TDesc, Float* __restrict__ p)
{
static_ford<decltype(TDesc::GetLengths())>{}([&](auto multi_id) {
constexpr index_t offset = TDesc::GetOffsetFromMultiIndex(multi_id);
p[offset] = static_cast<Float>(0);
});
}
} // namespace ck
#endif
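threadwise_generic_tensor_set_zero above visits every coordinate with static_ford, which unrolls the loop nest at compile time and computes each offset as a constexpr. For a rank-2 descriptor it is equivalent to the runtime loops below (sketch):

// what the unrolled static_ford amounts to for lengths {L0, L1}
for(index_t i0 = 0; i0 < L0; ++i0)
    for(index_t i1 = 0; i1 < L1; ++i1)
        p[desc.GetOffsetFromMultiIndex(i0, i1)] = 0;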
#pragma once
#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
namespace ck {
template <class Float,
class SrcDesc,
class DstDesc,
@@ -97,3 +101,6 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
});
#endif
}
} // namespace ck
#endif
#pragma once
#ifndef CK_THREADWISE_TENSOR_SLICE_COPY_HPP
#define CK_THREADWISE_TENSOR_SLICE_COPY_HPP
#include "ConstantTensorDescriptor.hpp"
namespace ck {
// need to assume src and dst are aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_tensor_slice_copy(SrcDesc,
@@ -192,3 +196,6 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
});
});
}
} // namespace ck
#endif
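The comment at the top of this file warns that src and dst must be aligned: threadwise_tensor_slice_copy moves DataPerRead scalars per transaction along the fastest-varying dimension. A sketch of the core of such a vectorized copy, for the DataPerRead == 4, float case (the vector type choice is an assumption):

// both pointers must be 16-byte aligned and count a multiple of 4
__device__ void copy_float4(const float* __restrict__ p_src,
                            float* __restrict__ p_dst,
                            index_t count)
{
    auto* src_vec = reinterpret_cast<const float4*>(p_src);
    auto* dst_vec = reinterpret_cast<float4*>(p_dst);

    for(index_t i = 0; i < count / 4; ++i)
        dst_vec[i] = src_vec[i];
}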