"...composable_kernel-1.git" did not exist on "53ea4713af15e43f5b11816f20c56f6fc9c7611f"
Commit e43d7bc6 authored by Chao Liu

refactor

parent d058d164
@@ -270,7 +270,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,

     for(index_t i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(
+        constexpr auto gridwise_conv =
 #if 1
             gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
 #else
@@ -301,12 +301,14 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
             WeiBlockCopyThreadPerDim0,
             WeiBlockCopyThreadPerDim1,
             InBlockCopyDataPerRead,
-            WeiBlockCopyDataPerRead>,
-            dim3(GridSize),
-            dim3(BlockSize),
-            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+            WeiBlockCopyDataPerRead>();
+
+        float time = launch_kernel(gridwise_conv.Run,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

         printf("Elapsed time : %f ms\n", time);
         usleep(std::min(time * 1000, float(10000)));
...
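Note: the hunk above changes the timing loop so that the kernel template is first instantiated into a constexpr object (gridwise_conv) and its Run member is what gets handed to launch_kernel, instead of passing the kernel function template directly. A minimal host-only sketch of that calling convention; launch_and_time and GridwiseConvSketch are illustrative names, not code from this repository, and the real launch_kernel runs on the GPU with grid/block dimensions.

#include <chrono>
#include <cstdio>
#include <utility>

// Generic launcher sketch: forwards any callable plus its arguments and
// reports elapsed wall-clock time in milliseconds. Host-only stand-in for
// the repository's launch_kernel, just to illustrate the calling shape.
template <class F, class... Args>
float launch_and_time(F&& f, Args&&... args)
{
    const auto t0 = std::chrono::high_resolution_clock::now();
    std::forward<F>(f)(std::forward<Args>(args)...);
    const auto t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<float, std::milli>(t1 - t0).count();
}

// Stand-in for a gridwise kernel object: the kernel is a type whose Run
// member does the work, so the driver can pick the kernel at compile time
// and still hand a plain callable to the launcher.
struct GridwiseConvSketch
{
    static void Run(const float* in, const float* wei, float* out)
    {
        out[0] = in[0] * wei[0]; // placeholder body, not a convolution
    }
};

int main()
{
    float in = 2.f, wei = 3.f, out = 0.f;
    constexpr auto gridwise_conv = GridwiseConvSketch{};
    float time = launch_and_time(gridwise_conv.Run, &in, &wei, &out);
    std::printf("Elapsed time : %f ms (out = %f)\n", time, out);
    return 0;
}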
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image, C = 2048
     constexpr index_t N = 128;
     constexpr index_t C = 2048;
...
@@ -137,7 +137,10 @@ struct ConstantTensorDescriptor
            }
        };

-        return static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + align.Get();
+        index_t element_space_unaligned =
+            static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + 1;
+
+        return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
    }

    template <class... Is>
...
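Note: the new GetElementSpace body rounds the unaligned element count up to a multiple of align.Get() with the usual integer round-up align * ((n + align - 1) / align). A small standalone check of that formula; round_up is an illustrative name, not taken from the repository.

#include <cstddef>

// Round n up to the nearest multiple of align (align > 0).
constexpr std::size_t round_up(std::size_t n, std::size_t align)
{
    return align * ((n + align - 1) / align);
}

int main()
{
    static_assert(round_up(17, 4) == 20, "17 rounded up to a multiple of 4 is 20");
    static_assert(round_up(16, 4) == 16, "already aligned values are unchanged");
    static_assert(round_up(0, 8) == 0, "zero stays zero");
    return 0;
}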
@@ -5,8 +5,6 @@
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"

-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
-
 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }

 __device__ index_t get_block_1d_id() { return blockIdx.x; }
@@ -23,21 +21,45 @@ struct is_same<T, T>
     static const bool value = true;
 };

-#if DEVICE_BACKEND_CUDA
-template <typename T>
-__host__ __device__ constexpr T max(T a, T b)
-{
-    return a > b ? a : b;
-}
-
-template <typename T>
-__host__ __device__ constexpr T min(T a, T b)
-{
-    return a < b ? a : b;
-}
-#endif
-
 __host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
 {
     return (a + b - 1) / b;
 }
+
+namespace mod_conv {
+
+template <class T>
+__host__ __device__ constexpr T max(T x, T y)
+{
+    return x > y ? x : y;
+}
+
+template <class T, class... Ts>
+__host__ __device__ constexpr T max(T x, Ts... xs)
+{
+    static_assert(sizeof...(xs) > 0, "not enough argument");
+
+    auto y = max(xs...);
+
+    static_assert(is_same<decltype(y), T>::value, "not the same type");
+
+    return x > y ? x : y;
+}
+
+template <class T>
+__host__ __device__ constexpr T min(T x, T y)
+{
+    return x < y ? x : y;
+}
+
+template <class T, class... Ts>
+__host__ __device__ constexpr T min(T x, Ts... xs)
+{
+    static_assert(sizeof...(xs) > 0, "not enough argument");
+
+    auto y = min(xs...);
+
+    static_assert(is_same<decltype(y), T>::value, "not the same type");
+
+    return x < y ? x : y;
+}
+
+}
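Note: the mod_conv::max and mod_conv::min overloads added above reduce any number of same-typed arguments at compile time, bottoming out in the two-argument overload. A self-contained host-side sketch of the same pattern; the sketch namespace and the test values are illustrative only.

#include <type_traits>

namespace sketch {

template <class T>
constexpr T max(T x, T y)
{
    return x > y ? x : y;
}

// Variadic overload: recursively reduce the tail, insisting every argument
// has the same type, as the diff above does with is_same.
template <class T, class... Ts>
constexpr T max(T x, Ts... xs)
{
    static_assert(sizeof...(xs) > 0, "not enough argument");
    auto y = max(xs...);
    static_assert(std::is_same<decltype(y), T>::value, "not the same type");
    return x > y ? x : y;
}

} // namespace sketch

static_assert(sketch::max(3, 7, 5) == 7, "max over three ints");
static_assert(sketch::max(2, 9, 4, 8) == 9, "max over four ints");

int main() { return 0; }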
@@ -59,12 +59,12 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
     constexpr auto out_block_desc =
         make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());

-    constexpr index_t in_block_size = in_block_desc.GetElementSpace();
-    constexpr index_t wei_block_size = wei_block_desc.GetElementSpace();
+    constexpr index_t in_block_element_size = in_block_desc.GetElementSpace();
+    constexpr index_t wei_block_element_size = wei_block_desc.GetElementSpace();
     constexpr index_t out_block_size = out_block_desc.GetElementSpace();

-    __shared__ Float p_in_block[in_block_size];
-    __shared__ Float p_wei_block[wei_block_size];
+    __shared__ Float p_in_block[in_block_element_size];
+    __shared__ Float p_wei_block[wei_block_element_size];
     __shared__ Float p_out_block[out_block_size];

     const index_t block_id = blockIdx.x;
...
@@ -63,17 +63,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
         Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});

     // shared mem
-    constexpr index_t in_block_size =
+    constexpr index_t in_block_element_size =
         in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-    constexpr index_t wei_block_size =
+    constexpr index_t wei_block_element_size =
         wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

     constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

-    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
-    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+    __shared__ Float p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
+    __shared__ Float
+        p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];

     // threadwise tensors
     constexpr index_t HiPerThread = HoPerThread + Y - 1;
...
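Note: the ternary selection of max_align above picks the larger of the two per-read widths, and the LDS arrays are padded up to a multiple of it. The same value could also be computed with the mod_conv::max helper introduced earlier in this commit; a hedged, standalone illustration follows, where the index_t alias and the example parameter values are assumptions, not taken from the kernel.

#include <cstdint>

using index_t = std::uint32_t;

namespace mod_conv {
template <class T>
constexpr T max(T x, T y) { return x > y ? x : y; }
} // namespace mod_conv

// Example parameters; in the kernel these are template parameters.
constexpr index_t InBlockCopyDataPerRead  = 4;
constexpr index_t WeiBlockCopyDataPerRead = 2;

// Same value as the ternary expression in the hunk above.
constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);

static_assert(max_align == 4, "larger of the two per-read widths");

int main() { return 0; }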
@@ -73,10 +73,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
         Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});

     // shared mem
-    constexpr index_t in_block_size =
+    constexpr index_t in_block_element_size =
         in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-    constexpr index_t wei_block_size =
+    constexpr index_t wei_block_element_size =
         wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

     constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
@@ -84,9 +84,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
                                       : WeiBlockCopyDataPerRead;

     __shared__ in_vector_mem_t
-        p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
+        p_in_vec_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
     __shared__ in_vector_mem_t
-        p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+        p_wei_vec_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];

     // threadwise tensors
     constexpr index_t HiPerThread = HoPerThread + Y - 1;
...
@@ -164,18 +164,19 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
                                       HoPerThread>{};

     // LDS: be careful of alignment
-    constexpr index_t in_block_size =
+    constexpr index_t in_block_element_size =
         in_chwn_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-    constexpr index_t wei_block_size =
+    constexpr index_t wei_block_element_size =
         wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

     constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

-    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
-    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+    __shared__ Float p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
+    __shared__ Float
+        p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];

     // register
     Float p_out_thread[out_khwn_thread_desc.GetElementSpace()];
...
@@ -204,11 +204,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
                                       true>{};

     // LDS
-    constexpr index_t in_block_size = in_chwn_block_desc.GetElementSpace();
-    constexpr index_t wei_block_size = wei_cyxk_block_desc.GetElementSpace();
+    constexpr index_t in_block_element_size = in_chwn_block_desc.GetElementSpace();
+    constexpr index_t wei_block_element_size = wei_cyxk_block_desc.GetElementSpace();

-    __shared__ Float p_in_block[in_block_size];
-    __shared__ Float p_wei_block[wei_block_size];
+    __shared__ Float p_in_block[in_block_element_size];
+    __shared__ Float p_wei_block[wei_block_element_size];

     // register
     Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];
...
@@ -10,11 +10,9 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
     constexpr auto src_mtx = SrcMatrix{};
     constexpr auto dst_mtx = DstMatrix{};

-#if 1
-    //NRow = 1
+#if 0
     for(index_t i = 0; i < NRow; ++i)
     {
-        //NCol = 4
         for(index_t j = 0; j < NCol; ++j)
         {
             const index_t src_index = src_mtx.Get1dIndex(i, j);
@@ -23,7 +21,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
             p_dst[dst_index] = p_src[src_index];
         }
     }
-#elif 0
+#elif 1
     static_assert(NCol == 4, "only for NCol == 4");

     using vector_t = typename vector_type<Float, 4>::MemoryType;
@@ -33,22 +31,8 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
         const index_t src_index = src_mtx.Get1dIndex(i, 0);
         const index_t dst_index = dst_mtx.Get1dIndex(i, 0);

-#if 0
-        *(reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
-            *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
-#elif 0
-        asm volatile("\n \
-            ds_read2_b64 %0, %1 offset1:1 \n \
-            s_waitcnt lgkmcnt(0)"
-                     : "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
-                     : "v"(__to_local((void*)(&p_src[src_index]))));
-#elif 1
-        asm volatile("\n \
-            ds_read_b128 %0, %1 \n \
-            s_waitcnt lgkmcnt(0)"
-                     : "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
-                     : "v"(__to_local((void*)(&p_src[src_index]))));
-#endif
+        *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
+            *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
     }
 #endif
 }
@@ -84,13 +68,10 @@ __device__ void threadwise_gemm(MatrixA,
     constexpr index_t N = c_mtx.NCol();
     constexpr index_t K = a_mtx.NRow(); // A is transposed

-    // K = 1
     for(index_t k = 0; k < K; ++k)
     {
-        // M = 8
         for(index_t i = 0; i < M; ++i)
         {
-            // N = 8
             for(index_t j = 0; j < N; ++j)
             {
                 const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
...
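Note: the final hunks drop the ds_read inline-assembly paths (and the now-removed __to_local declaration they relied on) and keep the plain 4-wide vector load/store through reinterpret_cast, besides deleting the hard-coded loop-count comments. A standalone host-side sketch of that row-copy pattern follows, with a simple float4_t aggregate standing in for the repository's vector_type<Float, 4>::MemoryType; all names here are illustrative.

#include <cassert>
#include <cstddef>

// Stand-in for vector_type<float, 4>::MemoryType: one 16-byte chunk.
struct alignas(16) float4_t
{
    float data[4];
};

// Copy nrow rows of exactly 4 floats each as single vector transactions,
// mirroring the #elif 1 branch kept by the diff. Row strides are in floats;
// the reinterpret_cast type pun mirrors the repository's code and assumes
// both row starts are 16-byte aligned.
void copy_rows_vectorized(const float* p_src,
                          std::size_t src_row_stride,
                          float* p_dst,
                          std::size_t dst_row_stride,
                          std::size_t nrow)
{
    for(std::size_t i = 0; i < nrow; ++i)
    {
        *reinterpret_cast<float4_t*>(&p_dst[i * dst_row_stride]) =
            *reinterpret_cast<const float4_t*>(&p_src[i * src_row_stride]);
    }
}

int main()
{
    alignas(16) float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    alignas(16) float dst[8] = {};

    copy_rows_vectorized(src, 4, dst, 4, 2); // two rows of four floats

    assert(dst[0] == 0.f && dst[5] == 5.f && dst[7] == 7.f);
    return 0;
}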