"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "85c1ff1ceae66003a06d445126e17af4bc683ae4"
Commit 2c9b8c24 authored by Chao Liu's avatar Chao Liu
Browse files

update hip build

parent 0c88a3d8
...@@ -47,7 +47,7 @@ struct GeneratorTensor_3 ...@@ -47,7 +47,7 @@ struct GeneratorTensor_3
std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...}; std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
std::vector<std::size_t> lens(sizeof...(Is), 100); std::vector<std::size_t> lens(sizeof...(Is), 100);
std::vector<std::size_t> strides(sizeof...(Is), 1); std::vector<std::size_t> strides(sizeof...(Is), 1);
std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1); std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1);
return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1; return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
#endif #endif
} }
......
...@@ -245,7 +245,8 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -245,7 +245,8 @@ struct BlockwiseChwnTensorCopyPadded
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
const Float* p_src_tmp = const Float* p_src_tmp =
p_src + src_desc.Get1dIndex(c_block_data_begin, p_src +
src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low, (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low, (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin); n_block_data_begin);
......
...@@ -93,7 +93,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -93,7 +93,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
Float p_out_thread[out_thread_desc.GetElementSpace()]; Float p_out_thread[out_thread_desc.GetElementSpace()];
threadwise_4d_tensor_copy(out_block_desc, threadwise_4d_tensor_copy(out_block_desc,
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin, k_thread_data_begin,
ho_thread_data_begin, ho_thread_data_begin,
wo_thread_data_begin), wo_thread_data_begin),
...@@ -107,7 +108,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -107,7 +108,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
// threadwise convolution // threadwise convolution
threadwise_direct_convolution_2( threadwise_direct_convolution_2(
in_thread_block_desc, in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data_begin, c_thread_data_begin,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
...@@ -122,7 +124,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -122,7 +124,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
threadwise_4d_tensor_copy(out_thread_desc, threadwise_4d_tensor_copy(out_thread_desc,
p_out_thread, p_out_thread,
out_block_desc, out_block_desc,
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin, k_thread_data_begin,
ho_thread_data_begin, ho_thread_data_begin,
wo_thread_data_begin), wo_thread_data_begin),
......
...@@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
constexpr unsigned MRepeat = MPerThread / MPerThreadSubC; constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
constexpr unsigned NRepeat = NPerThread / NPerThreadSubC; constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
// loop over k // loop over k
#pragma unroll #pragma unroll
for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
{ {
// read first batch of A, B // read first batch of A, B
// copy A-sub to form A // copy A-sub to form A
#pragma unroll #pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{ {
...@@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
a_thread_sub_mtx.GetLengths()); a_thread_sub_mtx.GetLengths());
} }
// copy B-sub to form B // copy B-sub to form B
#pragma unroll #pragma unroll
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{ {
...@@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
b_thread_sub_mtx.GetLengths()); b_thread_sub_mtx.GetLengths());
} }
// loop over batch // loop over batch
#pragma unroll #pragma unroll
for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib) for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
{ {
...@@ -557,7 +557,8 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -557,7 +557,8 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
{ {
threadwise_matrix_copy( threadwise_matrix_copy(
c_thread_sub_mtx, c_thread_sub_mtx,
p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, p_c_thread +
c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
n_repeat * NPerLevel1Cluster), n_repeat * NPerLevel1Cluster),
c_block_mtx, c_block_mtx,
p_c_block + p_c_block +
...@@ -656,7 +657,8 @@ struct BlockwiseGemmBlockABlockBThreadC ...@@ -656,7 +657,8 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned NClusterWork = constexpr unsigned NClusterWork =
(NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);
static_assert(BlockSize == (MClusterWork * MThreadPerCluster) * static_assert(BlockSize ==
(MClusterWork * MThreadPerCluster) *
(NClusterWork * NThreadPerCluster), (NClusterWork * NThreadPerCluster),
"wrong! wrong BlockSize"); "wrong! wrong BlockSize");
...@@ -1256,7 +1258,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -1256,7 +1258,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
c_thread_sub_mtx, c_thread_sub_mtx,
False, False,
p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, p_c_thread +
c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
n_repeat * NPerThreadSubC), n_repeat * NPerThreadSubC),
f_accum); f_accum);
} }
......
...@@ -65,7 +65,7 @@ struct vector_type<half_float::half, 8> ...@@ -65,7 +65,7 @@ struct vector_type<half_float::half, 8>
}; };
#endif #endif
#if 1 #if 0
template <> template <>
struct vector_type<half, 1> struct vector_type<half, 1>
{ {
...@@ -139,6 +139,7 @@ struct Sequence ...@@ -139,6 +139,7 @@ struct Sequence
} }
}; };
#if DEVICE_BACKEND_CUDA
template <typename T> template <typename T>
__host__ __device__ constexpr T max(T a, T b) __host__ __device__ constexpr T max(T a, T b)
{ {
...@@ -150,6 +151,7 @@ __host__ __device__ constexpr T min(T a, T b) ...@@ -150,6 +151,7 @@ __host__ __device__ constexpr T min(T a, T b)
{ {
return a < b ? a : b; return a < b ? a : b;
} }
#endif
__host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b) __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
{ {
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#if DEVICE_BACKEND_HIP #if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#include "half.hpp"
#elif DEVICE_BACKEND_CUDA #elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "nvToolsExt.h" #include "nvToolsExt.h"
......
...@@ -113,7 +113,8 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ ...@@ -113,7 +113,8 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
c_block_work_begin += CPerBlock) c_block_work_begin += CPerBlock)
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin, c_block_work_begin,
hi_block_work_begin, hi_block_work_begin,
wi_block_work_begin), wi_block_work_begin),
...@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ ...@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
} }
// copy output tensor from LDS to device mem // copy output tensor from LDS to device mem
blockwise_out_copy.Run(p_out_block, blockwise_out_copy.Run(
p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, p_out_block,
k_block_work_begin, p_out_global +
ho_block_work_begin, out_global_desc.Get1dIndex(
wo_block_work_begin)); n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
} }
...@@ -139,7 +139,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -139,7 +139,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
c_block_data_begin += CPerBlock, __syncthreads()) c_block_data_begin += CPerBlock, __syncthreads())
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin, c_block_data_begin,
hi_block_data_begin, hi_block_data_begin,
wi_block_data_begin), wi_block_data_begin),
...@@ -158,7 +159,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -158,7 +159,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#if 1 #if 1
threadwise_direct_convolution_2( threadwise_direct_convolution_2(
in_thread_block_desc, in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
...@@ -169,7 +171,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -169,7 +171,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#elif 0 #elif 0
threadwise_direct_convolution_3( threadwise_direct_convolution_3(
in_thread_block_desc, in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
...@@ -186,7 +189,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -186,7 +189,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
out_thread_desc, out_thread_desc,
p_out_thread, p_out_thread,
out_global_desc, out_global_desc,
p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, p_out_global +
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin, k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin), wo_block_data_begin + wo_thread_data_begin),
......
...@@ -184,7 +184,8 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric ...@@ -184,7 +184,8 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread); threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);
const Float* p_in_global_block_begin = const Float* p_in_global_block_begin =
p_in_global + in_chwn_global_desc.Get1dIndex( p_in_global +
in_chwn_global_desc.Get1dIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
const Float* p_wei_global_block_begin = const Float* p_wei_global_block_begin =
...@@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric ...@@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
} }
} }
// output: register to global mem, // output: register to global mem,
#if 0 #if 0
const auto c_thread_mtx_begin = const auto c_thread_mtx_begin =
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
...@@ -286,11 +287,12 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric ...@@ -286,11 +287,12 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
} }
#endif #endif
threadwise_8d_tensor_copy(out_8d_thread_desc, threadwise_8d_tensor_copy(
out_8d_thread_desc,
p_out_thread, p_out_thread,
out_8d_global_desc, out_8d_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex( p_out_global +
k_block_data_begin + k_thread_data_begin, out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
......
...@@ -283,7 +283,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( ...@@ -283,7 +283,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
out_hkwn_thread_desc, out_hkwn_thread_desc,
p_out_thread, p_out_thread,
out_khwn_global_desc, out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
......
...@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) ...@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
return os; return os;
} }
typedef enum typedef enum {
{
Half = 0, Half = 0,
Float = 1, Float = 1,
} DataType_t; } DataType_t;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment