Commit b2888adf authored by Chao Liu
Browse files

change file extension to hip.hpp and hip.cpp

parent a414e3fd
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_direct_convolution_1.cuh" #include "gridwise_direct_convolution_1.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_1(InDesc, void device_direct_convolution_1(InDesc,
......
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_direct_convolution_2.cuh" #include "gridwise_direct_convolution_2.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2(InDesc, void device_direct_convolution_2(InDesc,
......
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
......
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh" #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads> template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
......
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh" #include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc, void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc,
......
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
......
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
......
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
......
...@@ -4,17 +4,17 @@ ...@@ -4,17 +4,17 @@
#include <cstdlib> #include <cstdlib>
#include "config.h" #include "config.h"
#include "tensor.hpp" #include "tensor.hpp"
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.hip.hpp"
#include "conv_common.cuh" #include "conv_common.hip.hpp"
#include "device_direct_convolution_1.cuh" #include "device_direct_convolution_1.hpp"
#include "device_direct_convolution_2.cuh" #include "device_direct_convolution_2.hpp"
#include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh" #include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp"
#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" #include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.hpp"
#include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" #include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp"
//#include "device_winograd_convolution.cuh" //#include "device_winograd_convolution.hip.hpp"
struct GeneratorTensor_1 struct GeneratorTensor_1
{ {
......
#pragma once #pragma once
#include "common.cuh" #include "common.hip.hpp"
template <unsigned NRow_, unsigned NCol_, unsigned RowStride_> template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
struct ConstantMatrixDescriptor struct ConstantMatrixDescriptor
......
#pragma once #pragma once
#include "common.cuh" #include "common.hip.hpp"
// this is ugly, only for 2d // this is ugly, only for 2d
template <unsigned L0, unsigned L1> template <unsigned L0, unsigned L1>
......
#pragma once #pragma once
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.hip.hpp"
template <unsigned BlockSize, class Float, class DstDesc, class F> template <unsigned BlockSize, class Float, class DstDesc, class F>
__device__ void __device__ void
......
#pragma once #pragma once
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.hip.hpp"
template <unsigned BlockSize, class Float, class DstDesc, class F> template <unsigned BlockSize, class Float, class DstDesc, class F>
__device__ void __device__ void
...@@ -245,11 +245,10 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -245,11 +245,10 @@ struct BlockwiseChwnTensorCopyPadded
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
const Float* p_src_tmp = const Float* p_src_tmp =
p_src + p_src + src_desc.Get1dIndex(c_block_data_begin,
src_desc.Get1dIndex(c_block_data_begin, (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low, (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low, n_block_data_begin);
n_block_data_begin);
#if 0 #if 0
if(get_thread_local_1d_id() == 0) if(get_thread_local_1d_id() == 0)
......
#pragma once #pragma once
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.hip.hpp"
#include "threadwise_4d_tensor_op.cuh" #include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.cuh" #include "threadwise_direct_convolution.hip.hpp"
template <unsigned BlockSize, template <unsigned BlockSize,
class Float, class Float,
...@@ -95,11 +95,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -95,11 +95,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
Float p_out_thread[out_thread_desc.GetElementSpace()]; Float p_out_thread[out_thread_desc.GetElementSpace()];
threadwise_4d_tensor_copy(out_block_desc, threadwise_4d_tensor_copy(out_block_desc,
p_out_block + p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
out_block_desc.Get1dIndex(n_thread_data_begin, k_thread_data_begin,
k_thread_data_begin, ho_thread_data_begin,
ho_thread_data_begin, wo_thread_data_begin),
wo_thread_data_begin),
out_thread_desc, out_thread_desc,
p_out_thread, p_out_thread,
out_thread_desc.GetLengths()); out_thread_desc.GetLengths());
...@@ -110,11 +109,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -110,11 +109,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
// threadwise convolution // threadwise convolution
threadwise_direct_convolution_2( threadwise_direct_convolution_2(
in_thread_block_desc, in_thread_block_desc,
p_in_block + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
in_block_desc.Get1dIndex(n_thread_data_begin, c_thread_data_begin,
c_thread_data_begin, hi_thread_data_begin,
hi_thread_data_begin, wi_thread_data_begin),
wi_thread_data_begin),
wei_thread_block_desc, wei_thread_block_desc,
p_wei_block + p_wei_block +
wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
...@@ -126,11 +124,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -126,11 +124,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
threadwise_4d_tensor_copy(out_thread_desc, threadwise_4d_tensor_copy(out_thread_desc,
p_out_thread, p_out_thread,
out_block_desc, out_block_desc,
p_out_block + p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
out_block_desc.Get1dIndex(n_thread_data_begin, k_thread_data_begin,
k_thread_data_begin, ho_thread_data_begin,
ho_thread_data_begin, wo_thread_data_begin),
wo_thread_data_begin),
out_thread_desc.GetLengths()); out_thread_desc.GetLengths());
} }
} }
#pragma once #pragma once
#include "threadwise_gemm.cuh" #include "threadwise_gemm.hip.hpp"
template <unsigned BlockSize, template <unsigned BlockSize,
class BlockMatrixA, class BlockMatrixA,
...@@ -305,9 +305,8 @@ struct BlockwiseGemmBlockABlockBThreadC ...@@ -305,9 +305,8 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned NClusterWork = constexpr unsigned NClusterWork =
(NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);
static_assert(BlockSize == static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
(MClusterWork * MThreadPerCluster) * (NClusterWork * NThreadPerCluster),
(NClusterWork * NThreadPerCluster),
"wrong! wrong BlockSize"); "wrong! wrong BlockSize");
if(DistributeThreadAlongColumnFirst) if(DistributeThreadAlongColumnFirst)
...@@ -907,9 +906,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -907,9 +906,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
c_thread_sub_mtx, c_thread_sub_mtx,
False, False,
p_c_thread + p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, n_repeat * NPerThreadSubC),
n_repeat * NPerThreadSubC),
f_accum); f_accum);
} }
} }
......
#pragma once #pragma once
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.hip.hpp"
// this is ugly, only for 4d // this is ugly, only for 4d
template <class InDesc, class WeiDesc> template <class InDesc, class WeiDesc>
......
...@@ -42,8 +42,8 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args) ...@@ -42,8 +42,8 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
hipGetErrorString(hipGetLastError()); hipGetErrorString(hipGetLastError());
#elif DEVICE_BACKEND_CUDA #elif DEVICE_BACKEND_CUDA
const void* f = reinterpret_cast<const void*>(kernel); const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...}; void* p_args[] = {&args...};
timer.Start(); timer.Start();
......
#pragma once #pragma once
#include "common.cuh" #include "common.hip.hpp"
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.cuh" #include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.cuh" #include "blockwise_direct_convolution.hip.hpp"
template <class Float, template <class Float,
class InGlobalDesc, class InGlobalDesc,
...@@ -147,11 +147,10 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ ...@@ -147,11 +147,10 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
c_block_work_begin += CPerBlock) c_block_work_begin += CPerBlock)
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
in_global_desc.Get1dIndex(n_block_work_begin, c_block_work_begin,
c_block_work_begin, hi_block_work_begin,
hi_block_work_begin, wi_block_work_begin),
wi_block_work_begin),
p_in_block); p_in_block);
// copy weight tensor to LDS // copy weight tensor to LDS
...@@ -178,9 +177,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ ...@@ -178,9 +177,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
} }
// copy output tensor from LDS to device mem // copy output tensor from LDS to device mem
blockwise_out_copy.Run( blockwise_out_copy.Run(p_out_block,
p_out_block, p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
p_out_global + k_block_work_begin,
out_global_desc.Get1dIndex( ho_block_work_begin,
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); wo_block_work_begin));
} }
#pragma once #pragma once
#include "common.cuh" #include "common.hip.hpp"
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.cuh" #include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.cuh" #include "blockwise_direct_convolution.hip.hpp"
#include "threadwise_4d_tensor_op.cuh" #include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.cuh" #include "threadwise_direct_convolution.hip.hpp"
template <class Float, template <class Float,
class InGlobalDesc, class InGlobalDesc,
...@@ -163,11 +163,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -163,11 +163,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
c_block_data_begin += CPerBlock, __syncthreads()) c_block_data_begin += CPerBlock, __syncthreads())
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
in_global_desc.Get1dIndex(n_block_data_begin, c_block_data_begin,
c_block_data_begin, hi_block_data_begin,
hi_block_data_begin, wi_block_data_begin),
wi_block_data_begin),
p_in_block); p_in_block);
// copy weight tensor to LDS // copy weight tensor to LDS
...@@ -183,11 +182,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -183,11 +182,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#if 1 #if 1
threadwise_direct_convolution_2( threadwise_direct_convolution_2(
in_thread_block_desc, in_thread_block_desc,
p_in_block + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
in_block_desc.Get1dIndex(n_thread_data_begin, c_thread_data,
c_thread_data, hi_thread_data_begin,
hi_thread_data_begin, wi_thread_data_begin),
wi_thread_data_begin),
wei_thread_block_desc, wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc, out_thread_desc,
...@@ -195,11 +193,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -195,11 +193,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#elif 0 #elif 0
threadwise_direct_convolution_3( threadwise_direct_convolution_3(
in_thread_block_desc, in_thread_block_desc,
p_in_block + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
in_block_desc.Get1dIndex(n_thread_data_begin, c_thread_data,
c_thread_data, hi_thread_data_begin,
hi_thread_data_begin, wi_thread_data_begin),
wi_thread_data_begin),
wei_thread_block_desc, wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc, out_thread_desc,
...@@ -213,10 +210,9 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -213,10 +210,9 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
out_thread_desc, out_thread_desc,
p_out_thread, p_out_thread,
out_global_desc, out_global_desc,
p_out_global + p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, k_block_data_begin + k_thread_data_begin,
k_block_data_begin + k_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, wo_block_data_begin + wo_thread_data_begin),
wo_block_data_begin + wo_thread_data_begin),
out_thread_desc.GetLengths()); out_thread_desc.GetLengths());
} }
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment