Commit b2888adf authored by Chao Liu's avatar Chao Liu
Browse files

change file extension to hip.hpp and hip.cpp

parent a414e3fd
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_1.cuh"
#include "gridwise_direct_convolution_1.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_1(InDesc,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2.cuh"
#include "gridwise_direct_convolution_2.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2(InDesc,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh"
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
......
......@@ -4,17 +4,17 @@
#include <cstdlib>
#include "config.h"
#include "tensor.hpp"
#include "ConstantTensorDescriptor.cuh"
#include "conv_common.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
#include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh"
//#include "device_winograd_convolution.cuh"
#include "ConstantTensorDescriptor.hip.hpp"
#include "conv_common.hip.hpp"
#include "device_direct_convolution_1.hpp"
#include "device_direct_convolution_2.hpp"
#include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp"
#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.hpp"
#include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp"
//#include "device_winograd_convolution.hip.hpp"
struct GeneratorTensor_1
{
......
#pragma once
#include "common.cuh"
#include "common.hip.hpp"
template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
struct ConstantMatrixDescriptor
......
#pragma once
#include "common.cuh"
#include "common.hip.hpp"
// this is ugly, only for 2d
template <unsigned L0, unsigned L1>
......
#pragma once
#include "ConstantTensorDescriptor.cuh"
#include "ConstantTensorDescriptor.hip.hpp"
template <unsigned BlockSize, class Float, class DstDesc, class F>
__device__ void
......
#pragma once
#include "ConstantTensorDescriptor.cuh"
#include "ConstantTensorDescriptor.hip.hpp"
template <unsigned BlockSize, class Float, class DstDesc, class F>
__device__ void
......@@ -245,11 +245,10 @@ struct BlockwiseChwnTensorCopyPadded
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
const Float* p_src_tmp =
p_src +
src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin);
p_src + src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin);
#if 0
if(get_thread_local_1d_id() == 0)
......
#pragma once
#include "ConstantTensorDescriptor.cuh"
#include "threadwise_4d_tensor_op.cuh"
#include "threadwise_direct_convolution.cuh"
#include "ConstantTensorDescriptor.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.hip.hpp"
template <unsigned BlockSize,
class Float,
......@@ -95,11 +95,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
Float p_out_thread[out_thread_desc.GetElementSpace()];
threadwise_4d_tensor_copy(out_block_desc,
p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
out_thread_desc,
p_out_thread,
out_thread_desc.GetLengths());
......@@ -110,11 +109,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
// threadwise convolution
threadwise_direct_convolution_2(
in_thread_block_desc,
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data_begin,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data_begin,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block +
wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
......@@ -126,11 +124,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
threadwise_4d_tensor_copy(out_thread_desc,
p_out_thread,
out_block_desc,
p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
out_thread_desc.GetLengths());
}
}
#pragma once
#include "threadwise_gemm.cuh"
#include "threadwise_gemm.hip.hpp"
template <unsigned BlockSize,
class BlockMatrixA,
......@@ -305,9 +305,8 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned NClusterWork =
(NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);
static_assert(BlockSize ==
(MClusterWork * MThreadPerCluster) *
(NClusterWork * NThreadPerCluster),
static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
(NClusterWork * NThreadPerCluster),
"wrong! wrong BlockSize");
if(DistributeThreadAlongColumnFirst)
......@@ -907,9 +906,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
c_thread_sub_mtx,
False,
p_c_thread +
c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
n_repeat * NPerThreadSubC),
p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
n_repeat * NPerThreadSubC),
f_accum);
}
}
......
#pragma once
#include "ConstantTensorDescriptor.cuh"
#include "ConstantTensorDescriptor.hip.hpp"
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
......
......@@ -42,8 +42,8 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
hipGetErrorString(hipGetLastError());
#elif DEVICE_BACKEND_CUDA
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
timer.Start();
......
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.hip.hpp"
template <class Float,
class InGlobalDesc,
......@@ -147,11 +147,10 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
c_block_work_begin += CPerBlock)
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
p_in_block);
// copy weight tensor to LDS
......@@ -178,9 +177,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
}
// copy output tensor from LDS to device mem
blockwise_out_copy.Run(
p_out_block,
p_out_global +
out_global_desc.Get1dIndex(
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
blockwise_out_copy.Run(p_out_block,
p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
ho_block_work_begin,
wo_block_work_begin));
}
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
#include "threadwise_4d_tensor_op.cuh"
#include "threadwise_direct_convolution.cuh"
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.hip.hpp"
template <class Float,
class InGlobalDesc,
......@@ -163,11 +163,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_block);
// copy weight tensor to LDS
......@@ -183,11 +182,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#if 1
threadwise_direct_convolution_2(
in_thread_block_desc,
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
......@@ -195,11 +193,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#elif 0
threadwise_direct_convolution_3(
in_thread_block_desc,
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
......@@ -213,10 +210,9 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
out_thread_desc,
p_out_thread,
out_global_desc,
p_out_global +
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_thread_desc.GetLengths());
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment