Commit 629ae8bc authored by Jianfeng yan
Browse files

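Changed the kernel's per-batch memory offsets from long_index_t to index_t; ComputeBasePtrOfBatch is renamed to ComputeBasePtrOfStridedBatch and its base-pointer getters now return long_index_t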

parent 9009307f
...@@ -44,7 +44,7 @@ using CElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -44,7 +44,7 @@ using CElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
// clang-format off // clang-format off
#if 0 #if 1
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num| //######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch| //######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch|
...@@ -52,7 +52,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl ...@@ -52,7 +52,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// [256, 128, 4, 8], 1 stage, 2 occupancy // [256, 128, 4, 8], 1 stage, 2 occupancy
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>; < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>;
#elif 1 #elif 0
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
//######|AData| BData| CData| AccData| Shuffle| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //######|AData| BData| CData| AccData| Shuffle| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| Type| Type| Type| Type| Data| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| //######| Type| Type| Type| Type| Data| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
......
...@@ -50,12 +50,19 @@ __global__ void ...@@ -50,12 +50,19 @@ __global__ void
__builtin_amdgcn_readfirstlane(get_grid_size() / num_batches); __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( // const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx))); // static_cast<long_index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( // const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx))); // static_cast<long_index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( // const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx))); // static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
const index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
const index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
const index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
...@@ -247,26 +254,26 @@ struct DeviceBatchedGemmXdl ...@@ -247,26 +254,26 @@ struct DeviceBatchedGemmXdl
index_t num_batches_; index_t num_batches_;
}; };
struct ComputeBasePtrOfBatch struct ComputeBasePtrOfStridedBatch
{ {
ComputeBasePtrOfBatch(index_t BatchStrideA, index_t BatchStrideB, index_t BatchStrideC) ComputeBasePtrOfStridedBatch(index_t BatchStrideA, index_t BatchStrideB, index_t BatchStrideC)
: BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC) : BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC)
{ {
} }
__host__ __device__ constexpr index_t GetABasePtr(index_t g_idx) const __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{ {
return g_idx * BatchStrideA_; return g_idx * static_cast<long_index_t>(BatchStrideA_);
} }
__host__ __device__ constexpr index_t GetBBasePtr(index_t g_idx) const __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
{ {
return g_idx * BatchStrideB_; return g_idx * static_cast<long_index_t>(BatchStrideB_);
} }
__host__ __device__ constexpr index_t GetCBasePtr(index_t g_idx) const __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{ {
return g_idx * BatchStrideC_; return g_idx * static_cast<long_index_t>(BatchStrideC_);
} }
private: private:
...@@ -381,7 +388,7 @@ struct DeviceBatchedGemmXdl ...@@ -381,7 +388,7 @@ struct DeviceBatchedGemmXdl
BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
CGridDesc_M_N c_grid_desc_m_n_; CGridDesc_M_N c_grid_desc_m_n_;
CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_; CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
ComputeBasePtrOfBatch compute_base_ptr_of_batch_; ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
Block2CTileMap block_2_ctile_map_; Block2CTileMap block_2_ctile_map_;
index_t M01_; index_t M01_;
index_t N01_; index_t N01_;
...@@ -441,7 +448,7 @@ struct DeviceBatchedGemmXdl ...@@ -441,7 +448,7 @@ struct DeviceBatchedGemmXdl
AElementwiseOperation, AElementwiseOperation,
BElementwiseOperation, BElementwiseOperation,
CElementwiseOperation, CElementwiseOperation,
ComputeBasePtrOfBatch, ComputeBasePtrOfStridedBatch,
remove_reference_t<Block2CTileMap>, remove_reference_t<Block2CTileMap>,
true>; true>;
...@@ -475,7 +482,7 @@ struct DeviceBatchedGemmXdl ...@@ -475,7 +482,7 @@ struct DeviceBatchedGemmXdl
AElementwiseOperation, AElementwiseOperation,
BElementwiseOperation, BElementwiseOperation,
CElementwiseOperation, CElementwiseOperation,
ComputeBasePtrOfBatch, ComputeBasePtrOfStridedBatch,
remove_reference_t<Block2CTileMap>, remove_reference_t<Block2CTileMap>,
false>; false>;
......
...@@ -28,23 +28,24 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = ...@@ -28,23 +28,24 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances =
//#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
//#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
//#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 256, 4, 8, 16, 16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 256, 4, 8, 16, 16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1> // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>,
// DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>
// clang-format on // clang-format on
>; >;
......
...@@ -27,19 +27,19 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple< ...@@ -27,19 +27,19 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
//#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
//#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
//#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>,
DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> // DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>
// clang-format on // clang-format on
>; >;
......
...@@ -22,30 +22,30 @@ include_directories(BEFORE ...@@ -22,30 +22,30 @@ include_directories(BEFORE
# ck_profiler # ck_profiler
set(PROFILER_SOURCE set(PROFILER_SOURCE
src/profiler.cpp src/profiler.cpp
src/profile_gemm.cpp # src/profile_gemm.cpp
src/profile_gemm_bias_2d.cpp # src/profile_gemm_bias_2d.cpp
src/profile_gemm_bias_relu.cpp # src/profile_gemm_bias_relu.cpp
src/profile_gemm_bias_relu_add.cpp # src/profile_gemm_bias_relu_add.cpp
src/profile_batched_gemm.cpp src/profile_batched_gemm.cpp
src/profile_conv_fwd.cpp # src/profile_conv_fwd.cpp
src/profile_conv_fwd_bias_relu.cpp # src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp # src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_fwd_bias_relu_atomic_add.cpp # src/profile_conv_fwd_bias_relu_atomic_add.cpp
src/profile_conv_bwd_data.cpp # src/profile_conv_bwd_data.cpp
src/profile_reduce.cpp # src/profile_reduce.cpp
) )
add_executable(ckProfiler ${PROFILER_SOURCE}) add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE host_tensor) target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance) # target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance) # target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance) # target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance) # target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance) # target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) # target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) # target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance) # target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance) # target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance) # target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...@@ -4,64 +4,69 @@ ...@@ -4,64 +4,69 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
int profile_gemm(int, char*[]); // int profile_gemm(int, char*[]);
int profile_batched_gemm(int, char*[]); int profile_batched_gemm(int, char*[]);
int profile_gemm_bias_2d(int, char*[]); // int profile_gemm_bias_2d(int, char*[]);
int profile_gemm_bias_relu(int, char*[]); // int profile_gemm_bias_relu(int, char*[]);
int profile_gemm_bias_relu_add(int, char*[]); // int profile_gemm_bias_relu_add(int, char*[]);
int profile_conv_fwd(int, char*[]); // int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); // int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); // int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]); // int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
int profile_conv_bwd_data(int, char*[]); // int profile_conv_bwd_data(int, char*[]);
int profile_reduce(int, char*[]); // int profile_reduce(int, char*[]);
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
if(strcmp(argv[1], "gemm") == 0) // if(strcmp(argv[1], "gemm") == 0)
{ // {
return profile_gemm(argc, argv); // return profile_gemm(argc, argv);
} // }
else if(strcmp(argv[1], "gemm_bias_2d") == 0) // else if(strcmp(argv[1], "gemm_bias_2d") == 0)
{ // {
return profile_gemm_bias_2d(argc, argv); // return profile_gemm_bias_2d(argc, argv);
} // }
else if(strcmp(argv[1], "gemm_bias_relu") == 0) // else if(strcmp(argv[1], "gemm_bias_relu") == 0)
{ // {
return profile_gemm_bias_relu(argc, argv); // return profile_gemm_bias_relu(argc, argv);
} // }
else if(strcmp(argv[1], "gemm_bias_relu_add") == 0) // else if(strcmp(argv[1], "gemm_bias_relu_add") == 0)
{ // {
return profile_gemm_bias_relu_add(argc, argv); // return profile_gemm_bias_relu_add(argc, argv);
} // }
else if(strcmp(argv[1], "batched_gemm") == 0) // else if(strcmp(argv[1], "batched_gemm") == 0)
// {
// return profile_batched_gemm(argc, argv);
// }
// else if(strcmp(argv[1], "conv_fwd") == 0)
// {
// return profile_conv_fwd(argc, argv);
// }
// else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
// {
// return profile_conv_fwd_bias_relu(argc, argv);
// }
// else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0)
// {
// return profile_conv_fwd_bias_relu_add(argc, argv);
// }
// else if(strcmp(argv[1], "conv_fwd_bias_relu_atomic_add") == 0)
// {
// return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
// }
// else if(strcmp(argv[1], "conv_bwd") == 0)
// {
// return profile_conv_bwd_data(argc, argv);
// }
// else if(strcmp(argv[1], "reduce") == 0)
// {
// return profile_reduce(argc, argv);
// }
if(strcmp(argv[1], "batched_gemm") == 0)
{ {
return profile_batched_gemm(argc, argv); return profile_batched_gemm(argc, argv);
} }
else if(strcmp(argv[1], "conv_fwd") == 0)
{
return profile_conv_fwd(argc, argv);
}
else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
{
return profile_conv_fwd_bias_relu(argc, argv);
}
else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0)
{
return profile_conv_fwd_bias_relu_add(argc, argv);
}
else if(strcmp(argv[1], "conv_fwd_bias_relu_atomic_add") == 0)
{
return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
}
else if(strcmp(argv[1], "conv_bwd") == 0)
{
return profile_conv_bwd_data(argc, argv);
}
else if(strcmp(argv[1], "reduce") == 0)
{
return profile_reduce(argc, argv);
}
else else
{ {
// clang-format off // clang-format off
......
#!/bin/bash #!/bin/bash
FILE=$1 FILE=$1
echo v0 $( grep -w v0 $FILE | wc -l ) for i in `seq 0 250`
echo v1 $( grep -w v1 $FILE | wc -l ) do
echo v2 $( grep -w v2 $FILE | wc -l ) echo v${i} $( grep -w v${i} $FILE | wc -l )
echo v3 $( grep -w v3 $FILE | wc -l ) done
echo v4 $( grep -w v4 $FILE | wc -l )
echo v5 $( grep -w v5 $FILE | wc -l )
echo v6 $( grep -w v6 $FILE | wc -l )
echo v7 $( grep -w v7 $FILE | wc -l )
echo v8 $( grep -w v8 $FILE | wc -l )
echo v9 $( grep -w v9 $FILE | wc -l )
echo v10 $( grep -w v10 $FILE | wc -l )
echo v11 $( grep -w v11 $FILE | wc -l )
echo v12 $( grep -w v12 $FILE | wc -l )
echo v13 $( grep -w v13 $FILE | wc -l )
echo v14 $( grep -w v14 $FILE | wc -l )
echo v15 $( grep -w v15 $FILE | wc -l )
echo v16 $( grep -w v16 $FILE | wc -l )
echo v17 $( grep -w v17 $FILE | wc -l )
echo v18 $( grep -w v18 $FILE | wc -l )
echo v19 $( grep -w v19 $FILE | wc -l )
echo v20 $( grep -w v20 $FILE | wc -l )
echo v21 $( grep -w v21 $FILE | wc -l )
echo v22 $( grep -w v22 $FILE | wc -l )
echo v23 $( grep -w v23 $FILE | wc -l )
echo v24 $( grep -w v24 $FILE | wc -l )
echo v25 $( grep -w v25 $FILE | wc -l )
echo v26 $( grep -w v26 $FILE | wc -l )
echo v27 $( grep -w v27 $FILE | wc -l )
echo v28 $( grep -w v28 $FILE | wc -l )
echo v29 $( grep -w v29 $FILE | wc -l )
echo v30 $( grep -w v30 $FILE | wc -l )
echo v31 $( grep -w v31 $FILE | wc -l )
echo v32 $( grep -w v32 $FILE | wc -l )
echo v33 $( grep -w v33 $FILE | wc -l )
echo v34 $( grep -w v34 $FILE | wc -l )
echo v35 $( grep -w v35 $FILE | wc -l )
echo v36 $( grep -w v36 $FILE | wc -l )
echo v37 $( grep -w v37 $FILE | wc -l )
echo v38 $( grep -w v38 $FILE | wc -l )
echo v39 $( grep -w v39 $FILE | wc -l )
echo v40 $( grep -w v40 $FILE | wc -l )
echo v41 $( grep -w v41 $FILE | wc -l )
echo v42 $( grep -w v42 $FILE | wc -l )
echo v43 $( grep -w v43 $FILE | wc -l )
echo v44 $( grep -w v44 $FILE | wc -l )
echo v45 $( grep -w v45 $FILE | wc -l )
echo v46 $( grep -w v46 $FILE | wc -l )
echo v47 $( grep -w v47 $FILE | wc -l )
echo v48 $( grep -w v48 $FILE | wc -l )
echo v49 $( grep -w v49 $FILE | wc -l )
echo v50 $( grep -w v50 $FILE | wc -l )
echo v51 $( grep -w v51 $FILE | wc -l )
echo v52 $( grep -w v52 $FILE | wc -l )
echo v53 $( grep -w v53 $FILE | wc -l )
echo v54 $( grep -w v54 $FILE | wc -l )
echo v55 $( grep -w v55 $FILE | wc -l )
echo v56 $( grep -w v56 $FILE | wc -l )
echo v57 $( grep -w v57 $FILE | wc -l )
echo v58 $( grep -w v58 $FILE | wc -l )
echo v59 $( grep -w v59 $FILE | wc -l )
echo v60 $( grep -w v60 $FILE | wc -l )
echo v61 $( grep -w v61 $FILE | wc -l )
echo v62 $( grep -w v62 $FILE | wc -l )
echo v63 $( grep -w v63 $FILE | wc -l )
echo v64 $( grep -w v64 $FILE | wc -l )
echo v65 $( grep -w v65 $FILE | wc -l )
echo v66 $( grep -w v66 $FILE | wc -l )
echo v67 $( grep -w v67 $FILE | wc -l )
echo v68 $( grep -w v68 $FILE | wc -l )
echo v69 $( grep -w v69 $FILE | wc -l )
echo v70 $( grep -w v70 $FILE | wc -l )
echo v71 $( grep -w v71 $FILE | wc -l )
echo v72 $( grep -w v72 $FILE | wc -l )
echo v73 $( grep -w v73 $FILE | wc -l )
echo v74 $( grep -w v74 $FILE | wc -l )
echo v75 $( grep -w v75 $FILE | wc -l )
echo v76 $( grep -w v76 $FILE | wc -l )
echo v77 $( grep -w v77 $FILE | wc -l )
echo v78 $( grep -w v78 $FILE | wc -l )
echo v79 $( grep -w v79 $FILE | wc -l )
echo v80 $( grep -w v80 $FILE | wc -l )
echo v81 $( grep -w v81 $FILE | wc -l )
echo v82 $( grep -w v82 $FILE | wc -l )
echo v83 $( grep -w v83 $FILE | wc -l )
echo v84 $( grep -w v84 $FILE | wc -l )
echo v85 $( grep -w v85 $FILE | wc -l )
echo v86 $( grep -w v86 $FILE | wc -l )
echo v87 $( grep -w v87 $FILE | wc -l )
echo v88 $( grep -w v88 $FILE | wc -l )
echo v89 $( grep -w v89 $FILE | wc -l )
echo v90 $( grep -w v90 $FILE | wc -l )
echo v91 $( grep -w v91 $FILE | wc -l )
echo v92 $( grep -w v92 $FILE | wc -l )
echo v93 $( grep -w v93 $FILE | wc -l )
echo v94 $( grep -w v94 $FILE | wc -l )
echo v95 $( grep -w v95 $FILE | wc -l )
echo v96 $( grep -w v96 $FILE | wc -l )
echo v97 $( grep -w v97 $FILE | wc -l )
echo v98 $( grep -w v98 $FILE | wc -l )
echo v99 $( grep -w v99 $FILE | wc -l )
echo v100 $( grep -w v100 $FILE | wc -l )
echo v101 $( grep -w v101 $FILE | wc -l )
echo v102 $( grep -w v102 $FILE | wc -l )
echo v103 $( grep -w v103 $FILE | wc -l )
echo v104 $( grep -w v104 $FILE | wc -l )
echo v105 $( grep -w v105 $FILE | wc -l )
echo v106 $( grep -w v106 $FILE | wc -l )
echo v107 $( grep -w v107 $FILE | wc -l )
echo v108 $( grep -w v108 $FILE | wc -l )
echo v109 $( grep -w v109 $FILE | wc -l )
echo v110 $( grep -w v110 $FILE | wc -l )
echo v111 $( grep -w v111 $FILE | wc -l )
echo v112 $( grep -w v112 $FILE | wc -l )
echo v113 $( grep -w v113 $FILE | wc -l )
echo v114 $( grep -w v114 $FILE | wc -l )
echo v115 $( grep -w v115 $FILE | wc -l )
echo v116 $( grep -w v116 $FILE | wc -l )
echo v117 $( grep -w v117 $FILE | wc -l )
echo v118 $( grep -w v118 $FILE | wc -l )
echo v119 $( grep -w v119 $FILE | wc -l )
echo v120 $( grep -w v120 $FILE | wc -l )
echo v121 $( grep -w v121 $FILE | wc -l )
echo v122 $( grep -w v122 $FILE | wc -l )
echo v123 $( grep -w v123 $FILE | wc -l )
echo v124 $( grep -w v124 $FILE | wc -l )
echo v125 $( grep -w v125 $FILE | wc -l )
echo v126 $( grep -w v126 $FILE | wc -l )
echo v127 $( grep -w v127 $FILE | wc -l )
echo v128 $( grep -w v128 $FILE | wc -l )
echo v129 $( grep -w v129 $FILE | wc -l )
echo v130 $( grep -w v130 $FILE | wc -l )
echo v131 $( grep -w v131 $FILE | wc -l )
echo v132 $( grep -w v132 $FILE | wc -l )
echo v133 $( grep -w v133 $FILE | wc -l )
echo v134 $( grep -w v134 $FILE | wc -l )
echo v135 $( grep -w v135 $FILE | wc -l )
echo v136 $( grep -w v136 $FILE | wc -l )
echo v137 $( grep -w v137 $FILE | wc -l )
echo v138 $( grep -w v138 $FILE | wc -l )
echo v139 $( grep -w v139 $FILE | wc -l )
echo v140 $( grep -w v140 $FILE | wc -l )
echo v141 $( grep -w v141 $FILE | wc -l )
echo v142 $( grep -w v142 $FILE | wc -l )
echo v143 $( grep -w v143 $FILE | wc -l )
echo v144 $( grep -w v144 $FILE | wc -l )
echo v145 $( grep -w v145 $FILE | wc -l )
echo v146 $( grep -w v146 $FILE | wc -l )
echo v147 $( grep -w v147 $FILE | wc -l )
echo v148 $( grep -w v148 $FILE | wc -l )
echo v149 $( grep -w v149 $FILE | wc -l )
echo v150 $( grep -w v150 $FILE | wc -l )
echo v151 $( grep -w v151 $FILE | wc -l )
echo v152 $( grep -w v152 $FILE | wc -l )
echo v153 $( grep -w v153 $FILE | wc -l )
echo v154 $( grep -w v154 $FILE | wc -l )
echo v155 $( grep -w v155 $FILE | wc -l )
echo v156 $( grep -w v156 $FILE | wc -l )
echo v157 $( grep -w v157 $FILE | wc -l )
echo v158 $( grep -w v158 $FILE | wc -l )
echo v159 $( grep -w v159 $FILE | wc -l )
echo v160 $( grep -w v160 $FILE | wc -l )
echo v161 $( grep -w v161 $FILE | wc -l )
echo v162 $( grep -w v162 $FILE | wc -l )
echo v163 $( grep -w v163 $FILE | wc -l )
echo v164 $( grep -w v164 $FILE | wc -l )
echo v165 $( grep -w v165 $FILE | wc -l )
echo v166 $( grep -w v166 $FILE | wc -l )
echo v167 $( grep -w v167 $FILE | wc -l )
echo v168 $( grep -w v168 $FILE | wc -l )
echo v169 $( grep -w v169 $FILE | wc -l )
echo v170 $( grep -w v170 $FILE | wc -l )
echo v171 $( grep -w v171 $FILE | wc -l )
echo v172 $( grep -w v172 $FILE | wc -l )
echo v173 $( grep -w v173 $FILE | wc -l )
echo v174 $( grep -w v174 $FILE | wc -l )
echo v175 $( grep -w v175 $FILE | wc -l )
echo v176 $( grep -w v176 $FILE | wc -l )
echo v177 $( grep -w v177 $FILE | wc -l )
echo v178 $( grep -w v178 $FILE | wc -l )
echo v179 $( grep -w v179 $FILE | wc -l )
echo v180 $( grep -w v180 $FILE | wc -l )
echo v181 $( grep -w v181 $FILE | wc -l )
echo v182 $( grep -w v182 $FILE | wc -l )
echo v183 $( grep -w v183 $FILE | wc -l )
echo v184 $( grep -w v184 $FILE | wc -l )
echo v185 $( grep -w v185 $FILE | wc -l )
echo v186 $( grep -w v186 $FILE | wc -l )
echo v187 $( grep -w v187 $FILE | wc -l )
echo v188 $( grep -w v188 $FILE | wc -l )
echo v189 $( grep -w v189 $FILE | wc -l )
echo v190 $( grep -w v190 $FILE | wc -l )
echo v191 $( grep -w v191 $FILE | wc -l )
echo v192 $( grep -w v192 $FILE | wc -l )
echo v193 $( grep -w v193 $FILE | wc -l )
echo v194 $( grep -w v194 $FILE | wc -l )
echo v195 $( grep -w v195 $FILE | wc -l )
echo v196 $( grep -w v196 $FILE | wc -l )
echo v197 $( grep -w v197 $FILE | wc -l )
echo v198 $( grep -w v198 $FILE | wc -l )
echo v199 $( grep -w v199 $FILE | wc -l )
echo v200 $( grep -w v200 $FILE | wc -l )
echo v201 $( grep -w v201 $FILE | wc -l )
echo v202 $( grep -w v202 $FILE | wc -l )
echo v203 $( grep -w v203 $FILE | wc -l )
echo v204 $( grep -w v204 $FILE | wc -l )
echo v205 $( grep -w v205 $FILE | wc -l )
echo v206 $( grep -w v206 $FILE | wc -l )
echo v207 $( grep -w v207 $FILE | wc -l )
echo v208 $( grep -w v208 $FILE | wc -l )
echo v209 $( grep -w v209 $FILE | wc -l )
echo v210 $( grep -w v210 $FILE | wc -l )
echo v211 $( grep -w v211 $FILE | wc -l )
echo v212 $( grep -w v212 $FILE | wc -l )
echo v213 $( grep -w v213 $FILE | wc -l )
echo v214 $( grep -w v214 $FILE | wc -l )
echo v215 $( grep -w v215 $FILE | wc -l )
echo v216 $( grep -w v216 $FILE | wc -l )
echo v217 $( grep -w v217 $FILE | wc -l )
echo v218 $( grep -w v218 $FILE | wc -l )
echo v219 $( grep -w v219 $FILE | wc -l )
echo v220 $( grep -w v220 $FILE | wc -l )
echo v221 $( grep -w v221 $FILE | wc -l )
echo v222 $( grep -w v222 $FILE | wc -l )
echo v223 $( grep -w v223 $FILE | wc -l )
echo v224 $( grep -w v224 $FILE | wc -l )
echo v225 $( grep -w v225 $FILE | wc -l )
echo v226 $( grep -w v226 $FILE | wc -l )
echo v227 $( grep -w v227 $FILE | wc -l )
echo v228 $( grep -w v228 $FILE | wc -l )
echo v229 $( grep -w v229 $FILE | wc -l )
echo v230 $( grep -w v230 $FILE | wc -l )
echo v231 $( grep -w v231 $FILE | wc -l )
echo v232 $( grep -w v232 $FILE | wc -l )
echo v233 $( grep -w v233 $FILE | wc -l )
echo v234 $( grep -w v234 $FILE | wc -l )
echo v235 $( grep -w v235 $FILE | wc -l )
echo v236 $( grep -w v236 $FILE | wc -l )
echo v237 $( grep -w v237 $FILE | wc -l )
echo v238 $( grep -w v238 $FILE | wc -l )
echo v239 $( grep -w v239 $FILE | wc -l )
echo v240 $( grep -w v240 $FILE | wc -l )
echo v241 $( grep -w v241 $FILE | wc -l )
echo v242 $( grep -w v242 $FILE | wc -l )
echo v243 $( grep -w v243 $FILE | wc -l )
echo v244 $( grep -w v244 $FILE | wc -l )
echo v245 $( grep -w v245 $FILE | wc -l )
echo v246 $( grep -w v246 $FILE | wc -l )
echo v247 $( grep -w v247 $FILE | wc -l )
echo v248 $( grep -w v248 $FILE | wc -l )
echo v249 $( grep -w v249 $FILE | wc -l )
echo v250 $( grep -w v250 $FILE | wc -l )
echo v251 $( grep -w v251 $FILE | wc -l )
echo v252 $( grep -w v252 $FILE | wc -l )
echo v253 $( grep -w v253 $FILE | wc -l )
echo v254 $( grep -w v254 $FILE | wc -l )
echo v255 $( grep -w v255 $FILE | wc -l )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment