Commit 0aa899aa authored by Jehandad Khan's avatar Jehandad Khan
Browse files

add hipEvent based timing to kernels

parent 44757d6b
......@@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
option(CK_TIME_KERNELS "Time every kernel and log parameters" OFF)
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
......@@ -227,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp")
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
......
cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++14)
# add_link_options(--offload-arch=gfx908)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED True)
#set(CMAKE_CXX_STANDARD 14)
#set(CMAKE_CXX_STANDARD_REQUIRED True)
find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor)
find_package(hip REQUIRED PATHS /opt/rocm)
......
......@@ -28,6 +28,36 @@ enum ConvOutputLayout
NHWK, // 1
};
// Code to check CUDA errors
void check_cuda_error(void)
{
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
std::cerr
<< "Error: "
<< hipGetErrorString(err)
<< std::endl;
exit(err);
}
}
std::string getDeviceName(int device)
{
struct hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, device);
check_cuda_error();
return std::string(prop.name);
}
int getDriver(void)
{
int driver;
hipDriverGetVersion(&driver);
check_cuda_error();
return driver;
}
namespace ck {
namespace app {
......@@ -127,6 +157,14 @@ void profile_conv_fwd_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int deviceIndex = 0;
hipSetDevice(deviceIndex);
check_cuda_error();
hipStream_t stream_id = nullptr;
hipStreamCreate(&stream_id);
check_cuda_error();
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
......@@ -151,8 +189,7 @@ void profile_conv_fwd_impl(int do_verification,
if(conv_ptr.IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr.GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat, nullptr);
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat, stream_id, true);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
......
......@@ -22,6 +22,7 @@ function(add_example_executable EXAMPLE_NAME)
message("adding example ${EXAMPLE_NAME}")
add_executable(${EXAMPLE_NAME} ${ARGN})
target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
set_target_properties(${EXAMPLE_NAME} PROPERTIES EXCLUDE_FROM_ALL 1)
add_dependencies(examples ${EXAMPLE_NAME})
endfunction(add_example_executable EXAMPLE_NAME)
......
#pragma once
// "_PACKAGE_" is part of these names to avoid name collisions: macros like
// HIP_VERSION_MAJOR are already defined in HIP_VERSION.h.
// clang-format off
#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
// clang-format on
// Any version component that is not defined at this point defaults to 0.
#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
#define CK_HIP_PACKAGE_VERSION_MAJOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_MINOR
#define CK_HIP_PACKAGE_VERSION_MINOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_PATCH
#define CK_HIP_PACKAGE_VERSION_PATCH 0
#endif
// 3 decimal digits for major and minor, 6 digits for patch number.
// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
// Each component is range-checked separately: an out-of-range MINOR would
// otherwise spill into the MAJOR digits of the flattened value below.
#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MINOR > 999 || \
    CK_HIP_PACKAGE_VERSION_PATCH > 999999
#error "Too big HIP version number(s)"
#endif
// Flattened version: (MAJOR * 1000 + MINOR) * 1000000 + PATCH, evaluated in
// unsigned 64-bit arithmetic (the 1000ULL literal promotes the expression).
#define CK_HIP_PACKAGE_VERSION_FLAT \
    ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
     CK_HIP_PACKAGE_VERSION_PATCH)
#pragma once
#cmakedefine01 CK_TIME_KERNELS
\ No newline at end of file
......@@ -22,7 +22,7 @@ struct BaseInvoker
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
virtual float Run(const BaseArgument*, int = 1, hipStream_t = nullptr){return -1;}
virtual float Run(const BaseArgument*, int = 1, hipStream_t = nullptr, bool = false){return -1;}
virtual ~BaseInvoker() {}
};
......
......@@ -274,7 +274,7 @@ struct DeviceBatchedGemmXdl
{
using Argument = DeviceBatchedGemmXdl::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "arg.a_grid_desc_g_k0_m_k1_{"
......@@ -336,6 +336,7 @@ struct DeviceBatchedGemmXdl
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -369,6 +370,7 @@ struct DeviceBatchedGemmXdl
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -385,9 +387,9 @@ struct DeviceBatchedGemmXdl
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -414,7 +414,7 @@ struct DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
ShowInfo(arg);
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
......@@ -445,6 +445,7 @@ struct DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -561,9 +562,9 @@ struct DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -521,7 +521,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
nrepeat = 1;
float ave_time = 0;
......@@ -600,6 +600,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -634,6 +635,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -649,9 +651,9 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -642,7 +642,7 @@ struct
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
#if 0
{
......@@ -734,6 +734,7 @@ struct
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -779,6 +780,7 @@ struct
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -798,9 +800,9 @@ struct
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -607,7 +607,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
#if 0
{
......@@ -693,6 +693,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -733,6 +734,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -750,9 +752,9 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
#if 0
{
......@@ -670,6 +670,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -705,6 +706,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -720,9 +722,9 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
#if 0
{
......@@ -506,6 +506,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -538,6 +539,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -553,9 +555,9 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -98,7 +98,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
const auto naive_conv3d_fwd =
ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
......@@ -115,6 +115,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
dim3(256),
0,
stream_id,
measure_time,
arg.p_in_,
arg.p_wei_,
arg.p_out_,
......@@ -144,9 +145,9 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -430,7 +430,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl;
......@@ -485,6 +485,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -521,6 +522,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -541,9 +543,9 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -591,7 +591,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
......@@ -645,6 +645,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -677,6 +678,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -692,9 +694,9 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -290,7 +290,7 @@ struct DeviceGemmXdl
{
using Argument = DeviceGemmXdl::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
......@@ -344,6 +344,7 @@ struct DeviceGemmXdl
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -376,6 +377,7 @@ struct DeviceGemmXdl
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -392,9 +394,9 @@ struct DeviceGemmXdl
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -249,7 +249,7 @@ struct DeviceGemmXdl_C_Shuffle
{
using Argument = DeviceGemmXdl_C_Shuffle::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
......@@ -306,6 +306,7 @@ struct DeviceGemmXdl_C_Shuffle
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -341,6 +342,7 @@ struct DeviceGemmXdl_C_Shuffle
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -357,9 +359,9 @@ struct DeviceGemmXdl_C_Shuffle
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -268,7 +268,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
{
using Argument = DeviceGemmXdl_C_Shuffle_Bias_2d::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
......@@ -331,6 +331,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -371,6 +372,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -389,9 +391,9 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment