Commit 0aa899aa authored by Jehandad Khan
Browse files

add hipEvent based timing to kernels

parent 44757d6b
......@@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
......@@ -336,6 +336,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -376,6 +377,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -394,9 +396,9 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
......@@ -381,6 +381,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -426,6 +427,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -446,9 +448,9 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -385,7 +385,7 @@ struct DeviceGemmXdlSplitK
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
......@@ -417,6 +417,7 @@ struct DeviceGemmXdlSplitK
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -533,9 +534,9 @@ struct DeviceGemmXdlSplitK
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -391,7 +391,7 @@ struct DeviceGemmXdlSplitKCShuffle
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
......@@ -424,6 +424,7 @@ struct DeviceGemmXdlSplitKCShuffle
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
......@@ -544,9 +545,9 @@ struct DeviceGemmXdlSplitKCShuffle
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
OutDataType,
......@@ -247,6 +247,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
dim3(BlockSize),
0,
stream_id,
measure_time,
arg.a_grid_desc_m_k_,
arg.b_grid_desc_m_,
arg.in_element_op_,
......@@ -258,9 +259,9 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
arg.p_out_indices_dev_);
}
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
}
};
......
......@@ -198,7 +198,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
const auto in_grid_desc_m_k =
DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
......@@ -246,6 +246,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
dim3(BlockSize),
0,
stream_id,
measure_time,
in_grid_desc_m_k,
out_grid_desc_m,
arg.in_elementwise_op_,
......@@ -260,9 +261,9 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
return (avg_time);
};
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
};
};
......
......@@ -175,7 +175,7 @@ struct DeviceReduceBlockWiseSecondCall
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_);
......@@ -223,6 +223,7 @@ struct DeviceReduceBlockWiseSecondCall
dim3(BlockSize),
0,
stream_id,
measure_time,
in_grid_desc_m_k,
out_grid_desc_m,
arg.in_elementwise_op_,
......@@ -237,9 +238,9 @@ struct DeviceReduceBlockWiseSecondCall
return (avg_time);
};
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
};
};
......
......@@ -234,7 +234,7 @@ struct DeviceReduceMultiBlockAtomicAdd
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool = false)
{
const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
......@@ -318,9 +318,9 @@ struct DeviceReduceMultiBlockAtomicAdd
return (avg_time);
};
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
};
};
......
......@@ -259,7 +259,7 @@ struct DeviceReduceMultiBlockPartialReduce
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
......@@ -305,6 +305,7 @@ struct DeviceReduceMultiBlockPartialReduce
dim3(BlockSize),
0,
stream_id,
measure_time,
in_grid_desc_m_k,
ws_desc_m_k,
arg.in_elementwise_op_,
......@@ -318,9 +319,9 @@ struct DeviceReduceMultiBlockPartialReduce
return (avg_time);
};
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
};
};
......
......@@ -198,7 +198,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr)
float Run(const Argument& arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false)
{
const auto in_grid_desc_m_k =
DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
......@@ -246,6 +246,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
dim3(BlockSize),
0,
stream_id,
measure_time,
in_grid_desc_m_k,
out_grid_desc_m,
arg.in_elementwise_op_,
......@@ -259,9 +260,9 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
return (avg_time);
};
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr) override
float Run(const BaseArgument* p_arg, int nrepeat = 1, hipStream_t stream_id = nullptr, bool measure_time = false) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id);
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat, stream_id, measure_time);
};
};
......
......@@ -28,8 +28,8 @@ struct DeviceConvFwdPtr_t
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads); // in,wei and out element ops are ignored for now since even if we change them, they cant be linked
std::unique_ptr<BaseInvoker> MakeInvokerPointer(); // requires including BaseInvoker headers
std::vector<ck::index_t> input_right_pads) const; // in,wei and out element ops are ignored for now since even if we change them, they cant be linked
std::unique_ptr<BaseInvoker> MakeInvokerPointer() const; // requires including BaseInvoker headers
std::string GetTypeString();
bool IsSupportedArgument(const BaseArgument* arg_ptr);
};
......
#ifndef DEVICE_HPP
#define DEVICE_HPP
#include "ck/options.hpp"
#include <memory>
#include <functional>
#include <thread>
......@@ -8,6 +10,39 @@
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
// Validates the status code returned by a HIP runtime call.
// Throws std::runtime_error for any status other than hipSuccess;
// returns normally otherwise.
inline void hip_check(hipError_t x)
{
    const bool call_succeeded = (x == hipSuccess);
    if(!call_succeeded)
    {
        throw std::runtime_error("Failed to run HIP call");
    }
}
template<typename F, F f>
struct managed_deleter
{
template<typename T>
void operator()(T * t)
{
if(t != nullptr)
{
std::ignore = f(t);
}
}
};
// std::unique_ptr whose deleter calls the compile-time function `f` (of type F)
// on the owned T* through managed_deleter.
template <typename T, typename F, F f>
using managed_pointer = std::unique_ptr<T, managed_deleter<F, f>>;

// Owning handle for a HIP event: the pointee type is hipEvent_t with its
// pointer stripped, and the event is released with hipEventDestroy.
using hipEventPtr = managed_pointer<std::remove_pointer<hipEvent_t>::type,
                                    decltype(&hipEventDestroy),
                                    &hipEventDestroy>;
// Creates a HIP event with hipEventCreate and returns it wrapped in a
// hipEventPtr, which destroys the event when it goes out of scope.
// A failing hipEventCreate is reported through hip_check.
inline hipEventPtr make_hip_event()
{
    hipEvent_t raw_event = nullptr;

    const auto create_status = hipEventCreate(&raw_event);
    hip_check(create_status);

    return hipEventPtr{raw_event};
}
struct DeviceMem
{
DeviceMem() = delete;
......@@ -44,9 +79,9 @@ void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte
template <typename... Args, typename F>
float launch_and_time_kernel(
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, hipStream_t stream_id, Args... args)
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, hipStream_t stream_id, bool measure_time, Args... args)
{
#if 1
#if CK_TIME_KERNELS
KernelTimer timer;
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
......@@ -78,9 +113,27 @@ float launch_and_time_kernel(
return timer.GetElapsedTime() / nrepeat;
#else
std::ignore = nrepeat;
hipEventPtr start = nullptr;
hipEventPtr stop = nullptr;
float elapsed_time = 0.0f;
if(measure_time)
{
start = make_hip_event();
stop = make_hip_event();
hip_check(hipEventRecord(start.get(), stream_id));
}
launch_kernel(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
return 0;
if(measure_time)
{
hip_check(hipEventRecord(stop.get(), stream_id));
hip_check(hipEventSynchronize(stop.get()));
hip_check(hipEventElapsedTime(&elapsed_time, start.get(), stop.get()));
}
return elapsed_time;
#endif
}
#endif
......@@ -84,7 +84,7 @@ struct ReferenceBatchedGemm : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
......@@ -114,7 +114,7 @@ struct ReferenceConvWrw : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
......@@ -129,7 +129,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
......@@ -171,7 +171,7 @@ struct ReferenceConvFwd : public device::BaseOperator
}
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
......@@ -117,7 +117,7 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
......@@ -123,7 +123,7 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
......@@ -82,7 +82,7 @@ struct ReferenceGemm : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
......@@ -82,7 +82,7 @@ struct ReferenceGemmBias2D : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int, hipStream_t) override
float Run(const device::BaseArgument* p_arg, int, hipStream_t, bool) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment