Commit c47159b0 authored by aska-0096

double __restrict__ lds pointer w/a success

parent a6ba3d2a
@@ -50,7 +50,7 @@ struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
- bool time_kernel = false;
+ int time_kernel = 0;
};
template <ck::index_t... Is>
......
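The `time_kernel` knob is widened from `bool` to `int` so it selects a timing mode instead of merely toggling timing. Based on the printing code later in this diff, the encoding appears to be 0 = off, 1 = report the mean, 2 = report the median; a small sketch of that reading (the enum name is hypothetical, not part of the commit):

```cpp
// Hypothetical names for the modes implied by this commit's printing code.
enum TimeKernelMode : int
{
    Off    = 0, // do not time the kernel
    Mean   = 1, // report the mean of the timed launches
    Median = 2, // report the median of the timed launches
};

ExecutionConfig config;
config.time_kernel = TimeKernelMode::Median;
```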
@@ -162,7 +162,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
return true;
}
#endif
- ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+ ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 300, 3000});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
@@ -178,7 +178,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
float gb_per_sec = num_btype / 1.E6 / ave_time;
- std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-           << " GB/s, " << gemm.GetTypeString() << std::endl;
+ if(config.time_kernel == 1)
+     std::cout << "Perf mode Mean: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+               << " GB/s, " << gemm.GetTypeString() << std::endl;
+ else if(config.time_kernel == 2)
+     std::cout << "Perf mode Median: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+               << " GB/s, " << gemm.GetTypeString() << std::endl;
});
......
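The extra positional arguments to `StreamConfig` line up with the fields shown later in this commit (`stream_id_`, `time_kernel_`, `log_level_`, `cold_niters_`, `nrepeat_`). Annotated under that assumption, the call reads roughly:

```cpp
// A sketch of the aggregate initializer above, field by field; the mapping
// assumes the StreamConfig layout shown further down in this diff.
ave_time = invoker.Run(argument,
                       StreamConfig{nullptr,            // stream_id_: default HIP stream
                                    config.time_kernel, // time_kernel_: 0 off, 1 mean, 2 median
                                    0,                  // log_level_: quiet
                                    300,                // cold_niters_: warm-up launches
                                    3000});             // nrepeat_: timed launches
```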
@@ -30,7 +30,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
block_dim.y,
block_dim.z);
printf("Warm up 1 time\n");
printf("Warm up %d time\n", stream_config.cold_niters_);
#endif
// warm up
for(int i = 0; i < stream_config.cold_niters_; ++i)
@@ -48,23 +48,39 @@
hip_check_error(hipEventCreate(&start));
hip_check_error(hipEventCreate(&stop));
hip_check_error(hipDeviceSynchronize());
- hip_check_error(hipEventRecord(start, stream_config.stream_id_));
+ std::vector<float> execution_time_series;
for(int i = 0; i < nrepeat; ++i)
{
+ float execution_time = 0;
+ hip_check_error(hipDeviceSynchronize());
+ hip_check_error(hipEventRecord(start, stream_config.stream_id_));
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
+ hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+ hip_check_error(hipEventSynchronize(stop));
+ hip_check_error(hipEventElapsedTime(&execution_time, start, stop));
+ execution_time_series.push_back(execution_time);
}
- hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
- hip_check_error(hipEventSynchronize(stop));
+ float mean_execution_time = 0;
+ float median_execution_time = 0;
- float total_time = 0;
+ #if DEBUG_LOG
+ for(int i = 0; i < nrepeat; i++)
+ {
+     std::cout << i << " th launch, execution time = " << execution_time_series[i] << " ms" << std::endl;
+ }
+ #endif
- hip_check_error(hipEventElapsedTime(&total_time, start, stop));
+ std::sort(execution_time_series.begin(), execution_time_series.end());
- return total_time / nrepeat;
+ mean_execution_time = std::reduce(execution_time_series.begin(), execution_time_series.end(), .0) / static_cast<float>(nrepeat);
+ median_execution_time = execution_time_series[execution_time_series.size() / 2];
+ if(stream_config.time_kernel_ == 1)
+     return mean_execution_time;
+ else
+     return median_execution_time;
}
else
{
......
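The new path records a `hipEvent_t` pair around every launch instead of a single pair around the whole loop, so the per-launch samples can be sorted and summarized; note the committed code relies on `<vector>`, `<algorithm>` (for `std::sort`), and `<numeric>` (for `std::reduce`) being in scope. A self-contained sketch of the same technique, with a placeholder kernel and sizes that are not from the commit:

```cpp
#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Placeholder kernel; stands in for the GEMM launch being timed.
__global__ void dummy_kernel(float* p) { p[threadIdx.x] += 1.f; }

int main()
{
    float* d_p = nullptr;
    (void)hipMalloc(reinterpret_cast<void**>(&d_p), 256 * sizeof(float));

    hipEvent_t start, stop;
    (void)hipEventCreate(&start);
    (void)hipEventCreate(&stop);

    const int nrepeat = 100;
    std::vector<float> times;
    times.reserve(nrepeat);

    for(int i = 0; i < nrepeat; ++i)
    {
        float ms = 0;
        (void)hipDeviceSynchronize();                // quiesce before sampling
        (void)hipEventRecord(start, nullptr);
        dummy_kernel<<<1, 256>>>(d_p);
        (void)hipEventRecord(stop, nullptr);
        (void)hipEventSynchronize(stop);             // wait for this launch only
        (void)hipEventElapsedTime(&ms, start, stop); // one sample per launch
        times.push_back(ms);
    }

    std::sort(times.begin(), times.end());
    const float mean   = static_cast<float>(std::accumulate(times.begin(), times.end(), 0.0) / nrepeat);
    const float median = times[times.size() / 2];    // upper middle for even counts

    std::printf("mean %.6f ms, median %.6f ms\n", mean, median);

    (void)hipEventDestroy(start);
    (void)hipEventDestroy(stop);
    (void)hipFree(d_p);
    return 0;
}
```

With an even sample count, the committed median takes the upper-middle element rather than averaging the two middle samples; for a benchmark summary that bias is usually negligible.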
@@ -9,7 +9,7 @@
struct StreamConfig
{
hipStream_t stream_id_ = nullptr;
- bool time_kernel_ = false;
+ int time_kernel_ = 0;
int log_level_ = 0;
int cold_niters_ = 50;
int nrepeat_ = 200;
......
@@ -123,7 +123,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1
template <typename DstBuffer, index_t ThreadScratchId = 0>
__device__ void RunWrite(const DstDesc& dst_desc,
- DstBuffer& __restrict__ dst_buf,
+ DstBuffer& dst_buf,
Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
......
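Dropping `__restrict__` here (and on the buffers below) is deliberate: the qualifier promises the compiler that the object is reached through no other pointer, and the ping/pong LDS views introduced by this commit break that promise. A minimal illustration of the hazard, not taken from the commit:

```cpp
// With __restrict__, the compiler may assume the store through b cannot touch
// *a, and fold the second read of *a into x. If a == b at the call site, the
// function returns a stale value; dropping the qualifier removes the promise.
__device__ int stale_read(int* __restrict__ a, int* __restrict__ b)
{
    int x = *a;    // first read
    *b = 42;       // assumed (via restrict) not to modify *a
    return *a + x; // second read may legally be optimized to x + x
}
```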
@@ -30,6 +30,7 @@ __global__ void
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+ __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainKBlockLoop>(
karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg);
#else
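Allocating two identically sized LDS regions and passing both to `Run` enables ping-pong (double-buffered) pipelining: while the math consumes one buffer, the next K tile is prefetched into the other. A stripped-down sketch of the pattern, with hypothetical names and sizes, assuming a 256-thread block:

```cpp
// Hypothetical ping-pong LDS pipeline; launch with 256 threads per block.
__global__ void pipelined_sum(const float* p_in, float* p_out, int num_tiles)
{
    __shared__ float lds_ping[256];
    __shared__ float lds_pong[256];
    float* bufs[2] = {lds_ping, lds_pong};

    bufs[0][threadIdx.x] = p_in[threadIdx.x]; // preload tile 0 into "ping"
    __syncthreads();

    float acc = 0.f;
    for(int tile = 0; tile < num_tiles; ++tile)
    {
        // Prefetch the next tile into the buffer nobody is reading this round.
        if(tile + 1 < num_tiles)
            bufs[(tile + 1) & 1][threadIdx.x] = p_in[(tile + 1) * 256 + threadIdx.x];

        // Consume the current buffer; the prefetch targets the other array,
        // so no barrier is needed between the store above and this read.
        acc += bufs[tile & 1][threadIdx.x];

        __syncthreads(); // the prefetched tile must land before the next round
    }
    p_out[threadIdx.x] = acc;
}
```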
@@ -46,9 +47,9 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
- kernel_gemm_xdl_cshuffle_v1(const FloatA* __restrict__ p_a_grid,
-                             const FloatB* __restrict__ p_b_grid,
-                             FloatC* __restrict__ p_c_grid,
+ kernel_gemm_xdl_cshuffle_v1(const FloatA* p_a_grid,
+                             const FloatB* p_b_grid,
+                             FloatC* p_c_grid,
typename GridwiseGemm::Problem problem)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
@@ -706,11 +707,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
using Block2CTileMap = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
template <bool HasMainKBlockLoop>
- __device__ static void Run(const FloatA* __restrict__ p_a_grid,
-                            const FloatB* __restrict__ p_b_grid,
-                            FloatC* __restrict__ p_c_grid,
-                            void* __restrict__ p_shared_0,
-                            void* __restrict__ p_shared_1,
+ __device__ static void Run(const FloatA* p_a_grid,
+                            const FloatB* p_b_grid,
+                            FloatC* p_c_grid,
+                            void* p_shared_0,
+                            void* p_shared_1,
const Problem& problem)
{
const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
@@ -872,17 +873,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
- static_cast<ComputeTypeA* __restrict__>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+ static_cast<ComputeTypeA*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
- static_cast<ComputeTypeB* __restrict__>(p_shared_0) + a_block_space_size_aligned,
+ static_cast<ComputeTypeB*>(p_shared_0) + a_block_space_size_aligned,
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
- static_cast<ComputeTypeA* __restrict__>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+ static_cast<ComputeTypeA*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
- static_cast<ComputeTypeB* __restrict__>(p_shared_1) + a_block_space_size_aligned,
+ static_cast<ComputeTypeB*>(p_shared_1) + a_block_space_size_aligned,
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
......
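This hunk is the heart of the workaround named in the commit title. The A and B block buffers are suballocated from the same LDS block (`p_shared_0` or `p_shared_1`) at different offsets, so casting both views through `T* __restrict__` asserts non-aliasing for pointers derived from one allocation; the commit works around the resulting miscompile by dropping the qualifier. The suballocation shape, reduced to a sketch with placeholder sizes:

```cpp
// Hypothetical illustration of the suballocation above: one LDS allocation,
// two typed views at different byte offsets. Because both are derived from
// p_shared, neither view should carry a __restrict__ promise.
__global__ void suballocate_lds(float* p_out)
{
    __shared__ char p_shared[4096];

    float* a_tile = reinterpret_cast<float*>(p_shared);        // bytes [0, 2048)
    float* b_tile = reinterpret_cast<float*>(p_shared + 2048); // bytes [2048, 4096)

    a_tile[threadIdx.x] = 0.f;
    b_tile[threadIdx.x] = 1.f;
    __syncthreads();
    p_out[threadIdx.x] = a_tile[threadIdx.x] + b_tile[threadIdx.x];
}
```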
@@ -1050,7 +1050,7 @@ struct ThreadwiseTensorSliceTransfer_v4
typename DstBuffer>
__device__ void Run(const SrcDesc&,
const SrcRefToOriginDisplacement&,
- const SrcBuffer& __restrict__ src_buf,
+ const SrcBuffer& src_buf,
const DstDesc&,
const DstOriginIdx&,
DstBuffer& dst_buf) const
......
@@ -382,7 +382,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
template <typename DstBuffer, index_t ThreadScratchId = 0>
__device__ void RunWrite(const DstDesc& dst_desc,
- DstBuffer& __restrict__ dst_buf,
+ DstBuffer& dst_buf,
Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
{
// if there is transpose, it's done here
......
@@ -25,16 +25,16 @@ struct DynamicBuffer
{
using type = T;
- T* __restrict__ p_data_;
+ T* p_data_;
ElementSpaceSize element_space_size_;
T invalid_element_value_ = T{0};
- __host__ __device__ constexpr DynamicBuffer(T* __restrict__ p_data, ElementSpaceSize element_space_size)
+ __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
{
}
- __host__ __device__ constexpr DynamicBuffer(T* __restrict__ p_data,
+ __host__ __device__ constexpr DynamicBuffer(T* p_data,
ElementSpaceSize element_space_size,
T invalid_element_value)
: p_data_{p_data},
@@ -410,7 +410,7 @@ template <AddressSpaceEnum BufferAddressSpace,
AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
typename T,
typename ElementSpaceSize>
- __host__ __device__ constexpr auto make_dynamic_buffer(T* __restrict__ p, ElementSpaceSize element_space_size)
+ __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true, coherence>{
p, element_space_size};
@@ -424,7 +424,7 @@ template <
typename X,
typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
__host__ __device__ constexpr auto
- make_dynamic_buffer(T* __restrict__ p, ElementSpaceSize element_space_size, X invalid_element_value)
+ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false, coherence>{
p, element_space_size, invalid_element_value};
......
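With the qualifier removed from `DynamicBuffer` and `make_dynamic_buffer`, the factory accepts any `T*` without an aliasing promise, so two views over one allocation become legal. A hypothetical device-side sketch mirroring the GEMM code above (the header path and element type are assumptions, not from the commit):

```cpp
#include "ck/utility/dynamic_buffer.hpp" // assumed CK header for make_dynamic_buffer

// Two LDS views over one allocation, now constructible without __restrict__.
__device__ void make_lds_views(void* p_shared, ck::index_t a_size, ck::index_t b_size)
{
    auto a_buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Lds>(
        static_cast<float*>(p_shared), a_size);
    auto b_buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Lds>(
        static_cast<float*>(p_shared) + a_size, b_size);
    (void)a_buf; // views would be handed to the block-wise copy/GEMM stages
    (void)b_buf;
}
```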