double __restrict__ lds pointer w/a success

double restrict lds pointer w/a success
c47159b0 · aska-0096 · a6ba3d2a · c47159b0 · c47159b0 · c47159b0
Commit c47159b0 authored Dec 01, 2023 by aska-0096
9 changed files
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -50,7 +50,7 @@ struct ExecutionConfig final
 {
    bool do_verification = true;
    int init_method      = 1;
-    bool time_kernel     = false;
+    int time_kernel     = 0;
 };

 template <ck::index_t... Is>

--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -162,7 +162,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
            return true;
        }
 #endif
-        ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 300, 3000});

        std::size_t flop = 2_uz * M * N * K;
        std::size_t num_btype =
@@ -178,7 +178,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)

        float gb_per_sec = num_btype / 1.E6 / ave_time;

-        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+        if(config.time_kernel==1)
+        std::cout << "Perf mode Mean: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+        else if(config.time_kernel==2)
+        std::cout << "Perf mode Median: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                  << " GB/s, " << gemm.GetTypeString() << std::endl;
    });


--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -30,7 +30,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
               block_dim.y,
               block_dim.z);

-        printf("Warm up 1 time\n");
+        printf("Warm up %d time\n", stream_config.cold_niters_);
 #endif
        // warm up
        for(int i = 0; i < stream_config.cold_niters_; ++i)
@@ -48,23 +48,39 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        hip_check_error(hipEventCreate(&start));
        hip_check_error(hipEventCreate(&stop));

-        hip_check_error(hipDeviceSynchronize());
-        hip_check_error(hipEventRecord(start, stream_config.stream_id_));
+        std::vector<float> execution_time_series;

        for(int i = 0; i < nrepeat; ++i)
        {
+            float execution_time = 0;
+            hip_check_error(hipDeviceSynchronize());
+            hip_check_error(hipEventRecord(start, stream_config.stream_id_));
            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
            hip_check_error(hipGetLastError());
+            hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+            hip_check_error(hipEventSynchronize(stop));
+            hip_check_error(hipEventElapsedTime(&execution_time, start, stop));
+            execution_time_series.push_back(execution_time);
        }

-        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
-        hip_check_error(hipEventSynchronize(stop));
+        float mean_execution_time = 0;
+        float median_execution_time = 0;

-        float total_time = 0;
+#if DEBUG_LOG
+        for(int i =0; i<nrepeat; i++){
+           std::cout<<i<<" th launch, execution time = "<<execution_time_series[i]<<" ms"<<std::endl;
+        }
+#endif

-        hip_check_error(hipEventElapsedTime(&total_time, start, stop));
+        std::sort(execution_time_series.begin(),execution_time_series.end());

-        return total_time / nrepeat;
+        mean_execution_time = std::reduce(execution_time_series.begin(), execution_time_series.end(), .0)/static_cast<float>(nrepeat);
+        median_execution_time = execution_time_series[execution_time_series.size()/2];
+
+        if(stream_config.time_kernel_==1)
+        return mean_execution_time;
+        else
+        return median_execution_time;
    }
    else
    {

--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
@@ -9,7 +9,7 @@
 struct StreamConfig
 {
    hipStream_t stream_id_ = nullptr;
-    bool time_kernel_      = false;
+    int time_kernel_      = 0;
    int log_level_         = 0;
    int cold_niters_       = 50;
    int nrepeat_           = 200;

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -123,7 +123,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1

    template <typename DstBuffer, index_t ThreadScratchId = 0>
    __device__ void RunWrite(const DstDesc& dst_desc,
-                             DstBuffer& __restrict__ dst_buf,
+                             DstBuffer& dst_buf,
                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
    {
        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1_doublelds.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1_doublelds.hpp
@@ -30,6 +30,7 @@ __global__ void
    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];

+
    GridwiseGemm::template Run<HasMainKBlockLoop>(
        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg);
 #else
@@ -46,9 +47,9 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdl_cshuffle_v1(const FloatA* __restrict__ p_a_grid,
-                                    const FloatB* __restrict__ p_b_grid,
-                                    FloatC* __restrict__ p_c_grid,
+        kernel_gemm_xdl_cshuffle_v1(const FloatA* p_a_grid,
+                                    const FloatB* p_b_grid,
+                                    FloatC* p_c_grid,
                                    typename GridwiseGemm::Problem problem)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
@@ -706,11 +707,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
    using Block2CTileMap = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;

    template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatA* __restrict__ p_a_grid,
-                               const FloatB* __restrict__ p_b_grid,
-                               FloatC* __restrict__ p_c_grid,
-                               void* __restrict__ p_shared_0,
-                               void* __restrict__ p_shared_1,
+    __device__ static void Run(const FloatA* p_a_grid,
+                               const FloatB* p_b_grid,
+                               FloatC* p_c_grid,
+                               void* p_shared_0,
+                               void* p_shared_1,
                               const Problem& problem)
    {
        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
@@ -872,17 +873,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

        auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeA* __restrict__>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ComputeTypeA*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

        auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeB* __restrict__>(p_shared_0) + a_block_space_size_aligned,
+            static_cast<ComputeTypeB*>(p_shared_0) + a_block_space_size_aligned,
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());

        auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeA* __restrict__>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ComputeTypeA*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

        auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeB* __restrict__>(p_shared_1) + a_block_space_size_aligned,
+            static_cast<ComputeTypeB*>(p_shared_1) + a_block_space_size_aligned,
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());

        auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1050,7 +1050,7 @@ struct ThreadwiseTensorSliceTransfer_v4
              typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcRefToOriginDisplacement&,
-                        const SrcBuffer& __restrict__ src_buf,
+                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstOriginIdx&,
                        DstBuffer& dst_buf) const

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -382,7 +382,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1

    template <typename DstBuffer, index_t ThreadScratchId = 0>
    __device__ void RunWrite(const DstDesc& dst_desc,
-                             DstBuffer& __restrict__ dst_buf,
+                             DstBuffer& dst_buf,
                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
    {
        // if there is transpose, it's done here

--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -25,16 +25,16 @@ struct DynamicBuffer
 {
    using type = T;

-    T* __restrict__ p_data_;
+    T* p_data_;
    ElementSpaceSize element_space_size_;
    T invalid_element_value_ = T{0};

-    __host__ __device__ constexpr DynamicBuffer(T* __restrict__ p_data, ElementSpaceSize element_space_size)
+    __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
        : p_data_{p_data}, element_space_size_{element_space_size}
    {
    }

-    __host__ __device__ constexpr DynamicBuffer(T* __restrict__ p_data,
+    __host__ __device__ constexpr DynamicBuffer(T* p_data,
                                                ElementSpaceSize element_space_size,
                                                T invalid_element_value)
        : p_data_{p_data},
@@ -410,7 +410,7 @@ template <AddressSpaceEnum BufferAddressSpace,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
          typename T,
          typename ElementSpaceSize>
-__host__ __device__ constexpr auto make_dynamic_buffer(T* __restrict__ p, ElementSpaceSize element_space_size)
+__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
 {
    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true, coherence>{
        p, element_space_size};
@@ -424,7 +424,7 @@ template <
    typename X,
    typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
 __host__ __device__ constexpr auto
-make_dynamic_buffer(T* __restrict__ p, ElementSpaceSize element_space_size, X invalid_element_value)
+make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
 {
    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false, coherence>{
        p, element_space_size, invalid_element_value};