clang-format changes for pr881

39a1f853 · Harisankar Sadasivan · a20863b0 · 39a1f853 · 39a1f853 · 39a1f853
Commit 39a1f853 authored Sep 15, 2023 by Harisankar Sadasivan
5 changed files
--- a/example/53_gemv_splitk/gemv_splitk_fp16.cpp
+++ b/example/53_gemv_splitk/gemv_splitk_fp16.cpp
@@ -19,12 +19,10 @@ using CElementOp = PassThrough;
 static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+#define K1 8 // K1PerThread:2,4,8
+#define K0 4 // K0PerBlock:1,2,3,4...32
-#define K1 8 //K1PerThread:2,4,8
+#define N1 2 // Nperthread:2,4,8
-#define K0 4 //K0PerBlock:1,2,3,4...32 
+#define B 64 // block-size:64
-#define N1 2 //Nperthread:2,4,8
-#define B 64 //block-size:64
 // clang-format off
 using DeviceGemvInstance = ck::tensor_operation::device::deviceGemvDl/*

--- a/include/ck/tensor_operation/gpu/device/device_gemv.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemv.hpp
@@ -20,20 +20,19 @@ template <typename ALayout,
          typename CElementwiseOperation>
 struct DeviceGemv : public BaseOperator
 {
-    virtual std::unique_ptr<BaseArgument>
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
-    MakeArgumentPointer(const void* p_a,
+                                                              const void* p_b,
-                        const void* p_b,
+                                                              void* p_c,
-                        void* p_c,
+                                                              ck::index_t M,
-                        ck::index_t M,
+                                                              ck::index_t N,
-                        ck::index_t N,
+                                                              ck::index_t K,
-                        ck::index_t K,
+                                                              ck::index_t StrideA,
-                        ck::index_t StrideA,
+                                                              ck::index_t StrideB,
-                        ck::index_t StrideB,
+                                                              ck::index_t StrideC,
-                        ck::index_t StrideC,
+                                                              AElementwiseOperation a_element_op,
-                        AElementwiseOperation a_element_op,
+                                                              BElementwiseOperation b_element_op,
-                        BElementwiseOperation b_element_op,
+                                                              CElementwiseOperation c_element_op,
-                        CElementwiseOperation c_element_op,
+                                                              ck::index_t KBatch = 1) = 0;
-                        ck::index_t KBatch=1) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp
@@ -271,7 +271,7 @@ struct deviceGemvDl : public DeviceGemv<ALayout,
            return false;
        }
    }
-    // // 
+    // //
    // polymorphic
    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {

--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -658,14 +658,14 @@ struct BlockToCTileMap_3DGrid_KSplit
        return make_tuple(blockIdx.z, blockIdx.y, blockIdx.x);
    }
-        //HS: Map 1D block-id to 3D tuple (M,N,K)
+    // HS: Map 1D block-id to 3D tuple (M,N,K)
    __host__ __device__ inline constexpr auto convert_1D_block_idx_to_3D_tuple(
        const index_t& block_1d_id, const index_t& N, const index_t& k_batch) const
    {
-        const auto Ndim= math::integer_divide_ceil(N, NPerBlock);
+        const auto Ndim = math::integer_divide_ceil(N, NPerBlock);
        return make_tuple(((block_1d_id) / (k_batch * Ndim)),
-                            (((block_1d_id) / k_batch) % Ndim),
+                          (((block_1d_id) / k_batch) % Ndim),
-                            (block_1d_id) % k_batch); // returns 3D tuple as (Mid,Nid,Kid)
+                          (block_1d_id) % k_batch); // returns 3D tuple as (Mid,Nid,Kid)
    }
    template <typename CTileIdx, typename CTileDim>

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemv_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemv_splitk.hpp
@@ -27,12 +27,12 @@ template <typename GridwiseGemv,
          typename Block2CTileMap>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_gemv_dl_v1r3(
+        kernel_gemv_dl_v1r3(
-        typename GridwiseGemv::Argument karg,
+            typename GridwiseGemv::Argument karg,
-        const Block2CTileMap& block_2_ctile_map) //: in __global__ functions, struct is
+            const Block2CTileMap& block_2_ctile_map) //: in __global__ functions, struct is
-                                                 // better for reduced load overhead
+                                                     // better for reduced load overhead
 {
    constexpr index_t shared_block_size =
        GridwiseGemv::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);