clang-format

83d926dc · aska-0096 · 6c1aa33a · 83d926dc · 83d926dc · 83d926dc
Commit 83d926dc authored Jun 15, 2023 by aska-0096
3 changed files
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
@@ -49,32 +49,32 @@ using DeviceConvFwdInstance =
        InElementOp,
        WeiElementOp,
        OutElementOp,
-        ConvSpec,   // ConvForwardSpecialization
-        GemmSpec,   // GemmSpecialization
-        1,          // Prefetch stage
-        128,        // BlockSize
-        64,         // MPerBlock
-        64,         // NPerBlock
-        64,         // KPerBlock
-        8,          // K1
-        16,         // MPerWMMA
-        16,         // NPerWMMA
-        4,          // MRepeat
-        1,          // NRepeat
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           // Prefetch stage
+        128,         // BlockSize
+        64,          // MPerBlock
+        64,          // NPerBlock
+        64,          // KPerBlock
+        8,           // K1
+        16,          // MPerWMMA
+        16,          // NPerWMMA
+        4,           // MRepeat
+        1,           // NRepeat
        S<4, 32, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>, // ABlockTransferSrcAccessOrder
-        2,          // ABlockTransferSrcVectorDim
-        8,          // ABlockTransferSrcScalarPerVector
-        8,          // ABlockTransferDstScalarPerVector_AK1
-        true,       // ABlockLdsExtraM
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        true,        // ABlockLdsExtraM
        S<4, 32, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>, // BBlockTransferSrcAccessOrder
-        2,          // BBlockTransferSrcVectorDim
-        8,          // BBlockTransferSrcScalarPerVector
-        8,          // BBlockTransferDstScalarPerVector_BK1
-        true,       // BBlockLdsExtraN
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        true,        // BBlockLdsExtraN
        1,
        1,
        S<1, 16, 1, 8>,
@@ -279,8 +279,9 @@ bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[])
    switch(conv_param.num_dim_spatial_)
    {
    // case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param);
-    case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param);
-    // case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param);
+    case 2:
+        return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param);
+        // case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param);
    }

    return false;

--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp
@@ -67,9 +67,8 @@ static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecial
 static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default;
 static constexpr auto TensorSpecC  = ck::tensor_operation::device::TensorSpecialization::Default;

-using  DeviceMHAFactory = 
-std::tuple<
-    ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle<
+using DeviceMHAFactory =
+    std::tuple<ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle<
        NumDimG,
        NumDimM,
        NumDimN,
@@ -138,8 +137,8 @@ std::tuple<
        2,              // CShuffleNWmmaPerWavePerShuffle
        S<1, 64, 1, 4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
-        MaskingSpec>   // MaskingSpecialization
-    >;
+        MaskingSpec>    // MaskingSpecialization
+               >;
 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                                                B0DataType,

--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -220,8 +220,8 @@ amd_assembly_outer_product_1x2(int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t& c0
                   "0"(c0),
                   "1"(c1));
 #else
-    c0     = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
-    c1     = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
+    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
+    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
 #endif
 }

@@ -257,10 +257,10 @@ __device__ void amd_assembly_outer_product_1x4(int8x4_t a,
                   "2"(c2),
                   "3"(c3));
 #else
-    c0     = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
-    c1     = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
-    c2     = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b2), c2, false);
-    c3     = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b3), c3, false);
+    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
+    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
+    c2 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b2), c2, false);
+    c3 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b3), c3, false);
 #endif
 }