fixed and formatted code

3eee7eda · mtgu0705 · a7f24bfc · 3eee7eda · 3eee7eda · 3eee7eda
Commit 3eee7eda authored Jan 02, 2025 by mtgu0705
8 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -6,16 +6,17 @@
 #include <iostream>
 #include <sstream>
-#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/common_header.hpp"
 #include "ck/host_utility/flush_cache.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp"
-#include "ck/utility/common_header.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -3,11 +3,11 @@
 #pragma once
-#include "ck/utility/amd_inline_asm.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
 #include "ck/utility/type_convert.hpp"
+#include "ck/utility/amd_inline_asm.hpp"
 #include <cassert>
 namespace ck {
@@ -254,11 +254,11 @@ struct UnaryOpBase
    public:
    __host__ __device__ ~UnaryOpBase() = default;
-    __host__ __device__ constexpr UnaryOpBase()                    = default;
+    __host__ __device__ constexpr UnaryOpBase()                   = default;
-    __host__ __device__ constexpr UnaryOpBase(const UnaryOpBase&)  = default;
+    __host__ __device__ constexpr UnaryOpBase(const UnaryOpBase&) = default;
-    __host__ __device__ constexpr UnaryOpBase(UnaryOpBase&&)       = default;
+    __host__ __device__ constexpr UnaryOpBase(UnaryOpBase&&)      = default;
    __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default;
-    __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&)      = default;
+    __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&) = default;
    __host__ __device__ virtual inline void operator()(float& y, const float& x) const = 0;
@@ -306,12 +306,12 @@ struct PassThroughPack2
 struct PassThrough final : public UnaryOpBase
 {
-    __host__ __device__ constexpr PassThrough()                    = default;
+    __host__ __device__ constexpr PassThrough()                   = default;
-    __host__ __device__ constexpr PassThrough(const PassThrough&)  = default;
+    __host__ __device__ constexpr PassThrough(const PassThrough&) = default;
-    __host__ __device__ constexpr PassThrough(PassThrough&&)       = default;
+    __host__ __device__ constexpr PassThrough(PassThrough&&)      = default;
    __host__ __device__ PassThrough& operator=(const PassThrough&) = default;
-    __host__ __device__ PassThrough& operator=(PassThrough&&)      = default;
+    __host__ __device__ PassThrough& operator=(PassThrough&&) = default;
-    __host__ __device__ ~PassThrough()                             = default;
+    __host__ __device__ ~PassThrough()                        = default;
    __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; }
@@ -677,12 +677,12 @@ struct UnarySquare
 struct UnaryAbs final : public UnaryOpBase
 {
-    __host__ __device__ constexpr UnaryAbs()                 = default;
+    __host__ __device__ constexpr UnaryAbs()                = default;
-    __host__ __device__ constexpr UnaryAbs(const UnaryAbs&)  = default;
+    __host__ __device__ constexpr UnaryAbs(const UnaryAbs&) = default;
-    __host__ __device__ constexpr UnaryAbs(UnaryAbs&&)       = default;
+    __host__ __device__ constexpr UnaryAbs(UnaryAbs&&)      = default;
    __host__ __device__ UnaryAbs& operator=(const UnaryAbs&) = default;
-    __host__ __device__ UnaryAbs& operator=(UnaryAbs&&)      = default;
+    __host__ __device__ UnaryAbs& operator=(UnaryAbs&&) = default;
-    __host__ __device__ ~UnaryAbs()                          = default;
+    __host__ __device__ ~UnaryAbs()                     = default;
    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
@@ -734,12 +734,12 @@ struct UnarySqrt
 struct Relu final : public UnaryOpBase
 {
-    __host__ __device__ constexpr Relu()             = default;
+    __host__ __device__ constexpr Relu()            = default;
-    __host__ __device__ constexpr Relu(const Relu&)  = default;
+    __host__ __device__ constexpr Relu(const Relu&) = default;
-    __host__ __device__ constexpr Relu(Relu&&)       = default;
+    __host__ __device__ constexpr Relu(Relu&&)      = default;
    __host__ __device__ Relu& operator=(const Relu&) = default;
-    __host__ __device__ Relu& operator=(Relu&&)      = default;
+    __host__ __device__ Relu& operator=(Relu&&) = default;
-    __host__ __device__ ~Relu()                      = default;
+    __host__ __device__ ~Relu()                 = default;
    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
@@ -915,12 +915,12 @@ struct Gelu
 struct Sigmoid final : public UnaryOpBase
 {
-    __host__ __device__ constexpr Sigmoid()                = default;
+    __host__ __device__ constexpr Sigmoid()               = default;
-    __host__ __device__ constexpr Sigmoid(const Sigmoid&)  = default;
+    __host__ __device__ constexpr Sigmoid(const Sigmoid&) = default;
-    __host__ __device__ constexpr Sigmoid(Sigmoid&&)       = default;
+    __host__ __device__ constexpr Sigmoid(Sigmoid&&)      = default;
    __host__ __device__ Sigmoid& operator=(const Sigmoid&) = default;
-    __host__ __device__ Sigmoid& operator=(Sigmoid&&)      = default;
+    __host__ __device__ Sigmoid& operator=(Sigmoid&&) = default;
-    __host__ __device__ ~Sigmoid()                         = default;
+    __host__ __device__ ~Sigmoid()                    = default;
    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
@@ -976,12 +976,12 @@ struct Silu
 struct TanH final : public UnaryOpBase
 {
-    __host__ __device__ constexpr TanH()             = default;
+    __host__ __device__ constexpr TanH()            = default;
-    __host__ __device__ constexpr TanH(const TanH&)  = default;
+    __host__ __device__ constexpr TanH(const TanH&) = default;
-    __host__ __device__ constexpr TanH(TanH&&)       = default;
+    __host__ __device__ constexpr TanH(TanH&&)      = default;
    __host__ __device__ TanH& operator=(const TanH&) = default;
-    __host__ __device__ TanH& operator=(TanH&&)      = default;
+    __host__ __device__ TanH& operator=(TanH&&) = default;
-    __host__ __device__ ~TanH()                      = default;
+    __host__ __device__ ~TanH()                 = default;
    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -29,7 +29,7 @@ template <typename GridwiseGemm,
          TailNumber TailNum       = TailNumber::Full>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
    // __attribute__((amdgpu_waves_per_eu(1, 1)))
    kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
@@ -59,7 +59,7 @@ template <typename GridwiseGemm,
          TailNumber TailNum       = TailNumber::Full>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
    // __attribute__((amdgpu_waves_per_eu(1, 1)))
    kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -3,10 +3,10 @@
 #pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_description/tensor_space_filling_curve.hpp"
-#include "ck/utility/common_header.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -560,7 +560,8 @@ struct ThreadwiseTensorSliceTransfer_v3
                buffer_(Number<buffer_offset>{}) = src_tmp_vector.template AsType<SrcData>()[i];
            });
-            constexpr auto move_on_dim = [&]() constexpr {
+            constexpr auto move_on_dim = [&]() constexpr
+            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_;
                static_for<0, nDim, 1>{}([&](auto i) {
@@ -573,7 +574,8 @@ struct ThreadwiseTensorSliceTransfer_v3
                });
                return move_on_dim_;
-            }();
+            }
+            ();
            // move
            static_for<0, nDim, 1>{}([&](auto i) {
@@ -718,7 +720,8 @@ struct ThreadwiseTensorSliceTransfer_v3
                is_dst_valid,
                dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}]);
-            constexpr auto move_on_dim = [&]() constexpr {
+            constexpr auto move_on_dim = [&]() constexpr
+            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_;
                static_for<0, nDim, 1>{}([&](auto i) {
@@ -731,7 +734,8 @@ struct ThreadwiseTensorSliceTransfer_v3
                });
                return move_on_dim_;
-            }();
+            }
+            ();
            // move
            static_for<0, nDim, 1>{}([&](auto i) {

--- a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
@@ -50,11 +50,9 @@ using device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple<
        //Compute friendly
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,   128,   8,   32,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,    64,   8,   32,  32,   32,    2,    2,     S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   224,  256,   128,   8,   32,  16,   16,    7,    8,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, true>,
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,   128,   8,   32,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,    64,   8,   32,  32,   32,    2,    2,     S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<2, 128, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   224,  256,   128,   8,   32,  16,   16,    7,    8,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, true>,
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,    64,   8,   32,  32,   32,    2,    2,     S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<2, 128, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
        //Latency friendly
@@ -80,8 +78,6 @@ using device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple<
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,    32,  256,   128,   8,   32,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
        // Memory friendly v4
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,   128,   32,   64,    8,   32,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<2, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, true>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,   128,   16,   128,   8,   16,  16,   16,    4,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, true>,
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   32,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   16,   128,   8,   16,  16,   16,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
@@ -101,25 +97,6 @@ using device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple<
        //new Memory friendly kernel
        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   16,    64,   256,   8,   32,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   16,   256,   8,   16,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<16, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   256,   8,   16,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<16, 4, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   256,   8,   16,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<16, 4, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   32,   256,   8,   32,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // // Memory friendly v3
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   32,   256,   8,   32,  32,   32,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   16,   256,   8,   16,  16,   16,    2,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   16,   256,   8,   16,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   256,   8,   16,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   256,   8,   16,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   32,   256,   8,   32,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   64,   256,   8,   32,  16,   16,    1,    2,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   64,   256,   8,   32,  32,   32,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,  128,   256,   8,   32,  16,   16,    1,    4,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,  128,   256,   8,   32,  32,   32,    1,    2,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,    16,  256,   256,   8,   32,  16,   16,    1,    4,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,    32,  256,   256,   8,   32,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>
    // clang-format on
    >;
 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -23,11 +23,10 @@ void add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
 {
    add_device_operation_instances(
        instances,
-        // device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances<Interwave, GemmDefault>{});
        device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances<Intrawave, GemmDefault>{});
 }
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
\ No newline at end of file
--- a/profiler/include/profiler/profile_gemm_b_scale_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp
@@ -159,7 +159,7 @@ bool profile_gemm_b_scale_impl(int do_verification,
        {
            for(int k = 0; k < K; k++)
            {
-                ck::pk_i4_t i4x2 = b_k_n(k, n);
+                ck::pk_i4_t i4x2 = b_k_n(k, n).data;
                int8_t i4        = 0;
                if(k % 2 == 1)
                    i4 = (i4x2 >> 0) & 0xf;
@@ -227,7 +227,7 @@ bool profile_gemm_b_scale_impl(int do_verification,
                    for(int k = 0; k < 4; k++)
                    {
-                        int i4x2         = b_k_n_permute(j + k * 2, i);
+                        int i4x2         = b_k_n_permute(j + k * 2, i).data;
                        input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
                        input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
                    }

--- a/profiler/src/profile_gemm_b_scale.cpp
+++ b/profiler/src/profile_gemm_b_scale.cpp
@@ -165,12 +165,6 @@ int profile_gemm_b_scale(int argc, char* argv[])
        return pass ? 0 : 1;
    };
-    // if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN &&
-    // B_scale_block == BScaleBlockTile::K_64)
-    // {
-    //     return profile(F16{}, I4{}, F16{}, F16{}, F32{}, F16{}, ck::Number<64>{}, Row{}, Col{},
-    //     Row{});
-    // }
    if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN &&
       B_scale_block == BScaleBlockTile::K_128)
    {