Commit 8b83b087 authored by mtgu0705

format some files

parent f9435a75
@@ -45,13 +45,6 @@ using DeviceGemmV2Instance =
 // clang-format on
 
-using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                        BDataType,
-                                                                        CDataType,
-                                                                        AccDataType,
-                                                                        PassThrough,
-                                                                        PassThrough,
-                                                                        PassThrough>;
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
                                                                         CDataType,
......
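The deleted block above appears to be a verbatim duplicate of the alias that follows it (the kept copy is truncated in this view). C++ only permits redeclaring a type alias when the redeclaration names exactly the same type, so the duplicate was redundant at best and would have become ill-formed the moment the two argument lists diverged. A minimal sketch of the rule (types are illustrative, not from the commit):

```cpp
using RefInstance = float; // first declaration
using RefInstance = float; // OK: redeclaration naming the same type
// using RefInstance = double; // error: conflicting type alias declaration
```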
@@ -6,16 +6,16 @@
 #include <iostream>
 #include <sstream>
 
-#include "ck/utility/common_header.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
-#include "ck/host_utility/flush_cache.hpp"
+#include "ck/utility/common_header.hpp"
 
 namespace ck {
 namespace tensor_operation {
......
@@ -41,74 +41,42 @@ __host__ __device__ inline half4_t pki4_to_half4(int q)
     res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
         bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));
 
     asm volatile("v_pk_mul_f16 %0, %1, %2"
                  : "=v"(res.template AsType<half2_t>()(Number<0>{}))
                  : "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));
     asm volatile("v_pk_mul_f16 %0, %1, %2"
                  : "=v"(res.template AsType<half2_t>()(Number<1>{}))
                  : "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));
 
     return res.template AsType<half4_t>()[Number<0>{}];
 }
 
 // Further fuse the scale into the inline assembly; sanity check failed.
 #if 0
-__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half_t& scale)
+__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half2_t& scale)
 {
-    constexpr int LO = 0x000f000f;
-    constexpr int HI = 0x00f000f0;
-    constexpr int EX = 0x64006400;
-    // Guarantee that the `(a & b) | c` operations are LOP3s.
-    // int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-    // int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+    const int EX = 0x64006400;
+
+    // Extract the two int4 at the low bits and create two fp16 numbers.
     int lo = amd_assembly_and_or_b32(q, LO, EX);
+    // Extract the two int4 at the high bits and create two fp16 numbers.
     int hi = amd_assembly_and_or_b32(q, HI, EX);
 
     // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
     // directly into `SUB` and `ADD`.
-    // constexpr int SUB = 0xE408E408; //-8
-    // constexpr int MUL = 0x2c002c00; // 1/16
-    // constexpr int ADD = 0xd480d480; //-79
-    constexpr half_t SUB = bit_cast<half_t>(static_cast<uint16_t>(0xE408));
-    constexpr half_t MUL = bit_cast<half_t>(static_cast<uint16_t>(0x2c00));
-    constexpr half_t ADD = bit_cast<half_t>(static_cast<uint16_t>(0xd480));
-    vector_type<half_t, 2> scale_2;
-    scale_2.template AsType<half_t>()(Number<0>{}) = scale;
-    scale_2.template AsType<half_t>()(Number<1>{}) = scale;
-    vector_type<half_t, 2> sub_2;
-    sub_2.template AsType<half_t>()(Number<0>{}) = SUB * scale;
-    sub_2.template AsType<half_t>()(Number<1>{}) = SUB * scale;
-    vector_type<half_t, 2> mul_2;
-    mul_2.template AsType<half_t>()(Number<0>{}) = MUL * scale;
-    mul_2.template AsType<half_t>()(Number<1>{}) = MUL * scale;
-    vector_type<half_t, 2> add_2;
-    add_2.template AsType<half_t>()(Number<0>{}) = ADD * scale;
-    add_2.template AsType<half_t>()(Number<1>{}) = ADD * scale;
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
 
     vector_type<half_t, 4> res;
     res.template AsType<half2_t>()(Number<0>{}) =
-        amd_assembly_pk_fma_f16(bit_cast<half2_t>(lo),
-                                scale_2.template AsType<half2_t>()(Number<0>{}),
-                                sub_2.template AsType<half2_t>()(Number<0>{}));
-    res.template AsType<half2_t>()(Number<1>{}) =
-        amd_assembly_pk_fma_f16(bit_cast<half2_t>(hi),
-                                mul_2.template AsType<half2_t>()(Number<0>{}),
-                                add_2.template AsType<half2_t>()(Number<0>{}));
+        amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
+    res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
+        bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));
 
-    // asm volatile("v_pk_mul_f16 %0, %1, %2"
-    //              : "=v"(res.template AsType<half2_t>()(Number<0>{}))
-    //              : "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));
+    asm volatile("v_pk_mul_f16 %0, %1, %2"
+                 : "=v"(res.template AsType<half2_t>()(Number<0>{}))
+                 : "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));
 
-    // asm volatile("v_pk_mul_f16 %0, %1, %2"
-    //              : "=v"(res.template AsType<half2_t>()(Number<1>{}))
-    //              : "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));
+    asm volatile("v_pk_mul_f16 %0, %1, %2"
+                 : "=v"(res.template AsType<half2_t>()(Number<1>{}))
+                 : "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));
 
     return res.template AsType<half4_t>()[Number<0>{}];
 }
 #endif
 
 __host__ __device__ inline half2_t pki4_to_half2(pk_i4_t q)
 {
......
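For readers following the arithmetic in the hunk above: OR-ing a 4-bit value into the mantissa of the fp16 bit pattern 0x6400 (which encodes 1024.0) yields 1024 + x for the low nibble and 1024 + 16 * x for the high nibble, because the high nibble lands on mantissa bits 4-7. The constants then undo the bias: SUB = 0xE408 is fp16 -1032 = -(1024 + 8), MUL = 0x2C00 is fp16 1/16, and ADD = 0xD480 is fp16 -72, so each lane comes out as the signed value x - 8. A host-side sketch working through one lane (not part of the commit; the helper and sample values are illustrative):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode a normal IEEE fp16 bit pattern into a float (illustrative helper).
static float half_bits_to_float(std::uint16_t h)
{
    const int sign     = (h >> 15) & 0x1;
    const int exponent = (h >> 10) & 0x1F;
    const int mantissa = h & 0x3FF;
    const float value  = (1.0f + mantissa / 1024.0f) * std::ldexp(1.0f, exponent - 15);
    return sign ? -value : value;
}

int main()
{
    // One 16-bit lane of the packed word q: x_lo in bits 0-3, x_hi in bits 4-7.
    const std::uint16_t lane = 0x00A3; // x_lo = 3, x_hi = 10

    // EX = 0x6400 is fp16 1024.0; OR-ing a nibble into its mantissa gives:
    const std::uint16_t lo = (lane & 0x000F) | 0x6400; // fp16 1024 + x_lo      = 1027
    const std::uint16_t hi = (lane & 0x00F0) | 0x6400; // fp16 1024 + 16 * x_hi = 1184

    // SUB = 0xE408 is fp16 -1032 = -(1024 + 8): (1024 + x_lo) - 1032 = x_lo - 8.
    const float res_lo = half_bits_to_float(lo) - 1032.0f; // 3 - 8 = -5

    // MUL = 0x2C00 is fp16 1/16 and ADD = 0xD480 is fp16 -72:
    // (1024 + 16 * x_hi) / 16 - 72 = x_hi - 8.
    const float res_hi = half_bits_to_float(hi) / 16.0f - 72.0f; // 10 - 8 = 2

    std::printf("lo: %g, hi: %g\n", res_lo, res_hi);
    return 0;
}
```

The disabled pki4_to_half4_scale variant tried to fold the per-channel scale into SUB/MUL/ADD up front, which is algebraically equivalent, but per the comment it failed its sanity check; the live pki4_to_half4 path therefore applies the scale afterwards with v_pk_mul_f16.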
@@ -3,16 +3,16 @@
 #pragma once
 
-#include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
-#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/utility/common_header.hpp"
 
 namespace ck {
@@ -29,7 +29,7 @@ template <typename GridwiseGemm,
           TailNumber TailNum = TailNumber::Full>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
 // __attribute__((amdgpu_waves_per_eu(1, 1)))
 kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
@@ -59,7 +59,7 @@ template <typename GridwiseGemm,
           TailNumber TailNum = TailNumber::Full>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
 // __attribute__((amdgpu_waves_per_eu(1, 1)))
 kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
......
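The reindented __launch_bounds__ attribute in the two hunks above is how these kernels bound register pressure: in HIP the first argument is a hard cap on threads per block and the second (MinimumOccupancy here) asks the compiler to keep enough registers free for at least that many wavefronts per execution unit; the commented-out amdgpu_waves_per_eu attribute is a related per-EU occupancy hint. A minimal sketch of the same pattern (the kernel and the 256/2 bounds are illustrative, not from CK):

```cpp
#include <hip/hip_runtime.h>

// First argument: hard cap on threads per block.
// Second argument: minimum wavefronts per execution unit the compiler should
// keep resident, which limits how many registers it may use per thread.
__global__ void __launch_bounds__(256, 2) fill_kernel(float* out, float value, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
        out[i] = value;
}
```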
@@ -4,8 +4,8 @@
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP
 
-#include "data_type.hpp"
 #include "c_style_pointer_cast.hpp"
+#include "data_type.hpp"
 
 // TODO: deprecate all amd_assembly_outer_product_xxx
......
@@ -3,12 +3,12 @@
 #pragma once
 
-#include <vector>
-#include <memory>
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include <memory>
+#include <vector>
 
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
......
@@ -2,9 +2,9 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
......
@@ -8,18 +8,18 @@
 #include <typeinfo>
 
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
 namespace profiler {
......
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
 
 #include "profiler/profile_gemm_b_scale_impl.hpp"
 #include "profiler_operation_registry.hpp"
......