Commit 5cf73a5e authored by aska-0096

debug code enabled

parent 32bac6f3
......@@ -48,6 +48,40 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
template <typename IntType>
struct UnsignedWeightPreprocessor
{
};
template <>
struct UnsignedWeightPreprocessor<int8_t>
{
using UnsignedWeight = Tensor<uint8_t>;
using SignedWeight = Tensor<int8_t>;
static UnsignedWeight convert(SignedWeight const& Input)
{
UnsignedWeight Output = Input.template CopyAsType<uint8_t>();
auto f_kn = [&](auto k, auto n) {
const uint8_t adder = 128;
int8_t v_signed_weight;
uint8_t v_unsigned_weight;
ck::tensor_operation::element_wise::PassThrough{}(v_signed_weight, Input(k, n));
v_unsigned_weight = ck::type_convert<uint8_t>(v_signed_weight) + adder;
Output(k, n) = v_unsigned_weight;
};
make_ParallelTensorFunctor(f_kn, Input.mDesc.GetLengths()[0], Input.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
return Output;
}
UnsignedWeight operator()(SignedWeight const& Input) { return convert(Input); }
};
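The preprocessor above shifts every signed int8 weight by +128 so Matrix B can be handed to the device as unsigned bytes; the kernel later undoes the offset in fp16 math. A minimal host-only sketch of that round trip (the helper names encode_weight/decode_weight are illustrative, not part of this commit):
#include <cassert>
#include <cstdint>
// Offset encoding used by UnsignedWeightPreprocessor<int8_t>:
//   stored_u8 = original_i8 + 128 (mod 256)
// so the original value is recovered by subtracting 128 again.
inline uint8_t encode_weight(int8_t w) { return static_cast<uint8_t>(static_cast<uint8_t>(w) + 128u); }
inline int8_t decode_weight(uint8_t u) { return static_cast<int8_t>(static_cast<int>(u) - 128); }
int main()
{
    for(int w = -128; w <= 127; ++w)
    {
        const uint8_t u = encode_weight(static_cast<int8_t>(w));
        assert(decode_weight(u) == w);
    }
    return 0;
}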
inline bool
parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
{
......
......@@ -5,8 +5,18 @@
#include "ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp"
// Implementation follows the paper:
// Kim, Young Jin, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. “Who Says Elephants Can’t Run:
// Bringing Large Scale MoE Models into Cloud Scale Production.” arXiv, November 17, 2022.
// https://doi.org/10.48550/arXiv.2211.10017.
// Assume the weight (Matrix B) has been preprocessed with an offset add so it is stored as an
// unsigned integer.
// The DeviceOp computes: CDataType = ADataType * Dequant(BDataType) * ScaleDataType
// The HostRef computes:  CDataType = ADataType * Dequant(QuantDataType) * ScaleDataType
using ADataType = ck::half_t;
using BDataType = int8_t;
using QuantDataType = int8_t;
using BDataType = uint8_t;
using ScaleDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
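As a reminder of the formula in the comments above, here is a scalar host-side sketch of one output element of the device-side computation, assuming a per-column (per-n) scale and the +128 offset encoding; it is illustrative only, not the CK reference kernel:
#include <cstdint>
// c[m][n] = sum_k a[m][k] * Dequant(b[k][n]) * scale[n]
// with Dequant(b) = float(b) - 128 for the offset-encoded uint8 weight.
float dequant_dot(const float* a_row, const uint8_t* b_col, float scale, int K)
{
    float acc = 0.0f;
    for(int k = 0; k < K; ++k)
    {
        acc += a_row[k] * (static_cast<float>(b_col[k]) - 128.0f) * scale;
    }
    return acc;
}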
......@@ -40,13 +50,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_
1, // Prefetch stage
128, // BlockSize
128, // MPerBlock
64, // NPerBlock
128, // NPerBlock
64, // KPerBlock
8, // K1
16, // MPerWmma
16, // NPerWmma
4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave
2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave
4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave
S<4, 32, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
......@@ -68,7 +78,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferencefpAintBGemm<ADataType,
BDataType,
QuantDataType,
ScaleDataType,
CDataType,
AccDataType,
......
......@@ -26,7 +26,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<QuantDataType> quant_b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
// assume scale tensor is [1, n]
Tensor<ScaleDataType> scale_k_n(f_host_tensor_descriptor(K, N, 0, Row{}));
......@@ -35,35 +35,38 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-5.f, 5.f}(quant_b_k_n);
ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-5.f, 5.f}(scale_k_n);
break;
case 2:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
ck::utils::FillUniformDistribution<QuantDataType>{-1.f, 1.f}(quant_b_k_n);
ck::utils::FillUniformDistribution<ScaleDataType>{-1.f, 1.f}(scale_k_n);
break;
case 3:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-5.f, 5.f}(quant_b_k_n);
ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-5.f, 5.f}(scale_k_n);
break;
case 4:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n);
ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{1.f, 1.f}(quant_b_k_n);
ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{2.f, 2.f}(scale_k_n);
break;
case 5:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-2.f, 2.f}(b_k_n);
ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-2.f, 2.f}(quant_b_k_n);
ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-2.f, 2.f}(scale_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
ck::utils::FillUniformDistribution<QuantDataType>{-1.f, 1.f}(quant_b_k_n);
ck::utils::FillUniformDistribution<ScaleDataType>{-1.f, 1.f}(scale_k_n);
}
UnsignedWeightPreprocessor<QuantDataType> preprocessor;
Tensor<BDataType> b_k_n = preprocessor(quant_b_k_n);
#if 0
printf("Matrix A:\n");
for (int im = 0; im < M; im++)
......@@ -78,8 +81,9 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
}
printf("\n");
}
printf("Matrix B:\n");
#endif
#if 0
printf("Matrix QuantB:\n");
for (int in = 0; in < N; in++)
{
for (int ik = 0; ik < K; ik++)
......@@ -88,12 +92,29 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
printf("|");
}
printf(" %02x", b_k_n(ik,in));
printf(" %02x", *(reinterpret_cast<uint8_t*>(&quant_b_k_n(ik,in))));
}
printf("\n");
}
#endif
#if 0
printf("Matrix Scale:\n");
for(int in = 0; in < N; in++)
{
for(int ik = 0; ik < 1; ik++)
{
if(ik % 16 == 0)
{
printf("|");
}
printf(" %04x", *(reinterpret_cast<uint16_t*>(&scale_k_n(ik, in))));
}
printf("\n");
}
#endif
#if 0
printf("Matrix B:\n");
for (int in = 0; in < N; in++)
{
for (int ik = 0; ik < K; ik++)
......@@ -102,12 +123,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
printf("|");
}
printf(" %04x", *(reinterpret_cast<uint16_t*>(&scale_k_n(ik,in))));
printf(" %02x", b_k_n(ik,in));
}
printf("\n");
}
#endif
#endif
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
......@@ -191,8 +212,13 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, scale_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
auto ref_argument = ref_gemm.MakeArgument(a_m_k,
quant_b_k_n,
scale_k_n,
c_m_n_host_result,
a_element_op,
b_element_op,
c_element_op);
ref_invoker.Run(ref_argument);
......
......@@ -309,8 +309,10 @@ struct Blockwise_fpAintB_GemmWMMA
b_thread_desc_.GetElementSpaceSize());
auto scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ScaleDataType>(
scale_thread_desc_.GetElementSpaceSize());
auto converted_b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ADataType>(
b_thread_desc_.GetElementSpaceSize());
// auto converted_b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ADataType>(
// b_thread_desc_.GetElementSpaceSize());
tensor_operation::element_wise::FastNumericArrayConverter<BDataType, ADataType, WmmaK>
fast_numeric_converter;
// basic intrinsic to determine loopover direction
if constexpr(MRepeat < NRepeat)
......@@ -345,15 +347,29 @@ struct Blockwise_fpAintB_GemmWMMA
make_tuple(I0, n0, I0, I0, I0, I0),
scale_thread_buf);
// convert B from int8 to fp16, multiply scale
static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) {
converted_b_thread_buf(i) =
scale_thread_buf[i / WmmaK] *
type_convert<ADataType>(b_thread_buf[i]);
vector_type<BDataType, WmmaK> b_int_vec;
vector_type<ADataType, WmmaK> b_thread_vec;
static_for<0, WmmaK, 1>{}([&](auto i) {
b_int_vec.template AsType<BDataType>()(i) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(i / B_K1 / B_KRow,
n0,
0,
(i / B_K1) % B_KRow,
0,
i % B_K1))>{}];
});
// convert B from uint8 to fp16, multiply scale
b_thread_vec = fast_numeric_converter(b_int_vec);
static_for<0, WmmaK, 1>{}([&](auto i) {
b_thread_vec.template AsType<ADataType>()(i) =
scale_thread_buf[n0] *
b_thread_vec.template AsType<ADataType>()(i);
});
vector_type<ADataType, WmmaK> a_thread_vec;
vector_type<ADataType, WmmaK> b_thread_vec;
static_for<0, WmmaK, 1>{}([&](auto i) {
a_thread_vec.template AsType<ADataType>()(i) =
......@@ -364,14 +380,6 @@ struct Blockwise_fpAintB_GemmWMMA
(i / A_K1) % A_KRow,
0,
i % A_K1))>{}];
b_thread_vec.template AsType<ADataType>()(i) =
converted_b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(i / B_K1 / B_KRow,
n0,
0,
(i / B_K1) % B_KRow,
0,
i % B_K1))>{}];
});
using wmma_input_type_a = typename vector_type<ADataType, WmmaK>::type;
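The CalculateOffset calls above decompose the per-lane fragment index i into (k0, krow, k1) via integer division and modulo. A tiny host-side illustration with assumed tile parameters (WmmaK = 16, B_K1 = 8, B_KRow = 2 are example values, not taken from this commit):
#include <cstdio>
int main()
{
    // i -> (k0, krow, k1) mapping used when gathering A/B fragments:
    //   k1   = i % B_K1
    //   krow = (i / B_K1) % B_KRow
    //   k0   = i / B_K1 / B_KRow
    constexpr int WmmaK = 16, B_K1 = 8, B_KRow = 2;
    for(int i = 0; i < WmmaK; ++i)
    {
        std::printf("i=%2d -> k0=%d krow=%d k1=%d\n",
                    i, i / B_K1 / B_KRow, (i / B_K1) % B_KRow, i % B_K1);
    }
    return 0;
}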
......@@ -390,37 +398,48 @@ struct Blockwise_fpAintB_GemmWMMA
}
else
{
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read weight scale
scale_thread_copy_.Run(
scale_block_desc_1_n0_n1_n2_1,
make_tuple(I0, n0, I0, I0, I0, I0),
scale_block_buf,
scale_thread_desc_,
make_tuple(I0, n0, I0, I0, I0, I0),
scale_thread_buf);
static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of
// k=0,kpack*1, ..
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read weight scale
scale_thread_copy_.Run(scale_block_desc_1_n0_n1_n2_1,
make_tuple(I0, n0, I0, I0, I0, I0),
scale_block_buf,
scale_thread_desc_,
make_tuple(I0, n0, I0, I0, I0, I0),
scale_thread_buf);
#if 0
printf("Tid: %03d, n: %02d, scale_thread_buf: %04x\n",
get_thread_local_1d_id(), n0.value,
*(reinterpret_cast<const uint16_t*>(&scale_thread_buf[n0]))
);
#endif
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of
// k=0,kpack*1, ..
// read B
b_thread_copy_.Run(
b_block_desc_k0_n0_n1_n2_k1,
make_tuple(Number<k * WmmaK / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, n0, I0, I0, I0, I0),
b_thread_buf);
// convert B from int8 to fp16, multiply scale
static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) {
converted_b_thread_buf(i) = scale_thread_buf[i / WmmaK] *
type_convert<ADataType>(b_thread_buf[i]); // call byte permute
});
// read B
b_thread_copy_.Run(
b_block_desc_k0_n0_n1_n2_k1,
make_tuple(Number<k * WmmaK / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, n0, I0, I0, I0, I0),
b_thread_buf);
vector_type<BDataType, WmmaK> b_int_vec;
vector_type<ADataType, WmmaK> b_thread_vec;
static_for<0, WmmaK, 1>{}([&](auto i) {
b_int_vec.template AsType<BDataType>()(i) =
b_thread_buf[Number<b_thread_desc_.CalculateOffset(make_tuple(
i / B_K1 / B_KRow, n0, 0, (i / B_K1) % B_KRow, 0, i % B_K1))>{}];
});
// convert B from uint8 to fp16, multiply scale
b_thread_vec = fast_numeric_converter(b_int_vec);
static_for<0, WmmaK, 1>{}([&](auto i) {
b_thread_vec.template AsType<ADataType>()(i) =
scale_thread_buf[n0] * b_thread_vec.template AsType<ADataType>()(i);
});
static_for<0, MRepeat, 1>{}([&](auto m0) {
// read A
a_thread_copy_.Run(
a_block_desc_k0_m0_m1_m2_k1,
......@@ -429,7 +448,8 @@ struct Blockwise_fpAintB_GemmWMMA
a_thread_desc_,
make_tuple(I0, m0, I0, I0, I0, I0),
a_thread_buf);
if (true){
if(true)
{
#if 0
printf("Tid: %03d, m, n, k: %02d, %02d, %02d, a_thread_buf: %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x|\n",
get_thread_local_1d_id(), m0.value, n0.value, k.value,
......@@ -495,17 +515,8 @@ struct Blockwise_fpAintB_GemmWMMA
#endif
}
vector_type<ADataType, WmmaK> a_thread_vec;
vector_type<ADataType, WmmaK> b_thread_vec;
static_for<0, WmmaK, 1>{}([&](auto i) {
b_thread_vec.template AsType<ADataType>()(i) =
converted_b_thread_buf[Number<b_thread_desc_.CalculateOffset(
make_tuple(i / B_K1 / B_KRow,
n0,
0,
(i / B_K1) % B_KRow,
0,
i % B_K1))>{}];
a_thread_vec.template AsType<ADataType>()(i) =
a_thread_buf[Number<a_thread_desc_.CalculateOffset(
make_tuple(i / A_K1 / A_KRow,
......@@ -561,14 +572,10 @@ struct Blockwise_fpAintB_GemmWMMA
Number<B_K1>{},
Number<1>{}));
static constexpr auto scale_thread_desc_ =
make_naive_tensor_descriptor(make_tuple(Number<WmmaK / B_K1 / B_KRow>{},
Number<NRepeat>{},
I1,
Number<B_KRow>{},
I1,
I1),
make_tuple(I0, I1, I0, I0, I0, I0));
static constexpr auto scale_thread_desc_ = make_naive_tensor_descriptor(
make_tuple(
Number<WmmaK / B_K1 / B_KRow>{}, Number<NRepeat>{}, I1, Number<B_KRow>{}, I1, I1),
make_tuple(I0, I1, I0, I0, I0, I0));
// C[M, N, NumRegWMMA]
static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
......
......@@ -95,8 +95,9 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB<ALayout,
static constexpr auto BEnableLds_auto = MWaves == 1 ? false : true;
// If true, LDS is used unconditionally
static constexpr auto AEnableLds_manu = false;
static constexpr auto BEnableLds_manu = false;
// LDS bypass feature not checked.
static constexpr auto AEnableLds_manu = true;
static constexpr auto BEnableLds_manu = true;
static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);
......
......@@ -6,6 +6,7 @@
#include "ck/utility/data_type.hpp"
#include "ck/utility/math.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/utility/get_id.hpp"
namespace ck {
namespace tensor_operation {
......@@ -68,6 +69,12 @@ struct PassThrough
y = x;
}
template <>
__host__ __device__ void operator()<uint8_t, uint8_t>(uint8_t& y, const uint8_t& x) const
{
y = x;
}
template <>
__host__ __device__ void operator()<int8_t, int32_t>(int8_t& y, const int32_t& x) const
{
......@@ -371,6 +378,90 @@ struct Swish
float beta_ = 1.0f;
};
// Fast array conversion of offset-encoded int8 weights (stored as uint8) to fp16
template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>
struct FastNumericArrayConverter
{
};
template <>
struct FastNumericArrayConverter<uint8_t, ck::half_t, 4>
{
using InputArray = vector_type<uint8_t, 4>;
using OutputArray = vector_type<ck::half_t, 4>;
__device__ static OutputArray convert(InputArray const& Input)
{
OutputArray Output;
uint32_t* half_2 = reinterpret_cast<uint32_t*>(&Output);
uint32_t const uint8_4 = reinterpret_cast<uint32_t const&>(Input);
// printf("Tid: %03d, uint8_4: %08x\n",
// get_thread_local_1d_id(),
// uint8_4);
static constexpr uint32_t byte_selector_01 = 0x05010500;
static constexpr uint32_t byte_selector_23 = 0x05030502;
static constexpr uint32_t fp16_adder = 0x64646464;
half_2[0] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_01);
half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23);
// printf("Tid: %03d, Part1 converted: %08x | %08x\n",
// get_thread_local_1d_id(),
// half_2[Number<0>{}],
// half_2[Number<1>{}]);
// Lastly, we subtract 1152 from our constructed number using fp16 math to get our signed
// integer as fp16.
static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n"
: "=v"(half_2[0])
: "v"(half_2[0]), "s"(I8s_TO_F16s_MAGIC_NUM));
asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n"
: "=v"(half_2[1])
: "v"(half_2[1]), "s"(I8s_TO_F16s_MAGIC_NUM));
// printf("Tid: %03d, Part2 converted: %08x | %08x\n",
// get_thread_local_1d_id(),
// half_2[Number<0>{}],
// half_2[Number<1>{}]);
return Output;
}
__device__ OutputArray operator()(InputArray const& Input) { return convert(Input); }
};
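To make the magic constants above concrete: byte-permuting a weight byte v into 0x6400 | v yields the fp16 value 1024 + v (0x6400 encodes 2^10 with a zero mantissa, and v lands in the low 8 bits of the mantissa), while 0x6480 is 1152.0 in fp16, so the packed subtract produces v - 128 and undoes the host-side offset. A host-only check of that identity (standard C++, not CK code):
#include <cassert>
#include <cmath>
#include <cstdint>
// Decode a positive, normal fp16 bit pattern; only such values occur in this trick.
static float fp16_bits_to_float(uint16_t h)
{
    const int exponent = (h >> 10) & 0x1F;
    const int mantissa = h & 0x3FF;
    return std::ldexp(1.0f + static_cast<float>(mantissa) / 1024.0f, exponent - 15);
}
int main()
{
    for(int v = 0; v <= 255; ++v)
    {
        const float packed = fp16_bits_to_float(static_cast<uint16_t>(0x6400 | v));
        assert(packed - 1152.0f == static_cast<float>(v - 128));
    }
    return 0;
}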
template <index_t N>
struct FastNumericArrayConverter<uint8_t, ck::half_t, N>
{
static constexpr int VEC_WIDTH = 4;
static_assert(!(N % VEC_WIDTH), "N must be multiple of 4.");
using InputArray = vector_type<uint8_t, N>;
using OutputArray = vector_type<ck::half_t, N>;
__device__ static OutputArray convert(InputArray const& Input)
{
FastNumericArrayConverter<uint8_t, ck::half_t, 4> converter;
OutputArray Output;
using Vec_InputArray = vector_type<uint8_t, 4>;
using Vec_OutputArray = vector_type<ck::half_t, 4>;
Vec_OutputArray* half_4_ptr = reinterpret_cast<Vec_OutputArray*>(&Output);
Vec_InputArray const* uint8_4_ptr = reinterpret_cast<Vec_InputArray const*>(&Input);
static_for<0, N / VEC_WIDTH, 1>{}(
[&](auto i) { half_4_ptr[i] = converter(uint8_4_ptr[i]); });
return Output;
}
__device__ OutputArray operator()(InputArray const& Input) { return convert(Input); }
};
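A device-side usage sketch mirroring how the blockwise GEMM earlier in this diff consumes the converter; WmmaK = 16 and the dequant_fragment helper are assumptions for illustration, and the CK headers touched in this commit are presumed to be included:
__device__ void dequant_fragment(const ck::vector_type<uint8_t, 16>& b_int_vec,
                                 ck::half_t scale,
                                 ck::vector_type<ck::half_t, 16>& b_thread_vec)
{
    ck::tensor_operation::element_wise::FastNumericArrayConverter<uint8_t, ck::half_t, 16>
        fast_numeric_converter;
    // uint8 -> fp16 (value - 128), then apply the per-column dequant scale.
    b_thread_vec = fast_numeric_converter(b_int_vec);
    ck::static_for<0, 16, 1>{}([&](auto i) {
        b_thread_vec.template AsType<ck::half_t>()(i) =
            scale * b_thread_vec.template AsType<ck::half_t>()(i);
    });
}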
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
......@@ -52,11 +52,13 @@ __global__ void
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
defined(__gfx1102__))
__shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size];
if (false && get_thread_local_1d_id()==0){
if(false && get_thread_local_1d_id() == 0)
{
printf("lds_size: %lu\n", GridwiseGemm::SharedMemTrait::lds_size);
printf("lds_a_size: %d\n", GridwiseGemm::SharedMemTrait::a_block_space_size_aligned);
printf("lds_b_size: %d\n", GridwiseGemm::SharedMemTrait::b_block_space_size_aligned);
printf("lds_scale_size: %d\n", GridwiseGemm::SharedMemTrait::scale_block_space_size_aligned);
printf("lds_scale_size: %d\n",
GridwiseGemm::SharedMemTrait::scale_block_space_size_aligned);
}
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
......@@ -459,17 +461,12 @@ struct GridwiseFpAintBGemm_Wmma
// Workaround, Freeze transform
return make_naive_tensor_descriptor(make_tuple(Number<KWmma * K0PerWmma>{},
Number<NRepeat>{},
I1,
Number<B_KRow>{},
I1,
Number<B_K1>{}),
make_tuple(I0,
I1,
I0,
I0,
I0,
I0));
Number<NRepeat>{},
I1,
Number<B_KRow>{},
I1,
Number<B_K1>{}),
make_tuple(I0, I1, I0, I0, I0, I0));
}
}();
......@@ -642,10 +639,12 @@ struct GridwiseFpAintBGemm_Wmma
: 0;
static constexpr auto a_block_space_offset = 0;
static constexpr auto b_block_space_offset =
(a_block_space_offset + a_block_space_size_aligned) * sizeof(ADataType)/sizeof(BDataType);
static constexpr auto b_block_space_offset =
(a_block_space_offset + a_block_space_size_aligned) * sizeof(ADataType) /
sizeof(BDataType);
static constexpr auto scale_block_space_offset =
(b_block_space_offset + b_block_space_size_aligned) * sizeof(BDataType)/sizeof(ScaleDataType);
(b_block_space_offset + b_block_space_size_aligned) * sizeof(BDataType) /
sizeof(ScaleDataType);
// LDS allocation for C shuffle in LDS
static constexpr auto c_shuffle_block_space_size =
......
......@@ -719,7 +719,8 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, false>
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.Run(
b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf);
scale_blockwise_copy.Run(scale_grid_desc, scale_grid_buf, scale_block_desc, b_block_origin_idx, scale_block_buf);
scale_blockwise_copy.Run(
scale_grid_desc, scale_grid_buf, scale_block_desc, b_block_origin_idx, scale_block_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
......
......@@ -1145,7 +1145,7 @@ struct ThreadwiseTensorSliceTransfer_v4
src_desc, src_data_coord);
#if 0
printf("Tid: %03d, LDS read offset: %d\n", get_thread_local_1d_id(), src_data_coord.GetOffset());
#endif
#endif
// copy data from src_buf into src_tmp_vector
if constexpr(SrcBuffer::IsDynamicBuffer())
{
......@@ -1419,7 +1419,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
1,
0);
v_theother_row = type_convert_sp<SrcData>(temp);
if(get_thread_local_1d_id() % 32 < 16)
{
// apply type convert
......
......@@ -207,10 +207,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// copy data from src_buf into src_vector_container
auto src_vector_container = src_vector_type{
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
if (false){
if(false)
{
printf("Tid: %03d, a_grid_buf: %04x\n",
get_thread_local_1d_id(),
*(reinterpret_cast<const uint16_t*>(&src_vector_container.template AsType<SrcData>()[Number<0>{}])));
get_thread_local_1d_id(),
*(reinterpret_cast<const uint16_t*>(
&src_vector_container.template AsType<SrcData>()[Number<0>{}])));
}
// copy data from src_vector_container into src_thread_scratch_
src_thread_scratch_tuple_(thread_scratch_id)
......
......@@ -312,7 +312,8 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
(is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
(is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
"wrong! not implemented");
if constexpr(is_same<T, double>::value)
......@@ -614,6 +615,114 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
static_cast<index_t>(coherence));
return bit_cast<int8x16_t>(tmp);
#endif
}
}
else if constexpr(is_same<T, uint8_t>::value)
{
if constexpr(N == 1)
{
return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 2)
{
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
return llvm_amdgcn_raw_buffer_load_i8x2(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
#else
int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<uint8x2_t>(tmp);
#endif
}
else if constexpr(N == 4)
{
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
return llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
#else
int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<uint8x4_t>(tmp);
#endif
}
else if constexpr(N == 8)
{
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
vector_type<uint8_t, 8> tmp;
tmp.AsType<uint8x4_t>()(Number<0>{}) =
llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
tmp.AsType<uint8x4_t>()(Number<1>{}) =
llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int8_t),
static_cast<index_t>(coherence));
return tmp.AsType<uint8x8_t>()(Number<0>{});
#else
int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<uint8x8_t>(tmp);
#endif
}
else if constexpr(N == 16)
{
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
vector_type<uint8_t, 16> tmp;
tmp.AsType<uint8x4_t>()(Number<0>{}) =
llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
tmp.AsType<uint8x4_t>()(Number<1>{}) =
llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int8_t),
static_cast<index_t>(coherence));
tmp.AsType<uint8x4_t>()(Number<2>{}) =
llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 8 * sizeof(int8_t),
static_cast<index_t>(coherence));
tmp.AsType<uint8x4_t>()(Number<3>{}) =
llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 12 * sizeof(int8_t),
static_cast<index_t>(coherence));
return tmp.AsType<uint8x16_t>()(Number<0>{});
#else
int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<uint8x16_t>(tmp);
#endif
}
}
......
......@@ -133,6 +133,13 @@ struct scalar_type<int8_t>
static constexpr index_t vector_size = 1;
};
template <>
struct scalar_type<uint8_t>
{
using type = uint8_t;
static constexpr index_t vector_size = 1;
};
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
template <>
struct scalar_type<int4_t>
......@@ -944,6 +951,15 @@ using int8x16_t = typename vector_type<int8_t, 16>::type;
using int8x32_t = typename vector_type<int8_t, 32>::type;
using int8x64_t = typename vector_type<int8_t, 64>::type;
// u8
using uint8x2_t = typename vector_type<uint8_t, 2>::type;
using uint8x4_t = typename vector_type<uint8_t, 4>::type;
using uint8x8_t = typename vector_type<uint8_t, 8>::type;
using uint8x16_t = typename vector_type<uint8_t, 16>::type;
using uint8x32_t = typename vector_type<uint8_t, 32>::type;
using uint8x64_t = typename vector_type<uint8_t, 64>::type;
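A minimal device-side sketch showing how the new u8 aliases combine with vector_type, mirroring the N == 8 branch of amd_buffer_load_impl earlier in this diff (the helper name combine_two_dwords is illustrative):
__device__ ck::uint8x8_t combine_two_dwords(ck::uint8x4_t lo, ck::uint8x4_t hi)
{
    ck::vector_type<uint8_t, 8> tmp;
    tmp.AsType<ck::uint8x4_t>()(ck::Number<0>{}) = lo;
    tmp.AsType<ck::uint8x4_t>()(ck::Number<1>{}) = hi;
    return tmp.AsType<ck::uint8x8_t>()(ck::Number<0>{});
}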
// Convert X to Y
template <typename Y, typename X>
__host__ __device__ constexpr Y type_convert(X x)
......
# find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
# git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'