resolved conflicts

f23a2e2a · Jakub Piasecki · f3eb5a18 · c0adab48 · f23a2e2a · f23a2e2a
Commit f23a2e2a authored Feb 11, 2025 by Jakub Piasecki
20 changed files
--- a/codegen/test/rtc/include/rtc/manage_ptr.hpp
+++ b/codegen/test/rtc/include/rtc/manage_ptr.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER


--- a/codegen/test/rtc/include/rtc/tmp_dir.hpp
+++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
 #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR


--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <rtc/hip.hpp>
 #include <rtc/compile_kernel.hpp>
 #include <rtc/tmp_dir.hpp>

--- a/codegen/test/rtc/src/hip.cpp
+++ b/codegen/test/rtc/src/hip.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <rtc/hip.hpp>
 #include <rtc/manage_ptr.hpp>
 #include <stdexcept>

--- a/codegen/test/rtc/src/kernel.cpp
+++ b/codegen/test/rtc/src/kernel.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <rtc/kernel.hpp>
 #include <rtc/manage_ptr.hpp>
 #include <rtc/hip.hpp>
+#include <stdexcept>
 #include <cassert>

 // extern declare the function since hip/hip_ext.h header is broken

--- a/codegen/test/rtc/src/tmp_dir.cpp
+++ b/codegen/test/rtc/src/tmp_dir.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <rtc/tmp_dir.hpp>
 #include <algorithm>
 #include <random>

--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.14.1
+rocm-docs-core==1.15.0
 sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -199,7 +199,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.14.1
+rocm-docs-core==1.15.0
    # via -r requirements.in
 rpds-py==0.22.3
    # via

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -61,7 +61,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)

 add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)

-list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -31,9 +31,7 @@ using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmXdl
 // ######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
 // ######|          |          |          |            |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>;
-// // clang-format on

-// clang-format off
 using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|

--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -16,7 +16,7 @@ if(USE_BITINT_EXTENSION_INT4)
    add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
 endif(USE_BITINT_EXTENSION_INT4)

-list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/24_batched_gemm/CMakeLists.txt
+++ b/example/24_batched_gemm/CMakeLists.txt
@@ -22,3 +22,6 @@ if(USE_BITINT_EXTENSION_INT4)
    add_example_executable(example_batched_gemm_xdl_int4 batched_gemm_xdl_int4.cpp)
    add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4)
 endif()
+
+add_example_executable(example_batched_gemm_xdl_fp16int4_b_scale_v3 batched_gemm_xdl_fp16int4_b_scale_v3.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16int4_b_scale_v3)
--- a/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <numeric>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+// Shorthand for ck::Sequence; used for the integer-sequence template arguments of the
+// device instance defined further down in this file.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Element types of the example: A is fp16, B is ck::pk_i4_t (the included .inc reads two
+// 4-bit values out of each element's .data), and the B scale factors are fp16.
+// AccDataType is float; CShuffleDataType and CDataType are fp16.
+using ADataType        = F16;
+using BDataType        = ck::pk_i4_t;
+using BScaleDataType   = ck::half_t;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using CDataType        = F16;
+
+// Memory layouts: A row-major, B column-major, C row-major.
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+// No element-wise transformation is applied to A, B or C.
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// PermuteA / PermuteB are forwarded as the last two template arguments of the device
+// instance; PermuteB additionally gates the host-side "weight permute" step in the .inc.
+static constexpr auto PermuteA    = false;
+static constexpr bool PermuteB    = false;
+
+// Granularity of the B scale tensor: the .inc indexes the scale as
+// (k / Scale_Block_K, n / Scale_Block_N), i.e. one scale per 128 K-entries of each column.
+static constexpr ck::index_t Scale_Block_N = 1;
+static constexpr ck::index_t Scale_Block_K = 128;
+
+// Passed to the device instance and also used as the K1 chunk length of the optional
+// host-side B permutation in the .inc.
+static constexpr ck::index_t KPerBlock = 256;
+
+// Device kernel configuration used by the example. The template arguments are positional;
+// their meaning is defined by the class template in
+// ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp.
+// NOTE(review): the alias is named "V2" while the class is ..._CShuffleV3_BScale and the
+// pipeline version is v3 — confirm whether the alias should be renamed (the .inc refers to
+// it by this name, so both would have to change together).
+// clang-format off
+using DeviceBatchedGemmV2Instance = 
+    ck::tensor_operation::device::DeviceBatchedGemm_Xdl_CShuffleV3_BScale<  
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, BScaleDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        256, Scale_Block_N, Scale_Block_K,
+        16, 64,
+        KPerBlock, 8, 32,
+        16,   16,
+        1,    1,
+        S<32, 8, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 8, 8, 0,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 32, 32, 0,
+        1, 1, S<1, 16, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, CDataType, CDataType, PermuteA, PermuteB>;
+// clang-format on
+
+// Host-side reference GEMM used for verification.
+// The second type argument is AccDataType (float) rather than BDataType: the .inc feeds the
+// reference with b_g_k_n_dequant, a Tensor<float> holding the already dequantized and
+// scaled B values (presumably that argument is the B element type — TODO confirm against
+// reference_batched_gemm.hpp).
+using ReferenceBatchedGemmInstance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                      AccDataType,
+                                                                                      CDataType,
+                                                                                      AccDataType,
+                                                                                      AElementOp,
+                                                                                      BElementOp,
+                                                                                      CElementOp>;
+#include "run_batched_gemm_example_fp16int4_b_scale.inc"
+
+// Runs the example and maps its boolean result to a process exit code:
+// 0 when the example reports success, 1 otherwise.
+int main(int argc, char* argv[])
+{
+    const bool passed = run_batched_gemm_fp16_int4_b_scale_example(argc, argv);
+    return passed ? 0 : 1;
+}
--- a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#include <random>
+
+#pragma once
+// Dimensions and strides of the batched GEMM problem.
+// Member order matters: run_batched_gemm unpacks this struct with a structured binding,
+// which binds members in declaration order.
+// The stride defaults below are derived from the default M, N, K; the example entry point
+// recomputes all of them after it has chosen the actual sizes.
+struct ProblemSize final
+{
+    ck::index_t M = 128;
+    ck::index_t N = 128;
+    ck::index_t K = 384;
+
+    // Leading-dimension strides of A, B and C.
+    ck::index_t stride_A = K;
+    ck::index_t stride_B = K;
+    ck::index_t stride_C = N;
+
+    // Element offset between two consecutive batches of A, B and C.
+    ck::index_t batch_stride_A = M * K;
+    ck::index_t batch_stride_B = K * N;
+    ck::index_t batch_stride_C = M * N;
+
+    // Batched Gemm count
+    ck::index_t batch_count = 2;
+
+    // Split K count
+    ck::index_t KBatch = 1;
+};
+
+// Run-time switches of the example; the entry point overwrites all three from argv[1..3].
+struct ExecutionConfig final
+{
+    // When true, run_batched_gemm computes a host reference and compares the device result.
+    bool do_verification = true;
+    // Selects the tensor fill pattern in run_batched_gemm's switch (cases 0-5; any other
+    // value takes the default branch).
+    int init_method      = 1;
+    // When true, run_batched_gemm runs the kernel in timing mode and prints performance.
+    bool time_kernel     = true;
+};
+
+// Relative tolerance for comparing results whose element type is DataType.
+// run_batched_gemm forwards get_rtol<CDataType>() to ck::utils::check_err.
+// The type tests are evaluated in the order written and the first match decides the
+// result; any type that is not listed gets 1e-3.
+template <typename DataType>
+inline __host__ __device__ constexpr double get_rtol()
+{
+    // clang-format off
+    return std::is_same_v<DataType, float>       ? 1e-3
+         : std::is_same_v<DataType, double>      ? 1e-6
+         : std::is_same_v<DataType, ck::half_t>  ? 1e-3
+         : std::is_same_v<DataType, ck::bhalf_t> ? 5e-2
+         : (std::is_same_v<DataType, int32_t> ||
+            std::is_same_v<DataType, int8_t> ||
+            std::is_same_v<DataType, ck::f8_t>)  ? 1e-1   // f8: 240 and 224 are acceptable
+         : std::is_same_v<DataType, ck::bf8_t>   ? 1.5e-1 // 57344 and 49152 are acceptable
+                                                 : 1e-3;
+    // clang-format on
+}
+
+// Absolute tolerance for comparing results whose element type is DataType.
+// run_batched_gemm forwards get_atol<CDataType>() to ck::utils::check_err.
+// The type tests are evaluated in the order written and the first match decides the
+// result; any type that is not listed gets 1e-3.
+template <typename DataType>
+inline __host__ __device__ constexpr double get_atol()
+{
+    // clang-format off
+    return std::is_same_v<DataType, float>       ? 1e-3
+         : std::is_same_v<DataType, double>      ? 1e-6
+         : std::is_same_v<DataType, ck::half_t>  ? 1e-3
+         : std::is_same_v<DataType, ck::bhalf_t> ? 5e-2
+         : (std::is_same_v<DataType, int32_t> ||
+            std::is_same_v<DataType, int8_t>)    ? 1e-1
+         : std::is_same_v<DataType, ck::f8_t>    ? 16.1   // 240 and 224 are acceptable
+         : std::is_same_v<DataType, ck::bf8_t>   ? 8192.1 // 57344 and 49152 are acceptable
+                                                 : 1e-3;
+    // clang-format on
+}
+
+// Runs one batched fp16 x packed-int4 GEMM with per-block B scaling on the device and,
+// depending on `config`, verifies it against a host reference and/or times it.
+//
+// problem_size: sizes, strides, batch count and split-K count of the problem.
+// config:       do_verification / init_method / time_kernel switches.
+//
+// Returns true when verification passed or was not requested. Also returns true when the
+// device instance reports the argument as unsupported (a message is written to std::cerr
+// and nothing is run), so an unsupported configuration is not reported as a failure.
+bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    // Members are bound in declaration order of ProblemSize.
+    auto& [M,
+           N,
+           K,
+           stride_A,
+           stride_B,
+           stride_C,
+           batch_stride_A,
+           batch_stride_B,
+           batch_stride_C,
+           batch_count,
+           KBatch] = problem_size;
+
+    // Builds a 3-D (batch, row, col) host descriptor; the layout tag decides whether the
+    // unit stride is on the column index (RowMajor) or on the row index (otherwise).
+    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+        {
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
+        }
+        else
+        {
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
+        }
+    };
+
+    // The scale tensor has ceil(K / Scale_Block_K) x ceil(N / Scale_Block_N) entries per batch.
+    ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K;
+    ck::index_t batch_BScale_Stride =
+        ((K + Scale_Block_K - 1) / Scale_Block_K) * ((N + Scale_Block_N - 1) / Scale_Block_N);
+
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
+    Tensor<BDataType> b_g_k_n(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
+    // Copy of B that is rearranged on the host before being uploaded; b_g_k_n itself stays
+    // untouched and is used for the reference computation.
+    Tensor<BDataType> b_g_k_n_permute(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
+    Tensor<BScaleDataType> b1_g_k_n(
+        f_host_tensor_descriptor(batch_count,
+                                 (K + Scale_Block_K - 1) / Scale_Block_K,
+                                 (N + Scale_Block_N - 1) / Scale_Block_N,
+                                 Scale_Stride_BN,
+                                 batch_BScale_Stride,
+                                 BLayout{}));
+
+    // Fill A, B and the B scale according to init_method; values outside 0-5 use the
+    // default branch.
+    switch(config.init_method)
+    {
+    case 0:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_g_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_g_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    case 2:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_g_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 3:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_g_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 4:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_g_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    case 5:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_g_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.5, 0.5});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_g_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+    }
+
+    Tensor<CDataType> c_g_m_n_host_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+    Tensor<CDataType> c_g_m_n_device_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
+    std::cout << "b1_g_k_n: " << b1_g_k_n.mDesc << std::endl;
+    std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
+
+    DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * b_g_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem b1_g_scale_device_buf(sizeof(BScaleDataType) * b1_g_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_g_m_n_device_buf(sizeof(CDataType) *
+                                 c_g_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    printf("a_g_m_k size: %zu, b_g_k_n size: %zu, b1_g_k_n size: %zu, c_g_m_n size: %zu\n",
+           a_g_m_k.mDesc.GetElementSpaceSize(),
+           b_g_k_n_permute.mDesc.GetElementSpaceSize(),
+           b1_g_k_n.mDesc.GetElementSpaceSize(),
+           c_g_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    // weight permute
+    // Optional host-side rearrangement of B into (K0, N, K1) chunks with K1 = KPerBlock;
+    // compiled in only when PermuteB is true, otherwise B is copied unchanged.
+    if constexpr(PermuteB)
+    {
+        printf("Permute B\n");
+        int K1 = KPerBlock;
+        int K0 = K / KPerBlock;
+
+        // int K0, N, K1
+        for(int bs = 0; bs < batch_count; bs++)
+        {
+            for(int j = 0; j < K0; j++)
+            {
+                for(int i = 0; i < N; i++)
+                {
+                    for(int jj = 0; jj < K1; jj++)
+                    {
+                        b_g_k_n_permute(bs * batch_stride_B + j * N * K1 + i * K1 + jj) =
+                            b_g_k_n(bs * batch_stride_B + i * K + (j * K1 + jj));
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        b_g_k_n_permute = b_g_k_n;
+    }
+
+    // vector pk_i4x4 permute
+    // For every column and every step of 8 along K, the four packed elements at
+    // k = j, j+2, j+4, j+6 are split into eight 4-bit values (high nibble first), reordered
+    // and written back to the same four positions.
+    for(int bs = 0; bs < batch_count; bs++)
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j += 8)
+            {
+                int input[8];
+
+                for(int k = 0; k < 4; k++)
+                {
+                    int i4x2         = b_g_k_n_permute(bs, j + k * 2, i).data;
+                    input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                    input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+                }
+
+                // permute 01234567->20643175
+                {
+                    int hi   = input[2];
+                    int lo   = input[0];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b_g_k_n_permute(bs, j + 0, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[6];
+                    int lo   = input[4];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b_g_k_n_permute(bs, j + 2, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[3];
+                    int lo   = input[1];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b_g_k_n_permute(bs, j + 4, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[7];
+                    int lo   = input[5];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b_g_k_n_permute(bs, j + 6, i) = i4x2;
+                }
+            }
+        }
+    }
+
+    a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data());
+    b_g_k_n_device_buf.ToDevice(b_g_k_n_permute.mData.data());
+    b1_g_scale_device_buf.ToDevice(b1_g_k_n.mData.data());
+    // NOTE(review): `workspace` is never used in this function — confirm it can be removed.
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    // do GEMM
+    auto gemm      = DeviceBatchedGemmV2Instance{};
+    auto invoker   = gemm.MakeInvoker();
+    float ave_time = 0;
+
+    auto argument =
+        gemm.MakeArgument(static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
+                          static_cast<BDataType*>(b_g_k_n_device_buf.GetDeviceBuffer()),
+                          static_cast<CDataType*>(c_g_m_n_device_buf.GetDeviceBuffer()),
+                          M,
+                          N,
+                          K,
+                          stride_A,
+                          stride_B,
+                          stride_C,
+                          Scale_Stride_BN,
+                          batch_stride_A,
+                          batch_stride_B,
+                          batch_stride_C,
+                          batch_BScale_Stride,
+                          static_cast<BScaleDataType*>(b1_g_scale_device_buf.GetDeviceBuffer()),
+                          batch_count, // batch count
+                          KBatch,      // split K count
+                          a_element_op,
+                          b_element_op,
+                          c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        // Deliberately not a failure: an unsupported configuration is skipped.
+        return true;
+    }
+
+    bool pass = true;
+    Tensor<float> b_g_k_n_dequant({batch_count, K, N});
+    if(config.do_verification)
+    {
+        // Dequantize the original (un-permuted) B on the host: even k takes the high nibble,
+        // odd k the low nibble; the 4-bit value is shifted by -8 and multiplied by the scale
+        // of its (k / Scale_Block_K, n / Scale_Block_N) block.
+        float v_b = 0;
+        for(int bs = 0; bs < batch_count; bs++)
+        {
+            for(int n = 0; n < N; n++)
+            {
+                for(int k = 0; k < K; k++)
+                {
+                    ck::pk_i4_t i4x2 = b_g_k_n(bs, k, n).data;
+                    int8_t i4        = 0;
+                    if(k % 2 == 1)
+                        i4 = (i4x2.data >> 0) & 0xf;
+                    else
+                        i4 = (i4x2.data >> 4) & 0xf;
+                    i4  = i4 - 8;
+                    v_b = ck::type_convert<float>(i4);
+
+                    b_g_k_n_dequant(bs, k, n) =
+                        ck::type_convert<float>(v_b) *
+                        ck::type_convert<float>(b1_g_k_n(bs, k / Scale_Block_K, n / Scale_Block_N));
+                }
+            }
+        }
+
+        auto ref_gemm    = ReferenceBatchedGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(a_g_m_k,
+                                                  b_g_k_n_dequant,
+                                                  c_g_m_n_host_result,
+                                                  PassThrough{},
+                                                  PassThrough{},
+                                                  PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
+        hip_check_error(hipDeviceSynchronize());
+
+        c_g_m_n_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_g_m_n_device_result,
+                                     c_g_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+        // One invocation processes all batches, so both the operation count and the byte
+        // count scale with batch_count.
+        std::size_t flop = 2_uz * batch_count * M * N * K;
+        std::size_t num_btype =
+            sizeof(ADataType) * batch_count * M * K +
+            sizeof(BDataType) * batch_count * K * N /
+                (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
+            sizeof(CDataType) * batch_count * M * N;
+
+        // ave_time is printed in ms below, hence the 1e9 / 1e6 divisors for TFlops and GB/s.
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+
+    // Debug dump of all tensors; disabled by default.
+#if 0
+    // print A matrix
+    printf("A matrix:\n");
+    for(int bs = 0; bs < batch_count; bs++)
+    {
+        printf("batch %d -> Address: %p\n", bs, static_cast<void*>(&a_g_m_k(bs, 0, 0)));
+        for(int i = 0; i < M; i++)
+        {
+            for(int j = 0; j < K; j++)
+            {
+                printf("%.2f,", static_cast<float>(a_g_m_k(bs, i, j)));
+            }
+            printf("\n");
+        }
+    }
+
+    // print B matrix original
+    printf("B matrix original:\n");
+    for(int bs = 0; bs < batch_count; bs++)
+    {
+        printf("batch %d -> Address: %p\n", bs, static_cast<void*>(&b_g_k_n(bs, 0, 0)));
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                ck::pk_i4_t i4x2 = b_g_k_n(bs, k, n).data;
+                int8_t i4        = 0;
+                if(k % 2 == 1)
+                    i4 = (i4x2.data >> 0) & 0xf;
+                else
+                    i4 = (i4x2.data >> 4) & 0xf;
+                i4 = i4 - 8;
+                printf("%d,", static_cast<int>(i4));
+            }
+            printf("\n");
+        }
+    }
+
+    // print B matrix
+    printf("B matrix:\n");
+    for(int bs = 0; bs < batch_count; bs++)
+    {
+        printf("batch %d -> Address: %p\n", bs, static_cast<void*>(&b_g_k_n_dequant(bs, 0, 0)));
+        for(int i = 0; i < K; i++)
+        {
+            for(int j = 0; j < N; j++)
+            {
+                printf("%.2f, ", static_cast<float>(b_g_k_n_dequant(bs, i, j)));
+            }
+            printf("\n");
+        }
+    }
+
+    // print B scale matrix
+    printf("B Scale matrix:\n");
+    for(int bs = 0; bs < batch_count; bs++)
+    {
+        printf("batch %d -> Address: %p\n", bs, static_cast<void*>(&b1_g_k_n(bs, 0, 0)));
+        for(int i = 0; i < (K + Scale_Block_K - 1) / Scale_Block_K; i++)
+        {
+            for(int j = 0; j < (N + Scale_Block_N - 1) / Scale_Block_N; j++)
+            {
+                printf("%.2f, ", static_cast<float>(b1_g_k_n(bs, i, j)));
+            }
+            printf("\n");
+        }
+    }
+
+    // print C matrix
+    printf("C matrix:\n");
+    for(int bs = 0; bs < batch_count; bs++)
+    {
+        printf(
+            "batch %d -> Address: %p\n", bs, static_cast<void*>(&c_g_m_n_device_result(bs, 0, 0)));
+        for(int i = 0; i < M; i++)
+        {
+            for(int j = 0; j < N; j++)
+            {
+                printf("%.2f, ", static_cast<float>(c_g_m_n_device_result(bs, i, j)));
+            }
+            printf("\n");
+        }
+    }
+
+    printf("C reference matrix:\n");
+    for(int bs = 0; bs < batch_count; bs++)
+    {
+        printf("batch %d -> Address: %p\n", bs, static_cast<void*>(&c_g_m_n_host_result(bs, 0, 0)));
+        for(int i = 0; i < M; i++)
+        {
+            for(int j = 0; j < N; j++)
+            {
+                printf("%.2f, ", static_cast<float>(c_g_m_n_host_result(bs, i, j)));
+            }
+            printf("\n");
+        }
+    }
+#endif
+
+    return pass;
+}
+
+// Command-line front end of the example.
+//
+// Accepted argument counts (argc includes the program name):
+//   argc == 4 : arg1..arg3 = verification, init method, time kernel; sizes are the
+//               seeded pseudo-random defaults chosen below.
+//   argc >= 7 : additionally arg4..arg6 = M, N, K, optional arg7 = batch count and
+//               optional arg8 = split-K count.
+// Any other argument count prints the usage text and terminates the process with exit(0).
+// NOTE(review): running without arguments therefore only prints the usage and never
+// executes the GEMM — confirm whether an argc == 1 default run was intended.
+//
+// Returns the result of run_batched_gemm for the selected problem.
+bool run_batched_gemm_fp16_int4_b_scale_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    // The generator is seeded with a constant, so the default sizes do not vary between
+    // runs of the same binary.
+    std::mt19937 gen(11939);
+    std::uniform_int_distribution<int> dis(0, 15);
+
+    problem_size.M = 128 * (dis(gen) + 1);
+    problem_size.N = 128 * (dis(gen) + 1);
+    problem_size.K = 256 * (dis(gen) + 2);
+
+    problem_size.batch_count = 2;
+
+    if(argc == 4 || argc >= 7)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+
+        if(argc >= 7)
+        {
+            problem_size.M = std::stoi(argv[4]);
+            problem_size.N = std::stoi(argv[5]);
+            problem_size.K = std::stoi(argv[6]);
+
+            if(argc >= 8)
+            {
+                problem_size.batch_count = std::stoi(argv[7]);
+            }
+
+            if(argc >= 9)
+            {
+                problem_size.KBatch = std::stoi(argv[8]);
+            }
+        }
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0-5=preset fill patterns for A, B and B scale, "
+               "other=default pattern)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 6: M, N, K (optional, all three together)\n");
+        printf("arg7: batch count (optional, requires arg4 to 6)\n");
+        printf("arg8: split-K count (optional, requires arg7)\n");
+        exit(0);
+    }
+
+    // Strides depend on the final sizes, so they are derived only after parsing.
+    problem_size.stride_A = problem_size.K;
+    problem_size.stride_B = problem_size.K;
+    problem_size.stride_C = problem_size.N;
+
+    problem_size.batch_stride_A = problem_size.M * problem_size.K;
+    problem_size.batch_stride_B = problem_size.K * problem_size.N;
+    problem_size.batch_stride_C = problem_size.M * problem_size.N;
+
+    return run_batched_gemm(problem_size, config);
+}
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
@@ -32,6 +32,56 @@ using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;
 template <ck::index_t NDimSpatial>
 using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;

+#if defined(CK_USE_AMD_MFMA_GFX950)
+// Device instance selected when CK_USE_AMD_MFMA_GFX950 is defined (see the enclosing #if);
+// the #else branch keeps the instance used for all other builds.
+// The argument labels below are the ones given in this file; unlabeled arguments are
+// marked as assumptions.
+template <ck::index_t NDimSpatial>
+using DeviceConvFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InputLayout<NDimSpatial>,
+        WeightLayout<NDimSpatial>,
+        ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>,
+        OutputLayout<NDimSpatial>,
+        InKernelDataType,
+        WeiKernelDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           // presumably NumGemmKPrefetchStage — TODO confirm against the device header
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        4,           // ABlockTransferSrcScalarPerVector
+        4,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        4,           // BBlockTransferSrcScalarPerVector
+        4,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        // The last four arguments are unlabeled in the original; presumably the CShuffle /
+        // C block-transfer settings — TODO confirm against the device header.
+        1,
+        1,
+        S<1, 16, 1, 16>,
+        4>;
+#else  // defined(CK_USE_AMD_MFMA_GFX950)
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
@@ -80,6 +130,7 @@ using DeviceConvFwdInstance =
        1,
        S<1, 16, 1, 16>,
        4>;
+#endif // defined(CK_USE_AMD_MFMA_GFX950)

 template <ck::index_t NDimSpatial>
 using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,

--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
@@ -5,6 +5,6 @@ if(USE_BITINT_EXTENSION_INT4)
   add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
 endif(USE_BITINT_EXTENSION_INT4)

-if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx95" AND NOT GPU_TARGETS MATCHES "gfx1")
   add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
 endif()
--- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt
+++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt
@@ -5,6 +5,6 @@ if(USE_BITINT_EXTENSION_INT4)
   add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
 endif(USE_BITINT_EXTENSION_INT4)

-if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
+if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx95" AND NOT GPU_TARGETS MATCHES "gfx1")
   add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
 endif()
--- a/example/62_convnd_activ/binary/CMakeLists.txt
+++ b/example/62_convnd_activ/binary/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)

--- a/example/62_convnd_activ/convinvscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convinvscale/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)