Merge branch 'develop' into amd-develop

8f41bd8e · Jun Liu · 7f65ac05 · d7f05fb9 · 8f41bd8e · 8f41bd8e
Commit 8f41bd8e authored Apr 11, 2024 by Jun Liu
20 changed files
--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kk
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kn
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mk
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mn
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_k
                                   BF16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_k
                                   BF16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_m
                                   BF16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_m
                                   BF16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kk
                                   F16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kn
                                   F16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mk
                                   F16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mn
                                   F16,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance =
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance =
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance =
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance =
                                   F32,
                                   PassThrough,
                                   PassThrough,
-                                   Scale>;
+                                   Scale,
+                                   2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kk
                                       F32,
                                       PassThrough,
                                       PassThrough,
-                                       Scale>;
+                                       Scale,
+                                       2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kn
                                       F32,
                                       PassThrough,
                                       PassThrough,
-                                       Scale>;
+                                       Scale,
+                                       2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,

--- a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
@@ -31,7 +31,8 @@ using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mk
                                       F32,
                                       PassThrough,
                                       PassThrough,
-                                       Scale>;
+                                       Scale,
+                                       2>;

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,