Fix cmake warnings (#1342)

* Cmake add -Wno-nvcc-compt * Remove template without initialization list * dpp remove template without init list * Fixes

Fix cmake warnings (#1342)
* Cmake add -Wno-nvcc-compt * Remove template without initialization list * dpp remove template without init list * Fixes
510325a4 · Bartłomiej Kocot · GitHub · 1da802bd · 510325a4 · 510325a4
Unverified Commit 510325a4 authored Jun 21, 2024 by Bartłomiej Kocot Committed by GitHub Jun 21, 2024
12 changed files
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -2,7 +2,7 @@
 #
 # MIT License
 #
-# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# Copyright (c) 2017-2024 Advanced Micro Devices, Inc.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -96,6 +96,7 @@ else()
                -Wno-covered-switch-default
                -Wno-unsafe-buffer-usage
                -Wno-unused-lambda-capture
+                -Wno-nvcc-compat
            )
        else()
            if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX")

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -300,9 +300,9 @@ struct BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                    dpp_gemm.template Run(a_thread_vec.template AsType<dpp_input_type>(),
+                    dpp_gemm.Run(a_thread_vec.template AsType<dpp_input_type>(),
-                                          b_thread_vec.template AsType<dpp_input_type>(),
+                                 b_thread_vec.template AsType<dpp_input_type>(),
-                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                                 c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });
        });

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -613,7 +613,7 @@ struct BlockwiseGemmXdlops_pipeline_v4
                            constexpr index_t c_offset =
                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -681,7 +681,7 @@ struct BlockwiseGemmXdlops_pipeline_v4
                            constexpr index_t c_offset =
                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -749,10 +749,9 @@ struct BlockwiseGemmXdlops_pipeline_v4
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -808,10 +807,9 @@ struct BlockwiseGemmXdlops_pipeline_v4
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -840,10 +838,9 @@ struct BlockwiseGemmXdlops_pipeline_v4
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -901,10 +898,9 @@ struct BlockwiseGemmXdlops_pipeline_v4
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -939,10 +935,9 @@ struct BlockwiseGemmXdlops_pipeline_v4
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -259,7 +259,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                            constexpr index_t c_offset =
                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -319,10 +319,9 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -584,7 +583,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                    block_sync_lds();
                                    __builtin_amdgcn_sched_barrier(0);
                                }
-                                xdlops_gemm.template Run(
+                                xdlops_gemm.Run(
                                    a_thread_vec.template AsType<mfma_input_type>(),
                                    b_thread_vec.template AsType<mfma_input_type>(),
                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -668,7 +667,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                block_sync_lds();
                                __builtin_amdgcn_sched_barrier(0);
                            }
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -303,7 +303,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                                constexpr index_t c_offset =
                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                                xdlops_gemm.template Run(
+                                xdlops_gemm.Run(
                                    a_thread_vec.template AsType<mfma_input_type>(),
                                    b_thread_vec.template AsType<mfma_input_type>(),
                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -374,7 +374,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                            constexpr index_t c_offset =
                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -428,10 +428,9 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -480,10 +479,9 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -821,7 +819,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                        block_sync_lds();
                                        __builtin_amdgcn_sched_barrier(0);
                                    }
-                                    xdlops_gemm.template Run(
+                                    xdlops_gemm.Run(
                                        a_thread_vec.template AsType<mfma_input_type>(),
                                        b_thread_vec.template AsType<mfma_input_type>(),
                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -914,7 +912,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                    block_sync_lds();
                                    __builtin_amdgcn_sched_barrier(0);
                                }
-                                xdlops_gemm.template Run(
+                                xdlops_gemm.Run(
                                    a_thread_vec.template AsType<mfma_input_type>(),
                                    b_thread_vec.template AsType<mfma_input_type>(),
                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -990,7 +988,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                block_sync_lds();
                                __builtin_amdgcn_sched_barrier(0);
                            }
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -1066,7 +1064,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                block_sync_lds();
                                __builtin_amdgcn_sched_barrier(0);
                            }
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -381,7 +381,7 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                            constexpr index_t c_offset =
                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                            xdlops_gemm.template Run(
+                            xdlops_gemm.Run(
                                a_thread_vec.template AsType<mfma_input_type>(),
                                b_thread_vec.template AsType<mfma_input_type>(),
                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -440,10 +440,9 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -403,7 +403,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                                constexpr index_t c_offset =
                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                                xdlops_gemm.template Run(
+                                xdlops_gemm.Run(
                                    a_thread_vec.template AsType<mfma_input_type>(),
                                    b_thread_vec.template AsType<mfma_input_type>(),
                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -472,10 +472,9 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -529,10 +528,9 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });
@@ -562,10 +560,9 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -444,7 +444,7 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                                constexpr index_t c_offset =
                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                                xdlops_gemm.template Run(
+                                xdlops_gemm.Run(
                                    a_thread_vec.template AsType<mfma_input_type>(),
                                    b_thread_vec.template AsType<mfma_input_type>(),
                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -513,10 +513,9 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                    a_thread_copy_.Run(
                        a_block_desc_m0_m1_m2_k,
@@ -564,10 +563,9 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                            a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                    a_thread_copy_.Run(
@@ -607,10 +605,9 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                    xdlops_gemm.template Run(
+                    xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                        a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
-                        b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -352,10 +352,9 @@ struct BlockwiseGemmWMMA
                            constexpr index_t c_offset =
                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                            wmma_gemm.template Run(
+                            wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
-                                a_thread_vec.template AsType<wmma_input_type_a>(),
+                                          b_thread_vec.template AsType<wmma_input_type_b>(),
-                                b_thread_vec.template AsType<wmma_input_type_b>(),
+                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                        });
                    });
                });
@@ -411,10 +410,9 @@ struct BlockwiseGemmWMMA
                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                        wmma_gemm.template Run(
+                        wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
-                            a_thread_vec.template AsType<wmma_input_type_a>(),
+                                      b_thread_vec.template AsType<wmma_input_type_b>(),
-                            b_thread_vec.template AsType<wmma_input_type_b>(),
+                                      c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                    });
                });
            });

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -340,10 +340,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                    xdlops_gemm.template Run(
+                    xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type_a>(),
-                        a_thread_vec.template AsType<mfma_input_type_a>(),
+                                    b_thread_vec.template AsType<mfma_input_type_b>(),
-                        b_thread_vec.template AsType<mfma_input_type_b>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });
        });
@@ -530,10 +529,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        // TODO: insert setprio in more precise manner since we
                        // could have more than >1 MFMA instructions in single call
-                        xdlops_gemm.template Run(
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type_a>(),
-                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                                        b_thread_vec.template AsType<mfma_input_type_b>(),
-                            b_thread_vec.template AsType<mfma_input_type_b>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                        if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
                        {
                            __builtin_amdgcn_sched_barrier(0);
@@ -963,10 +961,9 @@ struct BlockwiseGemmXdlops_v2
                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                    xdlops_gemm.template Run(
+                    xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                        a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
-                        b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });
        });

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -281,10 +281,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1
                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-                    xdlops_gemm.template Run(
+                    xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
-                        a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
-                        b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });
        });

--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iostream>
@@ -207,7 +207,7 @@ TEST_F(TestGroupedConvndFwdMultiAInterface, MultiA)
    std::array<const void*, NumAs> as{nullptr, nullptr};
    const void* b = nullptr;
-    EXPECT_TRUE(this->template Run(as, b));
+    EXPECT_TRUE(this->Run(as, b));
 }
 TEST_F(TestGroupedConvndFwdMultiBInterface, MultiB)
@@ -215,7 +215,7 @@ TEST_F(TestGroupedConvndFwdMultiBInterface, MultiB)
    const void* a = nullptr;
    std::array<const void*, NumBs> bs{nullptr, nullptr};
-    EXPECT_TRUE(this->template Run(a, bs));
+    EXPECT_TRUE(this->Run(a, bs));
 }
 TEST_F(TestGroupedConvndFwdMultiABInterface, MultiAB)
@@ -223,7 +223,7 @@ TEST_F(TestGroupedConvndFwdMultiABInterface, MultiAB)
    std::array<const void*, NumAs> as{nullptr, nullptr};
    std::array<const void*, NumBs> bs{nullptr, nullptr};
-    EXPECT_TRUE(this->template Run(as, bs));
+    EXPECT_TRUE(this->Run(as, bs));
 }
 TEST_F(TestGroupedConvndFwdInterface, SingleAB)
@@ -231,5 +231,5 @@ TEST_F(TestGroupedConvndFwdInterface, SingleAB)
    const void* a = nullptr;
    const void* b = nullptr;
-    EXPECT_TRUE(this->template Run(a, b));
+    EXPECT_TRUE(this->Run(a, b));
 }