Commit 1b5af83d authored by illsilin

Merge branch 'develop' into lwpck-976

parents aac26d32 1fd27d52
@@ -192,6 +192,8 @@ inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b,
 #else
     c = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b), c, false);
 #endif
+#elif defined(CK_USE_AMD_V_DOT4_I32_I8_GFX11)
+    c = __builtin_amdgcn_sudot4(true, bit_cast<int32_t>(a), true, bit_cast<int32_t>(b), c, false);
 #else
     const vector_type<int8_t, 4> a_vector{a};
     const vector_type<int8_t, 4> b_vector{b};
......
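For reference, both the existing sdot4 path and the new GFX11 sudot4 path (with both sign flags set to true) accumulate a 4-way signed int8 dot product into a 32-bit accumulator. A minimal scalar sketch of that semantics (hypothetical ref_dot4, not CK code):

#include <cstdint>

// Scalar reference for the 4-way int8 dot product computed by the intrinsics above.
int32_t ref_dot4(const int8_t a[4], const int8_t b[4], int32_t c)
{
    for(int i = 0; i < 4; ++i)
        c += int32_t(a[i]) * int32_t(b[i]); // widen to 32-bit before multiply-accumulate
    return c;
}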
@@ -150,28 +150,6 @@ __host__ __device__ constexpr T clamp(const T& x, const T& lowerbound, const T&
     return min(max(x, lowerbound), upperbound);
 }

-// disallow implicit type casting
-template <typename T>
-__device__ T exp(T x);
-
-// TODO: add f16 support using v_exp_f16
-template <>
-__device__ float exp<float>(float x)
-{
-    return __expf(x);
-}
-
-template <>
-__device__ double exp<double>(double x)
-{
-    return exp(x);
-}
-
-static inline __host__ float exp(float x) { return std::expf(x); }
-
-static inline __host__ double exp(double x) { return std::exp(x); }
-
 // greatest common divisor, aka highest common factor
 __host__ __device__ constexpr index_t gcd(index_t x, index_t y)
 {
......
@@ -9,6 +9,7 @@
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type.hpp"
+#include "ck/utility/type_convert.hpp"

 namespace ck {
 namespace math {
@@ -92,14 +93,96 @@ static inline __host__ float sqrt(float x) { return std::sqrt(x); };
 static inline __host__ double sqrt(double x) { return std::sqrt(x); };

-static inline __host__ half_t tanh(half_t x)
+template <typename T>
+inline __host__ T tanh(T x)
 {
-    return static_cast<half_t>(std::tanh(static_cast<float>(x)));
+    return ck::type_convert<T>(std::tanhf(ck::type_convert<float>(x)));
 };

-static inline __host__ float tanh(float x) { return std::tanh(x); };
+template <>
+inline __host__ float tanh<float>(float x)
+{
+    return std::tanhf(x);
+};
+
+template <>
+inline __host__ double tanh<double>(double x)
+{
+    return std::tanh(x);
+};
+
+template <typename T>
+inline __host__ T exp(T x)
+{
+    return ck::type_convert<T>(std::expf(ck::type_convert<float>(x)));
+}
+
+template <>
+inline __host__ float exp<float>(float x)
+{
+    return std::expf(x);
+}

-static inline __host__ double tanh(double x) { return std::tanh(x); };
+template <>
+inline __host__ double exp<double>(double x)
+{
+    return std::exp(x);
+}
+
+template <typename T>
+inline __host__ T log(T x)
+{
+    return ck::type_convert<T>(std::logf(ck::type_convert<float>(x)));
+}
+
+template <>
+inline __host__ float log<float>(float x)
+{
+    return std::logf(x);
+}
+
+template <>
+inline __host__ double log<double>(double x)
+{
+    return std::log(x);
+}
+
+template <typename T>
+inline __host__ T pow(T x, T gamma)
+{
+    return ck::type_convert<T>(
+        std::powf(ck::type_convert<float>(x), ck::type_convert<float>(gamma)));
+}
+
+template <>
+inline __host__ float pow<float>(float x, float gamma)
+{
+    return std::powf(x, gamma);
+}
+
+template <>
+inline __host__ double pow<double>(double x, double gamma)
+{
+    return std::pow(x, gamma);
+}
+
+template <typename T>
+inline __host__ T expm1(T x)
+{
+    return ck::type_convert<T>(std::expm1f(ck::type_convert<float>(x)));
+}
+
+template <>
+inline __host__ float expm1<float>(float x)
+{
+    return std::expm1f(x);
+}
+
+template <>
+inline __host__ double expm1<double>(double x)
+{
+    return std::expm1(x);
+}

 // math functions for the HIP kernel, some are implemented by calling hip builtin functions
@@ -181,14 +264,107 @@ static inline __device__ float sqrt(float x) { return __builtin_amdgcn_sqrtf(x);
 static inline __device__ double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };

-static inline __device__ half_t tanh(half_t x)
+template <typename T>
+inline __device__ T tanh(T x)
+{
+    return ck::type_convert<T>(::tanhf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float tanh<float>(float x)
 {
-    return static_cast<half_t>(::tanhf(static_cast<float>(x)));
+    return ::tanhf(x);
 };

-static inline __device__ float tanh(float x) { return ::tanhf(x); };
+template <>
+inline __device__ double tanh<double>(double x)
+{
+    return ::tanh(x);
+};
+
+template <typename T>
+inline __device__ T exp(T x)
+{
+    return ck::type_convert<T>(__expf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ half_t exp<half_t>(half_t x)
+{
+    return hexp(x);
+};
+
+template <>
+inline __device__ float exp<float>(float x)
+{
+    return __expf(x);
+};

-static inline __device__ double tanh(double x) { return ::tanh(x); };
+template <>
+inline __device__ double exp<double>(double x)
+{
+    return exp(x);
+};
+
+template <typename T>
+inline __device__ T log(T x)
+{
+    return ck::type_convert<T>(__logf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ half_t log<half_t>(half_t x)
+{
+    return hlog(x);
+};
+
+template <>
+inline __device__ float log<float>(float x)
+{
+    return __logf(x);
+};
+
+template <>
+inline __device__ double log<double>(double x)
+{
+    return log(x);
+};
+
+template <typename T>
+inline __device__ T pow(T x, T gamma)
+{
+    return ck::type_convert<T>(powf(ck::type_convert<float>(x), ck::type_convert<float>(gamma)));
+};
+
+template <>
+inline __device__ float pow<float>(float x, float gamma)
+{
+    return powf(x, gamma);
+};
+
+template <>
+inline __device__ double pow<double>(double x, double gamma)
+{
+    return pow(x, gamma);
+};
+
+template <typename T>
+inline __device__ T expm1(T x)
+{
+    return ck::type_convert<T>(expm1f(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float expm1<float>(float x)
+{
+    return expm1f(x);
+};
+
+template <>
+inline __device__ double expm1<double>(double x)
+{
+    return expm1(x);
+};

 } // namespace math
 } // namespace ck
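A short usage sketch of the new templated wrappers (assumes CK headers; illustrative only, not part of the commit): the generic overload routes narrow types such as bhalf_t through float via ck::type_convert, while the float/double (and, on device, half_t) specializations call the native functions directly.

#include "ck/utility/math_v2.hpp"

void host_math_example()
{
    ck::bhalf_t x = ck::type_convert<ck::bhalf_t>(0.5f);
    ck::bhalf_t y = ck::math::exp(x);    // generic path: bf16 -> float -> expf -> bf16
    float z       = ck::math::exp(2.0f); // exp<float> specialization: std::expf directly
    (void)y;
    (void)z;
}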
@@ -5,6 +5,7 @@
 #define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP

 #include "common_header.hpp"
+#include "ck/utility/math_v2.hpp"

 namespace ck {
......
@@ -95,7 +95,6 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
     return type_convert<bhalf_t>(x_fp32);
 }

-#if defined CK_ENABLE_FP8
 // convert fp32 to fp8
 template <>
 inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
@@ -169,9 +168,7 @@ inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
     return utils::cast_from_f8<f8_t, half_t, negative_zero_nan>(x);
 #endif
 }
-#endif

-#if defined CK_ENABLE_BF8
 // convert fp32 to bf8
 template <>
 inline __host__ __device__ bf8_t type_convert<bf8_t, float>(float x)
@@ -245,7 +242,6 @@ inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
     return utils::cast_from_f8<bf8_t, half_t, negative_zero_nan>(x);
 #endif
 }
-#endif

 // Declare a template function for bf16 conversion using RTN
 template <typename Y, typename X>
@@ -308,7 +304,6 @@ inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(h
 template <typename Y, typename X>
 __host__ __device__ constexpr Y f8_convert_sr(X x);

-#if defined CK_ENABLE_FP8
 // convert fp32 to fp8 with stochastic rounding
 template <>
 inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
@@ -344,7 +339,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     // convert to float and use native converion
     return f8_convert_sr<f8_t>(type_convert<float>(x));
-#elif 0
+#else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
@@ -353,13 +348,9 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
     return utils::
         cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
             x, rng);
-#else
-    return type_convert<f8_t>(type_convert<float>(x));
 #endif
 }
-#endif

-#if defined CK_ENABLE_BF8
 // convert fp32 to bf8 with stochastic rounding
 template <>
 inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
@@ -395,7 +386,7 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     // convert to float and use native converion
     return f8_convert_sr<f8_t>(type_convert<float>(x));
-#elif 0
+#else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
@@ -405,10 +396,7 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
     return utils::
         cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
             x, rng);
-#else
-    return type_convert<bf8_t>(type_convert<float>(x));
 #endif
 }
-#endif

 } // namespace ck
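The #elif 0 -> #else change above re-enables the software cast_to_f8 stochastic-rounding path on targets without native fp8 conversion, instead of silently falling back to round-to-nearest via float. A conceptual, host-only sketch of stochastic rounding (assumed semantics of the cast_to_f8<..., stochastic> path; simplified, not CK code):

#include <cmath>
#include <random>

// Round x to a multiple of ulp, rounding up with probability proportional to the
// fractional remainder, so rounding is unbiased in expectation: E[round(x)] == x.
float round_stochastic(float x, float ulp, std::mt19937& rng)
{
    float lo   = std::floor(x / ulp) * ulp;
    float frac = (x - lo) / ulp; // in [0, 1)
    std::uniform_real_distribution<float> u(0.0f, 1.0f);
    return (u(rng) < frac) ? lo + ulp : lo;
}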
@@ -128,11 +128,9 @@ struct ReferenceConvFwd : public device::BaseOperator
                 }
             }

-            float v_out;
-
-            arg.out_element_op_(v_out, v_acc);
-            arg.output_(g, n, k, wo) = ck::type_convert<OutDataType>(v_out);
+            OutDataType v_out;
+            arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+            arg.output_(g, n, k, wo) = v_out;
         };

         make_ParallelTensorFunctor(func,
@@ -184,11 +182,9 @@ struct ReferenceConvFwd : public device::BaseOperator
                 }
             }

-            float v_out;
-
-            arg.out_element_op_(v_out, v_acc);
-            arg.output_(g, n, k, ho, wo) = ck::type_convert<OutDataType>(v_out);
+            OutDataType v_out;
+            arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+            arg.output_(g, n, k, ho, wo) = v_out;
         };

         make_ParallelTensorFunctor(func,
@@ -253,11 +249,9 @@ struct ReferenceConvFwd : public device::BaseOperator
                 }
             }

-            float v_out;
-
-            arg.out_element_op_(v_out, v_acc);
-            arg.output_(g, n, k, d_o, ho, wo) = ck::type_convert<OutDataType>(v_out);
+            OutDataType v_out;
+            arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+            arg.output_(g, n, k, d_o, ho, wo) = v_out;
         };

         make_ParallelTensorFunctor(func,
......
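The change above narrows the accumulator to OutDataType before the output elementwise op runs, so the reference op sees the same precision a device kernel would write. Order-of-operations sketch (plain C++ with static_cast; the reference uses ck::type_convert):

// Before: op computed on float, result narrowed afterwards.
// After: accumulator narrowed first, op computed on OutDataType.
template <typename OutDataType, typename OutElementOp>
OutDataType finalize(float v_acc, OutElementOp op)
{
    OutDataType v_out;
    op(v_out, static_cast<OutDataType>(v_acc)); // op input is OutDataType, not float
    return v_out;
}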
@@ -20,8 +20,9 @@ template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
           typename YDataType,
-          typename AccDataType,
-          typename AccElementwiseOperation>
+          typename SaveMeanInvStdDataType,
+          typename ComputeDataType,
+          typename YElementwiseOperation>
 struct ReferenceGroupnorm : public device::BaseOperator
 {
     // x = [N, H, W, G, C]
@@ -35,14 +36,18 @@ struct ReferenceGroupnorm : public device::BaseOperator
                  const Tensor<GammaDataType>& gamma,
                  const Tensor<BetaDataType>& beta,
                  Tensor<YDataType>& y,
-                 AccElementwiseOperation acc_elementwise_op,
+                 Tensor<SaveMeanInvStdDataType>& save_mean,
+                 Tensor<SaveMeanInvStdDataType>& save_inv_std,
+                 YElementwiseOperation y_elementwise_op,
                  const std::vector<index_t> lengths,
-                 AccDataType epsilon)
+                 ComputeDataType epsilon)
             : x_(x),
               gamma_(gamma),
               beta_(beta),
               y_(y),
-              acc_elementwise_op_(acc_elementwise_op),
+              save_mean_(save_mean),
+              save_inv_std_(save_inv_std),
+              y_elementwise_op_(y_elementwise_op),
               lengths_(lengths),
               epsilon_(epsilon)
         {
@@ -52,9 +57,11 @@ struct ReferenceGroupnorm : public device::BaseOperator
     const Tensor<XDataType> gamma_;
     const Tensor<XDataType> beta_;
     Tensor<YDataType>& y_;
-    AccElementwiseOperation acc_elementwise_op_;
+    Tensor<SaveMeanInvStdDataType>& save_mean_;
+    Tensor<SaveMeanInvStdDataType>& save_inv_std_;
+    YElementwiseOperation y_elementwise_op_;
     std::vector<index_t> lengths_;
-    AccDataType epsilon_;
+    ComputeDataType epsilon_;
     };

     // Invoker
@@ -68,8 +75,8 @@ struct ReferenceGroupnorm : public device::BaseOperator
         int G = arg.lengths_[3];
         int C = arg.lengths_[4];

-        Tensor<AccDataType> mean({N, G});
-        Tensor<AccDataType> var({N, G});
+        Tensor<ComputeDataType> mean({N, G});
+        Tensor<ComputeDataType> var({N, G});

         // Compute mean & var in [H, W, C] by Welford Algorithm
         // TODO - parallel for each HWC
@@ -78,9 +85,9 @@ struct ReferenceGroupnorm : public device::BaseOperator
         {
             for(int g = 0; g < G; ++g)
             {
-                AccDataType mean_val = type_convert<AccDataType>(0.0f);
-                AccDataType var_val  = type_convert<AccDataType>(0.0f);
+                ComputeDataType mean_val = type_convert<ComputeDataType>(0.0f);
+                ComputeDataType var_val  = type_convert<ComputeDataType>(0.0f);
                 int32_t curr_count = 0;

                 for(int h = 0; h < H; ++h)
                 {
@@ -89,10 +96,11 @@ struct ReferenceGroupnorm : public device::BaseOperator
                         for(int c = 0; c < C; ++c)
                         {
                             curr_count++;
-                            AccDataType x     = type_convert<AccDataType>(arg.x_(n, h, w, g, c));
-                            AccDataType delta = x - mean_val;
+                            ComputeDataType x =
+                                type_convert<ComputeDataType>(arg.x_(n, h, w, g, c));
+                            ComputeDataType delta = x - mean_val;
                             mean_val += delta / curr_count;
-                            AccDataType delta2 = x - mean_val;
+                            ComputeDataType delta2 = x - mean_val;
                             var_val += delta * delta2;
                         }
                     }
@@ -100,6 +108,12 @@ struct ReferenceGroupnorm : public device::BaseOperator

                 mean(n, g) = mean_val;
                 var(n, g)  = var_val / curr_count;
+
+                arg.save_mean_(n, g) = ck::type_convert<SaveMeanInvStdDataType>(mean(n, g));
+
+                ComputeDataType divisor =
+                    static_cast<ComputeDataType>(1) / ck::math::sqrt(var(n, g) + arg.epsilon_);
+                arg.save_inv_std_(n, g) = ck::type_convert<SaveMeanInvStdDataType>(divisor);
             }
         }
@@ -114,15 +128,19 @@ struct ReferenceGroupnorm : public device::BaseOperator
                     {
                         for(int c = 0; c < C; ++c)
                         {
-                            AccDataType x        = type_convert<AccDataType>(arg.x_(n, h, w, g, c));
-                            AccDataType gamma    = type_convert<AccDataType>(arg.gamma_(g, c));
-                            AccDataType beta     = type_convert<AccDataType>(arg.beta_(g, c));
-                            AccDataType mean_val = type_convert<AccDataType>(mean(n, g));
-                            AccDataType var_val  = type_convert<AccDataType>(var(n, g));
-                            AccDataType y        = gamma * (x - mean_val) /
-                                                       ck::math::sqrt(arg.epsilon_ + var_val) +
-                                                   beta;
-                            arg.acc_elementwise_op_(y, y);
+                            ComputeDataType x =
+                                type_convert<ComputeDataType>(arg.x_(n, h, w, g, c));
+                            ComputeDataType gamma =
+                                type_convert<ComputeDataType>(arg.gamma_(g, c));
+                            ComputeDataType beta =
+                                type_convert<ComputeDataType>(arg.beta_(g, c));
+                            ComputeDataType mean_val =
+                                type_convert<ComputeDataType>(mean(n, g));
+                            ComputeDataType var_val = type_convert<ComputeDataType>(var(n, g));
+                            ComputeDataType y = gamma * (x - mean_val) /
+                                                    ck::math::sqrt(arg.epsilon_ + var_val) +
+                                                beta;
+                            arg.y_elementwise_op_(y, y);
                             arg.y_(n, h, w, g, c) = type_convert<YDataType>(y);
                         }
                     }
@@ -159,11 +177,14 @@ struct ReferenceGroupnorm : public device::BaseOperator
                              const Tensor<GammaDataType>& gamma,
                              const Tensor<BetaDataType>& beta,
                              Tensor<YDataType>& y,
-                             AccElementwiseOperation acc_elementwise_op,
+                             Tensor<SaveMeanInvStdDataType>& save_mean,
+                             Tensor<SaveMeanInvStdDataType>& save_inv_std,
+                             YElementwiseOperation y_elementwise_op,
                              const std::vector<index_t> lengths,
-                             AccDataType epsilon)
+                             ComputeDataType epsilon)
     {
-        return Argument{x, gamma, beta, y, acc_elementwise_op, lengths, epsilon};
+        return Argument{
+            x, gamma, beta, y, save_mean, save_inv_std, y_elementwise_op, lengths, epsilon};
     }

     static auto MakeInvoker() { return Invoker{}; }
......
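As the comment in the hunk notes, the groupnorm reference computes mean and variance with Welford's one-pass algorithm, whose results the new save_mean/save_inv_std outputs capture. A standalone sketch of the update rule (illustrative, not CK code):

#include <cstddef>
#include <utility>
#include <vector>

// One-pass Welford mean/variance: numerically stabler than accumulating raw sums of squares.
std::pair<double, double> welford_mean_var(const std::vector<double>& xs)
{
    double mean = 0.0, m2 = 0.0;
    std::size_t count = 0;
    for(double x : xs)
    {
        ++count;
        double delta = x - mean;
        mean += delta / count;  // running mean
        double delta2 = x - mean;
        m2 += delta * delta2;   // accumulated squared deviations
    }
    return {mean, count ? m2 / count : 0.0}; // population variance, as in the reference
}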
@@ -20,8 +20,9 @@ template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
           typename YDataType,
-          typename AccDataType,
-          typename AccElementwiseOperation,
+          typename SaveMeanInvStdDataType,
+          typename ComputeDataType,
+          typename YElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
 struct ReferenceLayernorm : public device::BaseOperator
@@ -36,15 +37,19 @@ struct ReferenceLayernorm : public device::BaseOperator
                  const Tensor<GammaDataType>& gamma_n,
                  const Tensor<BetaDataType>& beta_n,
                  Tensor<YDataType>& y_m_n,
-                 AccElementwiseOperation acc_elementwise_op,
+                 Tensor<SaveMeanInvStdDataType>& save_mean_m,
+                 Tensor<SaveMeanInvStdDataType>& save_inv_std_m,
+                 YElementwiseOperation y_elementwise_op,
                  const std::vector<index_t> lengths,
                  const std::vector<index_t> reduceDims,
-                 AccDataType epsilon)
+                 ComputeDataType epsilon)
             : x_m_n_(x_m_n),
              gamma_n_(gamma_n),
              beta_n_(beta_n),
              y_m_n_(y_m_n),
-             acc_elementwise_op_(acc_elementwise_op),
+             save_mean_m_(save_mean_m),
+             save_inv_std_m_(save_inv_std_m),
+             y_elementwise_op_(y_elementwise_op),
              lengths_(lengths),
              reduceDims_(reduceDims),
              epsilon_(epsilon)
@@ -55,10 +60,12 @@ struct ReferenceLayernorm : public device::BaseOperator
     const Tensor<XDataType> gamma_n_;
     const Tensor<XDataType> beta_n_;
     Tensor<YDataType>& y_m_n_;
-    AccElementwiseOperation acc_elementwise_op_;
+    Tensor<SaveMeanInvStdDataType>& save_mean_m_;
+    Tensor<SaveMeanInvStdDataType>& save_inv_std_m_;
+    YElementwiseOperation y_elementwise_op_;
     std::vector<index_t> lengths_;
     std::vector<index_t> reduceDims_;
-    AccDataType epsilon_;
+    ComputeDataType epsilon_;
     };

     // Invoker
@@ -69,8 +76,8 @@ struct ReferenceLayernorm : public device::BaseOperator
         int M = arg.lengths_[0];
         int N = arg.lengths_[1];

-        Tensor<AccDataType> mean({M});
-        Tensor<AccDataType> var({M});
+        Tensor<ComputeDataType> mean({M});
+        Tensor<ComputeDataType> var({M});

         for(int m = 0; m < M; ++m)
         {
@@ -79,7 +86,7 @@ struct ReferenceLayernorm : public device::BaseOperator
             for(int n = 0; n < N; ++n)
             {
-                auto x_val = ck::type_convert<AccDataType>(arg.x_m_n_(m, n));
+                auto x_val = ck::type_convert<ComputeDataType>(arg.x_m_n_(m, n));
                 mean(m) += x_val;
                 var(m) += x_val * x_val;
             }
@@ -90,17 +97,21 @@ struct ReferenceLayernorm : public device::BaseOperator
         for(int m = 0; m < M; ++m)
         {
-            AccDataType divisor =
-                static_cast<AccDataType>(1) / ck::math::sqrt(var(m) + arg.epsilon_);
+            ComputeDataType divisor =
+                static_cast<ComputeDataType>(1) / ck::math::sqrt(var(m) + arg.epsilon_);

             for(int n = 0; n < N; ++n)
             {
-                auto x_val = ck::type_convert<AccDataType>(arg.x_m_n_(m, n));
-                auto y_val = (x_val - mean(m)) * divisor;
-                y_val      = (y_val * arg.gamma_n_(n)) + arg.beta_n_(n);
-                arg.acc_elementwise_op_(y_val, y_val);
+                auto x_val     = ck::type_convert<ComputeDataType>(arg.x_m_n_(m, n));
+                auto gamma_val = ck::type_convert<ComputeDataType>(arg.gamma_n_(n));
+                auto beta_val  = ck::type_convert<ComputeDataType>(arg.beta_n_(n));
+                auto y_val     = (x_val - mean(m)) * divisor;
+                y_val          = (y_val * gamma_val) + beta_val;
+                arg.y_elementwise_op_(y_val, y_val);
                 arg.y_m_n_(m, n) = ck::type_convert<YDataType>(y_val);
             }
+
+            arg.save_mean_m_(m)    = ck::type_convert<SaveMeanInvStdDataType>(mean(m));
+            arg.save_inv_std_m_(m) = ck::type_convert<SaveMeanInvStdDataType>(divisor);
         }

         return 0;
@@ -140,13 +151,23 @@ struct ReferenceLayernorm : public device::BaseOperator
                              const Tensor<GammaDataType>& gamma_n,
                              const Tensor<BetaDataType>& beta_n,
                              Tensor<YDataType>& y_m_n,
-                             AccElementwiseOperation acc_elementwise_op,
+                             Tensor<SaveMeanInvStdDataType>& save_mean_m,
+                             Tensor<SaveMeanInvStdDataType>& save_inv_std_m,
+                             YElementwiseOperation y_elementwise_op,
                              const std::vector<index_t> lengths,
                              const std::vector<index_t> reduceDims,
-                             AccDataType epsilon)
+                             ComputeDataType epsilon)
     {
-        return Argument{
-            x_m_n, gamma_n, beta_n, y_m_n, acc_elementwise_op, lengths, reduceDims, epsilon};
+        return Argument{x_m_n,
+                        gamma_n,
+                        beta_n,
+                        y_m_n,
+                        save_mean_m,
+                        save_inv_std_m,
+                        y_elementwise_op,
+                        lengths,
+                        reduceDims,
+                        epsilon};
     }

     static auto MakeInvoker() { return Invoker{}; }
......
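Unlike the groupnorm reference, the layernorm reference accumulates sums of x and x*x in one pass (the moment-based form) and then saves the per-row mean and inv_std = 1/sqrt(var + eps). A standalone sketch of those statistics (illustrative, not CK code):

#include <cmath>
#include <vector>

void layernorm_stats(const std::vector<float>& row, float eps, float& mean, float& inv_std)
{
    float sum = 0.f, sum_sq = 0.f;
    for(float x : row)
    {
        sum += x;
        sum_sq += x * x;
    }
    const float n   = static_cast<float>(row.size());
    mean            = sum / n;
    const float var = sum_sq / n - mean * mean;   // E[x^2] - E[x]^2
    inv_std         = 1.f / std::sqrt(var + eps); // stored in save_inv_std
}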
@@ -20,12 +20,8 @@ using F16 = ck::half_t;
 using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using I32  = int32_t;
-#if defined CK_ENABLE_FP8
-using F8 = ck::f8_t;
-#endif
-#if defined CK_ENABLE_BF8
-using BF8 = ck::bf8_t;
-#endif
+using F8  = ck::f8_t;
+using BF8 = ck::bf8_t;

 using Empty_Tuple = ck::Tuple<>;
......
@@ -240,11 +240,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
     if constexpr(NumDimSpatial == 1 && is_same_v<InLayout, NWC> && is_same_v<WeiLayout, KXC> &&
                  is_same_v<OutLayout, NWK>)
     {
+#ifdef CK_ENABLE_FP32
         if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                      is_same_v<OutDataType, float>)
         {
             add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(op_ptrs);
         }
+#endif
 #ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                      is_same_v<OutDataType, half_t>)
@@ -267,17 +269,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
         }
 #endif
     }
-    else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
-                      is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
+    if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
+                 is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
     {
+#ifdef CK_ENABLE_FP32
         if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                      is_same_v<OutDataType, float>)
         {
             add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
-#ifdef DL_KERNELS
-            add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
+        }
 #endif
+#if defined(DL_KERNELS) && defined(CK_ENABLE_FP32)
+        if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                     is_same_v<OutDataType, float>)
+        {
+            add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
         }
+#endif
 #ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                      is_same_v<OutDataType, half_t>)
@@ -306,14 +314,16 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
         }
 #endif
     }
-    else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWC> &&
-                      is_same_v<WeiLayout, KZYXC> && is_same_v<OutLayout, NDHWK>)
+    if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWC> &&
+                 is_same_v<WeiLayout, KZYXC> && is_same_v<OutLayout, NDHWK>)
     {
+#ifdef CK_ENABLE_FP32
         if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                      is_same_v<OutDataType, float>)
         {
             add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(op_ptrs);
         }
+#endif
 #ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                      is_same_v<OutDataType, half_t>)
......
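The recurring change from "else if constexpr" to independent "if constexpr" blocks in these factories keeps each branch self-contained under its own preprocessor guard: with the old chaining, compiling out an earlier branch (e.g. building without CK_ENABLE_FP32) left a dangling "else". A minimal sketch of the pattern with hypothetical macros (not CK code):

#include <type_traits>

template <typename T>
void add_instances_sketch()
{
#ifdef ENABLE_A
    if constexpr(std::is_same_v<T, float>) { /* add fp32 instances */ }
#endif
#ifdef ENABLE_B
    // An independent "if" compiles whether or not ENABLE_A is defined; the
    // branches stay mutually exclusive because T matches at most one type.
    if constexpr(std::is_same_v<T, double>) { /* add fp64 instances */ }
#endif
}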
@@ -98,30 +98,31 @@ struct DeviceOperationInstanceFactory<
     if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
                  is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
     {
+#ifdef CK_ENABLE_FP32
         if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                      is_same_v<OutDataType, float>)
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
         }
+#endif
 #ifdef CK_ENABLE_FP16
-        else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                          is_same_v<OutDataType, half_t>)
+        if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                     is_same_v<OutDataType, half_t>)
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
             add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
         }
 #endif
 #ifdef CK_ENABLE_BF16
-        else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                          is_same_v<WeiDataType, ck::bhalf_t> &&
-                          is_same_v<OutDataType, ck::bhalf_t>)
+        if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                     is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
         }
 #endif
 #ifdef CK_ENABLE_INT8
-        else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                          is_same_v<OutDataType, int8_t>)
+        if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                     is_same_v<OutDataType, int8_t>)
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
         }
......
@@ -155,7 +155,7 @@ struct DeviceOperationInstanceFactory<
     std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

 #ifdef CK_ENABLE_FP32
     if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
-                 is_same_v<CDataType, float>)
+                 is_same_v<CDataType, float> && is_same_v<ComputeType, float>)
     {
         if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                      is_same_v<CLayout, Row>)
@@ -180,8 +180,8 @@ struct DeviceOperationInstanceFactory<
     }
 #endif
 #ifdef CK_ENABLE_FP16
-    else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
-                      is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
+    if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                 is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
     {
         if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                      is_same_v<CLayout, Row>)
@@ -206,8 +206,8 @@ struct DeviceOperationInstanceFactory<
     }
 #endif
 #if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8))
-    else if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
-                      is_same_v<CDataType, half_t>)
+    if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
+                 is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
     {
         if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                      is_same_v<CLayout, Row>)
@@ -230,8 +230,8 @@ struct DeviceOperationInstanceFactory<
             add_device_gemm_xdl_splitk_f8_f16_f16_km_nk_mn_instances(op_ptrs);
         }
     }
-    else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
-                      is_same_v<CDataType, half_t>)
+    if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+                 is_same_v<CDataType, half_t> && is_same_v<ComputeType, half_t>)
     {
         if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                      is_same_v<CLayout, Row>)
......
@@ -6,8 +6,6 @@
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
 namespace ck {
 namespace tensor_operation {
 namespace device {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using namespace ck::tensor_layout::convolution;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvBwdWeightDefault =
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
template <index_t NDSpatial,
typename ALayout,
typename BLayout,
typename CLayout,
ConvolutionBackwardWeightSpecialization ConvSpec>
using device_grouped_conv_bwd_weight_wmma_f16_instances =
std::tuple<
// clang-format off
//#####################################| NumDim| A| B| C| AData| BData| CData| AccData| A| B| C| ConvForward| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#####################################| Spatial| Layout| Layout| Layout| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeatPerWave| NRepeatPerWave| _MBlock_MPerBlock| ScalarPerVector|
//#####################################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 64, 64, 4, 8, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>,
// blocksize=256
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 128, 256, 8, 8, 16, 16, 2, 8, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 256, 128, 8, 8, 16, 16, 8, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 2>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 256, 64, 8, 8, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 256, 8, 8, 16, 16, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 16>, 4>,
// blocksize=128
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 64, 128, 8, 8, 16, 16, 2, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 128, 8, 8, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 32, 256, 8, 8, 16, 16, 1, 8, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 256, 32, 8, 8, 16, 16, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
// blocksize=64
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 8, 8, 16, 16, 4, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 8, 8, 16, 16, 1, 4, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 64, 8, 8, 16, 16, 2, 4, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 8, 8, 16, 16, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 8, 8, 16, 16, 1, 8, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>,
// blocksize=32
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 16, 32, 8, 8, 16, 16, 1, 2, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 16, 64, 8, 8, 16, 16, 1, 4, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 32, 64, 8, 8, 16, 16, 2, 4, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 32, 32, 8, 8, 16, 16, 2, 2, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 64, 32, 8, 8, 16, 16, 4, 2, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 64, 16, 8, 8, 16, 16, 4, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 32, 16, 8, 8, 16, 16, 2, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>
// clang-format on
>;
template <index_t NDSpatial,
typename ALayout,
typename BLayout,
typename CLayout,
ConvolutionBackwardWeightSpecialization ConvSpec>
using device_grouped_conv_bwd_weight_wmma_i8_instances =
std::tuple<
// clang-format off
//#####################################| NumDim| A| B| C| AData| BData| CData| AccData| A| B| C| ConvForward| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#####################################| Spatial| Layout| Layout| Layout| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeatPerWave| NRepeatPerWave| _MBlock_MPerBlock| ScalarPerVector|
//#####################################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 64, 64, 4, 8, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>,
// blocksize=256
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 256, 8, 8, 16, 16, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 256, 64, 8, 8, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>,
// blocksize=128
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 256, 8, 8, 16, 16, 4, 8, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 64, 256, 8, 8, 16, 16, 2, 8, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 32, 256, 8, 8, 16, 16, 1, 8, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 64, 128, 8, 8, 16, 16, 2, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 256, 32, 8, 8, 16, 16, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 256, 64, 8, 8, 16, 16, 8, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 2>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 256, 128, 8, 8, 16, 16, 8, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
// blocksize=64
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 8, 8, 16, 16, 1, 8, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 128, 8, 8, 16, 16, 2, 8, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 64, 8, 8, 16, 16, 8, 2, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 8, 8, 16, 16, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
// blocksize=32
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 16, 64, 8, 8, 16, 16, 1, 4, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 64, 64, 8, 8, 16, 16, 4, 4, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 32, 32, 8, 8, 16, 16, 2, 2, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>,
DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8, I8, I8, I32, PassThrough, PassThrough, PassThrough, ConvSpec, 32, 64, 16, 8, 8, 16, 16, 4, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 1, S<8, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
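These instance tuples are typically consumed by a registration function that appends one operation object per tuple element to the factory's instance vector. A hedged sketch of such a definition, matching the declarations further below (assumes CK's add_device_operation_instances helper from add_device_operation_instance.hpp; exact signature may differ):

void add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           GNDHWC,
                                                           GKZYXC,
                                                           GNDHWK,
                                                           F16,
                                                           F16,
                                                           F16,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances)
{
    // Instantiates every tuple element for the Default specialization and
    // pushes each one into the vector used by DeviceOperationInstanceFactory.
    add_device_operation_instances(
        instances,
        device_grouped_conv_bwd_weight_wmma_f16_instances<3,
                                                          GNDHWC,
                                                          GKZYXC,
                                                          GNDHWK,
                                                          ConvBwdWeightDefault>{});
}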
@@ -163,6 +163,30 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
                                                             PassThrough,
                                                             PassThrough,
                                                             PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           GNDHWC,
+                                                           GKZYXC,
+                                                           GNDHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           GNDHWC,
+                                                           GKZYXC,
+                                                           GNDHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(
@@ -177,6 +201,31 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
                                                             PassThrough,
                                                             PassThrough>>>& instances);
 #endif
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           GNDHWC,
+                                                           GKZYXC,
+                                                           GNDHWK,
+                                                           int8_t,
+                                                           int8_t,
+                                                           int8_t,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           GNDHWC,
+                                                           GKZYXC,
+                                                           GNDHWK,
+                                                           int8_t,
+                                                           int8_t,
+                                                           int8_t,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+#endif
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
@@ -202,6 +251,30 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
                                                             PassThrough,
                                                             PassThrough,
                                                             PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
@@ -231,6 +304,31 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_
                                                             BF8,
                                                             F8>>>& instances);
 #endif
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           int8_t,
+                                                           int8_t,
+                                                           int8_t,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           int8_t,
+                                                           int8_t,
+                                                           int8_t,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+#endif
 #ifdef DL_KERNELS
 // dl
...@@ -529,8 +627,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe ...@@ -529,8 +627,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
} }
#endif #endif
#ifdef CK_ENABLE_FP16 #ifdef CK_ENABLE_FP16
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> && if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t>) is_same_v<OutDataType, half_t>)
{ {
#ifdef DL_KERNELS #ifdef DL_KERNELS
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances(op_ptrs); add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances(op_ptrs);
...@@ -539,9 +637,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe ...@@ -539,9 +637,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
} }
#endif #endif
#ifdef CK_ENABLE_BF16 #ifdef CK_ENABLE_BF16
else if constexpr(is_same_v<InDataType, ck::bhalf_t> && if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
is_same_v<WeiDataType, float> && is_same_v<OutDataType, ck::bhalf_t>)
is_same_v<OutDataType, ck::bhalf_t>)
{ {
#ifdef DL_KERNELS #ifdef DL_KERNELS
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances( add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances(
...@@ -552,8 +649,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe ...@@ -552,8 +649,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
} }
#endif #endif
} }
else if constexpr(is_same_v<InLayout, NWGC> && is_same_v<WeiLayout, GKXC> && if constexpr(is_same_v<InLayout, NWGC> && is_same_v<WeiLayout, GKXC> &&
is_same_v<OutLayout, NWGK>) is_same_v<OutLayout, NWGK>)
{ {
#ifdef DL_KERNELS #ifdef DL_KERNELS
#ifdef CK_ENABLE_FP32 #ifdef CK_ENABLE_FP32
...@@ -564,16 +661,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe ...@@ -564,16 +661,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
} }
#endif #endif
#ifdef CK_ENABLE_FP16 #ifdef CK_ENABLE_FP16
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> && if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t>) is_same_v<OutDataType, half_t>)
{ {
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances(op_ptrs); add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances(op_ptrs);
} }
#endif #endif
#ifdef CK_ENABLE_BF16 #ifdef CK_ENABLE_BF16
else if constexpr(is_same_v<InDataType, ck::bhalf_t> && if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
is_same_v<WeiDataType, float> && is_same_v<OutDataType, ck::bhalf_t>)
is_same_v<OutDataType, ck::bhalf_t>)
{ {
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instances( add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instances(
op_ptrs); op_ptrs);
...@@ -582,7 +678,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe ...@@ -582,7 +678,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif #endif
} }
} }
-else if constexpr(NumDimSpatial == 2)
if constexpr(NumDimSpatial == 2)
{
if constexpr(is_same_v<InLayout, GNHWC> && is_same_v<WeiLayout, GKYXC> &&
is_same_v<OutLayout, GNHWK>)
@@ -600,8 +696,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
#endif
#ifdef CK_ENABLE_FP16
-else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instances(
@@ -612,9 +708,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
#endif
#ifdef CK_ENABLE_BF16
-else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-is_same_v<WeiDataType, float> &&
-is_same_v<OutDataType, ck::bhalf_t>)
if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances(
@@ -625,8 +720,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
#endif
}
-else if constexpr(is_same_v<InLayout, NHWGC> && is_same_v<WeiLayout, GKYXC> &&
if constexpr(is_same_v<InLayout, NHWGC> && is_same_v<WeiLayout, GKYXC> &&
is_same_v<OutLayout, NHWGK>)
{
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
@@ -641,8 +736,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
#endif
#ifdef CK_ENABLE_FP16
-else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instances(
@@ -653,9 +748,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
#endif
#ifdef CK_ENABLE_BF16
-else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-is_same_v<WeiDataType, float> &&
-is_same_v<OutDataType, ck::bhalf_t>)
if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
@@ -667,7 +761,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
}
}
-else if constexpr(NumDimSpatial == 3)
if constexpr(NumDimSpatial == 3)
{
if constexpr(is_same_v<InLayout, GNDHWC> && is_same_v<WeiLayout, GKZYXC> &&
is_same_v<OutLayout, GNDHWK>)
@@ -685,8 +779,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
#endif
#ifdef CK_ENABLE_FP16
-else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instances(
@@ -694,12 +788,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(
op_ptrs);
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instances(
op_ptrs);
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances(
op_ptrs);
}
#endif
#ifdef CK_ENABLE_BF16
-else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-is_same_v<WeiDataType, float> &&
-is_same_v<OutDataType, ck::bhalf_t>)
if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
@@ -708,10 +805,20 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
op_ptrs);
}
#endif
#ifdef CK_ENABLE_INT8
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
is_same_v<OutDataType, int8_t>)
{
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instances(
op_ptrs);
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances(
op_ptrs);
}
#endif
}
-else if constexpr(is_same_v<InLayout, NDHWGC> && is_same_v<WeiLayout, GKZYXC> &&
if constexpr(is_same_v<InLayout, NDHWGC> && is_same_v<WeiLayout, GKZYXC> &&
is_same_v<OutLayout, NDHWGK>)
{
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
@@ -726,10 +833,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
#endif
#ifdef CK_ENABLE_FP16
-else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-is_same_v<OutDataType, half_t> &&
-is_same_v<ComputeTypeA, half_t> &&
-is_same_v<ComputeTypeB, half_t>)
if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
is_same_v<ComputeTypeB, half_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
@@ -737,12 +843,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
op_ptrs);
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(
op_ptrs);
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances(
op_ptrs);
}
#endif
#ifdef CK_ENABLE_BF16
-else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-is_same_v<WeiDataType, float> &&
-is_same_v<OutDataType, ck::bhalf_t>)
if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
@@ -752,10 +861,20 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
op_ptrs);
}
#endif
#ifdef CK_ENABLE_INT8
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
is_same_v<OutDataType, int8_t>)
{
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances(
op_ptrs);
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances(
op_ptrs);
}
#endif
#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-is_same_v<OutDataType, half_t> &&
-is_same_v<ComputeTypeA, bf8_t> && is_same_v<ComputeTypeB, f8_t>)
if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, bf8_t> &&
is_same_v<ComputeTypeB, f8_t>)
{
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
op_ptrs);
...
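For reference, a minimal sketch of how these factory branches are consumed, assuming a CK build where the include path and layout namespace below exist; argument construction and IsSupportedArgument() filtering are elided. The int8 instantiation goes through the NDHWGC branch added above.

// Usage sketch (assumptions: include path, layout namespace, default compute types).
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"

int main()
{
    using ck::tensor_operation::device::DeviceGroupedConvBwdWeight;
    using ck::tensor_operation::device::instance::DeviceOperationInstanceFactory;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    namespace conv   = ck::tensor_layout::convolution;

    // 3-D int8 problem in NDHWGC/GKZYXC/NDHWGK layouts; GetInstances() collects
    // every instance list registered for this signature, now including the WMMA ones.
    using DeviceOp = DeviceGroupedConvBwdWeight<3,
                                                conv::NDHWGC,
                                                conv::GKZYXC,
                                                conv::NDHWGK,
                                                int8_t,
                                                int8_t,
                                                int8_t,
                                                PassThrough,
                                                PassThrough,
                                                PassThrough>;

    for(const auto& op : DeviceOperationInstanceFactory<DeviceOp>::GetInstances())
        std::cout << op->GetTypeString() << '\n'; // e.g. to pick an instance by name
    return 0;
}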
@@ -19,13 +19,13 @@ namespace instance {
#ifdef CK_ENABLE_FP16
// FP16
void add_device_normalization_rank_2_1_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, PassThrough, 2, 1>>>&);
void add_device_normalization_rank_4_3_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, PassThrough, 4, 3>>>&);
void add_device_normalization_rank_5_3_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, PassThrough, 5, 3>>>&);
#endif
#ifdef CK_ENABLE_FP32
// FP32
@@ -42,14 +42,15 @@ template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
typename YDataType,
typename SaveMeanInvStdDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormalization<
XDataType,
GammaDataType,
BetaDataType,
-F32,
YDataType,
SaveMeanInvStdDataType,
ck::tensor_operation::element_wise::PassThrough,
Rank,
NumReduceDim>>
@@ -57,8 +58,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormal
using DeviceOp = DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
-F32,
YDataType,
SaveMeanInvStdDataType,
ck::tensor_operation::element_wise::PassThrough,
Rank,
NumReduceDim>;
@@ -68,7 +69,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormal
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<XDataType, F16> && is_same_v<GammaDataType, F16> &&
-is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>)
is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16> &&
is_same_v<SaveMeanInvStdDataType, F32>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
{
@@ -86,7 +88,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormal
#endif
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
-is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>)
is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32> &&
is_same_v<SaveMeanInvStdDataType, F32>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
{
...
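To make the reordering above concrete: DeviceNormalization now takes YDataType fourth and a SaveMeanInvStdDataType fifth, where the factory previously hard-wired an F32 slot before YDataType. A sketch of an instantiation matching the rank-2/reduce-1 layernorm branch (F16/F32 are the aliases used in this header):

// Sketch: template-parameter order after this change.
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<
    F16, // XDataType
    F16, // GammaDataType
    F16, // BetaDataType
    F16, // YDataType
    F32, // SaveMeanInvStdDataType: type used to save mean / inverse std
    ck::tensor_operation::element_wise::PassThrough,
    2,   // Rank
    1>;  // NumReduceDim
// Per the branches above, only SaveMeanInvStdDataType == F32 is backed by instances.
auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();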
@@ -19,7 +19,7 @@ namespace instance {
// FP16
void add_device_normalization_rank_5_3_swish_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&);
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, Swish, 5, 3>>>&);
// FP32
void add_device_normalization_rank_5_3_swish_f32_instances(
@@ -27,20 +27,21 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
// [x, gamma, beta, y] = [f16, f32, f32, f16]
void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&);
std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F16, F32, Swish, 5, 3>>>&);
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
typename YDataType,
typename SaveMeanInvStdDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
-F32,
YDataType,
SaveMeanInvStdDataType,
ck::tensor_operation::element_wise::Swish,
Rank,
NumReduceDim>>
@@ -48,8 +49,8 @@ struct DeviceOperationInstanceFactory<
using DeviceOp = DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
-F32,
YDataType,
SaveMeanInvStdDataType,
ck::tensor_operation::element_wise::Swish,
Rank,
NumReduceDim>;
@@ -59,7 +60,8 @@ struct DeviceOperationInstanceFactory<
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<XDataType, F16> && is_same_v<GammaDataType, F16> &&
-is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>)
is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16> &&
is_same_v<SaveMeanInvStdDataType, F32>)
{
if constexpr(Rank == 5 && NumReduceDim == 3)
{
@@ -67,7 +69,8 @@ struct DeviceOperationInstanceFactory<
}
}
else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
-is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>)
is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32> &&
is_same_v<SaveMeanInvStdDataType, F32>)
{
if constexpr(Rank == 5 && NumReduceDim == 3)
{
@@ -75,7 +78,8 @@ struct DeviceOperationInstanceFactory<
}
}
else if constexpr(is_same_v<XDataType, F16> && is_same_v<GammaDataType, F32> &&
-is_same_v<BetaDataType, F32> && is_same_v<YDataType, F16>)
is_same_v<BetaDataType, F32> && is_same_v<YDataType, F16> &&
is_same_v<SaveMeanInvStdDataType, F32>)
{
if constexpr(Rank == 5 && NumReduceDim == 3)
{
...
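The Swish factory mirrors the PassThrough one; per its branches, only Rank == 5 with NumReduceDim == 3 (the groupnorm shape) is populated. A sketch of the mixed-precision variant declared above:

// Sketch: x/y in f16, gamma/beta in f32, mean and inverse std saved as f32.
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<
    F16, F32, F32, F16, F32,
    ck::tensor_operation::element_wise::Swish,
    5,
    3>;
auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();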
@@ -230,7 +230,6 @@ check_err(const Range& out,
return res;
}
-#if defined CK_ENABLE_FP8
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, f8_t>),
@@ -275,9 +274,7 @@ check_err(const Range& out,
}
return res;
}
-#endif
-#if defined CK_ENABLE_BF8
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
@@ -322,7 +319,6 @@ check_err(const Range& out,
}
return res;
}
-#endif
} // namespace utils
} // namespace ck
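With the guards removed, callers no longer need their own CK_ENABLE_FP8/CK_ENABLE_BF8 conditionals around these comparisons. A sketch of a call site, assuming the usual header path and that f8_t is defined in the build; the tolerances are illustrative, not the defaults:

#include <vector>
#include "ck/library/utility/check_err.hpp" // path is an assumption

// out: values copied back from the device; ref: the host reference.
std::vector<ck::f8_t> out(16, ck::type_convert<ck::f8_t>(0.5f));
std::vector<ck::f8_t> ref(16, ck::type_convert<ck::f8_t>(0.5f));
bool ok = ck::utils::check_err(out, ref, "Error: incorrect results!", 1e-1, 1e-1);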
@@ -22,7 +22,7 @@ static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNu
std::ofstream outFile(fileName, std::ios::binary);
if(outFile)
{
-outFile.write(reinterpret_cast<char*>(data), dataNumItems * sizeof(T));
outFile.write(reinterpret_cast<const char*>(data), dataNumItems * sizeof(T));
outFile.close();
std::cout << "Write output to file " << fileName << std::endl;
}
...
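The cast change matters because std::ofstream::write takes a const char*, so the helper should not pretend to need mutable bytes. A call-site sketch; the file name and buffer are illustrative:

// Dump a float result buffer to disk for offline comparison.
std::vector<float> conv_output(64, 1.0f);
dumpBufferToFile("conv_output.bin", conv_output.data(), conv_output.size());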
@@ -200,10 +200,11 @@ struct GeneratorTensor_3<ck::bf8_t>
template <typename T>
struct GeneratorTensor_4
{
-std::default_random_engine generator;
std::mt19937 generator;
std::normal_distribution<float> distribution;
-GeneratorTensor_4(float mean, float stddev) : generator(1), distribution(mean, stddev){};
GeneratorTensor_4(float mean, float stddev, unsigned int seed = 1)
: generator(seed), distribution(mean, stddev){};
template <typename... Is>
T operator()(Is...)
...
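With std::mt19937 and an explicit seed, each generator owns a reproducible stream, and two tensors can be filled from independent streams. A sketch; GeneratorTensor_4 accepts but ignores the indices it is called with:

// Two reproducible, independent N(0, 1) generators.
GeneratorTensor_4<float> gen_a(0.f, 1.f, /*seed=*/1);
GeneratorTensor_4<float> gen_b(0.f, 1.f, /*seed=*/2);
float x = gen_a(0, 0); // index arguments are accepted but ignored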