Unverified commit ac76519a authored by Adam Osewski, committed by GitHub

Merge branch 'develop' into aosewski/gemm_tile_loop

parents a70c6283 578142db
@@ -35,13 +35,17 @@ __global__ void
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

-template <typename GridwiseGemm, typename FloatAB, typename FloatC, bool HasMainKBlockLoop>
+template <typename GridwiseGemm,
+          typename FloatA,
+          typename FloatB,
+          typename FloatC,
+          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_gemm_xdl_cshuffle_v1(const FloatAB* __restrict__ p_a_grid,
-                                const FloatAB* __restrict__ p_b_grid,
+    kernel_gemm_xdl_cshuffle_v1(const FloatA* __restrict__ p_a_grid,
+                                const FloatB* __restrict__ p_b_grid,
                                 FloatC* __restrict__ p_c_grid,
                                 typename GridwiseGemm::Problem problem)
 {
@@ -61,7 +65,8 @@ __global__ void
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
           typename FloatGemmAcc,
           typename FloatCShuffle,
           typename FloatC,
@@ -102,7 +107,8 @@ template <typename ALayout,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
           LoopScheduler LoopSched,
-          PipelineVersion PipelineVer = PipelineVersion::v1>
+          PipelineVersion PipelineVer = PipelineVersion::v1,
+          typename ComputeType = FloatC>
 struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 {
     static constexpr auto I0 = Number<0>{};
@@ -463,8 +469,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     // Argument
     struct Argument : public tensor_operation::device::BaseArgument, public Problem
     {
-        __host__ Argument(const FloatAB* p_a_grid_,
-                          const FloatAB* p_b_grid_,
+        __host__ Argument(const FloatA* p_a_grid_,
+                          const FloatB* p_b_grid_,
                           FloatC* p_c_grid_,
                           index_t M_,
                           index_t N_,
@@ -479,8 +485,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         {
         }
-        const FloatAB* p_a_grid;
-        const FloatAB* p_b_grid;
+        const FloatA* p_a_grid;
+        const FloatB* p_b_grid;
         FloatC* p_c_grid;
     };
@@ -541,8 +547,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         constexpr auto c_block_size =
             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
-        return math::max((a_block_space_size_aligned + b_block_space_size_aligned) *
-                             sizeof(FloatAB),
+        return math::max((a_block_space_size_aligned * sizeof(ComputeType) +
+                          b_block_space_size_aligned * sizeof(ComputeType)),
                          c_block_size * sizeof(FloatCShuffle));
     }
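Note on the sizing hunk above: once ComputeType can differ from the A/B grid types, both LDS staging buffers are sized in ComputeType elements. A toy restatement of the same byte computation, with made-up element counts (not CK code):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// The GEMM main loop (A + B staging tiles) and the C-shuffle epilogue reuse one
// LDS arena, so the arena size is the max of the two phases.
template <typename ComputeType, typename FloatCShuffle>
constexpr std::size_t lds_bytes(std::size_t a_elems, std::size_t b_elems, std::size_t c_elems)
{
    return std::max(a_elems * sizeof(ComputeType) + b_elems * sizeof(ComputeType),
                    c_elems * sizeof(FloatCShuffle));
}

// e.g. 128x32 A tile + 256x32 B tile staged as 2-byte elements vs a 128x256 fp32 C tile:
static_assert(lds_bytes<std::uint16_t, float>(128 * 32, 256 * 32, 128 * 256) == 131072);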
@@ -676,8 +682,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     using Block2CTileMap = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
     template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
-                               const FloatAB* __restrict__ p_b_grid,
+    __device__ static void Run(const FloatA* __restrict__ p_a_grid,
+                               const FloatB* __restrict__ p_b_grid,
                                FloatC* __restrict__ p_c_grid,
                                void* __restrict__ p_shared,
                                const Problem& problem)
@@ -743,8 +749,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             Sequence<AK0Number, MPerBlock, AK1Number>,
             ABlockTransferThreadClusterLengths_AK0_M_AK1,
             ABlockTransferThreadClusterArrangeOrder,
-            FloatAB,
-            FloatAB,
+            FloatA,
+            ComputeType,
             decltype(a_grid_desc_ak0_m_ak1),
             decltype(a_block_desc_ak0_m_ak1),
             ABlockTransferSrcAccessOrder,
@@ -774,8 +780,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             Sequence<BK0Number, NPerBlock, BK1Number>,
             BBlockTransferThreadClusterLengths_BK0_N_BK1,
             BBlockTransferThreadClusterArrangeOrder,
-            FloatAB,
-            FloatAB,
+            FloatB,
+            ComputeType,
             decltype(b_grid_desc_bk0_n_bk1),
             decltype(b_block_desc_bk0_n_bk1),
             BBlockTransferSrcAccessOrder,
@@ -805,11 +811,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         // sanity check
         constexpr index_t KPack =
             math::max(math::lcm(AK1Number, BK1Number),
-                      MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+                      MfmaSelector<ComputeType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
-            FloatAB,
+            ComputeType,
             FloatGemmAcc,
             decltype(a_block_desc_ak0_m_ak1),
             decltype(b_block_desc_bk0_n_bk1),
@@ -827,10 +833,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatAB*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ComputeType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
+            static_cast<ComputeType*>(p_shared) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
......
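Taken together, these hunks replace the single FloatAB element type with independent FloatA/FloatB grid types plus a ComputeType (defaulted to FloatC) that now drives the LDS buffers, the block-transfer destination type, and MFMA selection. A toy sketch of that type-parameter pattern only (not the real CK parameter list):

#include <cstdint>

// Toy sketch: one FloatAB parameter split into FloatA/FloatB, with a ComputeType
// that defaults to the output type and fixes the element type used for the math.
template <typename FloatA, typename FloatB, typename FloatC, typename ComputeType = FloatC>
struct GemmTypeConfig
{
    using ASrc   = FloatA;      // element type of A in global memory
    using BSrc   = FloatB;      // element type of B in global memory
    using Staged = ComputeType; // type both operands are converted to for LDS/MFMA
    using COut   = FloatC;      // element type of C in global memory
};

// Uniform-type GEMM keeps the old FloatAB behavior...
using UniformF32 = GemmTypeConfig<float, float, float>;
// ...while mixed A/B inputs now express directly (illustrative combination):
using MixedI8A = GemmTypeConfig<std::int8_t, float, float, float>;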
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
// dinput descriptor in [N, C, Di, Hi, Wi] order
// doutput descriptor in [N, C, Do, Ho, Wo] order
// physical layout is irrelevant
template <ck::index_t NDimSpatial,
typename DInDataType,
typename DOutDataType,
typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
struct ReferenceAvgPoolBwd : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(Tensor<DInDataType>& dinput,
const Tensor<DOutDataType>& doutput,
std::vector<ck::index_t> window_spatial_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> dinput_left_pads,
std::vector<ck::index_t> dinput_right_pads)
: dinput_{dinput},
doutput_{doutput},
window_spatial_lengths_{window_spatial_lengths},
window_strides_{window_strides},
window_dilations_{window_dilations},
in_left_pads_{dinput_left_pads},
in_right_pads_{dinput_right_pads}
{
}
Tensor<DInDataType>& dinput_;
const Tensor<DOutDataType>& doutput_;
std::vector<ck::index_t> window_spatial_lengths_;
std::vector<index_t> window_strides_;
std::vector<index_t> window_dilations_;
std::vector<index_t> in_left_pads_;
std::vector<index_t> in_right_pads_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceAvgPoolBwd::Argument;
template <ck::index_t NDimSpatial_,
typename std::enable_if<NDimSpatial_ == 1, bool>::type = false>
float RunAvgPoolBwd(const Argument& arg)
{
// Let input = x, output = y
// shape of x = [10], y = [6]
// window_size = 5, pad = 0, stride = 1, dilation = 1
// Forward:
// y0 = 1/5 * (x0 + x1 + x2 + x3 + x4)
// y1 = 1/5 * (x1 + x2 + x3 + x4 + x5)
// ...
// y5 = 1/5 * (x5 + x6 + x7 + x8 + x9)
// Backward:
// shape of dy = [6], dx = [10]
// dx0 = 1/5 * dy0
// dx1 = 1/5 * (dy0 + dy1)
// dx2 = 1/5 * (dy0 + dy1 + dy2)
// ...
// dx4 = 1/5 * (dy0 + dy1 + dy2 + dy3 + dy4)
// dx5 = 1/5 * (dy1 + dy2 + dy3 + dy4 + dy5)
// dx6 = 1/5 * (dy2 + dy3 + dy4 + dy5)
// ...
// dx9 = 1/5 * dy5
auto f_ncw = [&](auto n, auto c, auto wi) {
std::size_t X = arg.window_spatial_lengths_[0];
std::size_t Wo = arg.doutput_.GetLengths()[2];
float v_acc = 0;
for(std::size_t x = 0; x < X; ++x)
{
// Out_Position = (In_Position + pad - x * dilation) / stride
auto w_tmp = static_cast<ck::long_index_t>(wi) +
static_cast<ck::long_index_t>(arg.in_left_pads_[0]) -
static_cast<ck::long_index_t>(x * arg.window_dilations_[0]);
// Check whether this input pixel is reachable through the stride
// (i.e. whether it is affected by some doutput pixel)
if(w_tmp % arg.window_strides_[0] == 0)
{
auto wo = static_cast<ck::long_index_t>(w_tmp) /
static_cast<ck::long_index_t>(arg.window_strides_[0]);
// Get the doutput pixel in valid range to accumulate the gradients for this
// input pixel
if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
{
v_acc += ck::type_convert<float>(arg.doutput_(n, c, wo));
}
}
}
v_acc /= ck::type_convert<float>(X);
arg.dinput_(n, c, wi) = ck::type_convert<DInDataType>(v_acc);
};
make_ParallelTensorFunctor(f_ncw,
arg.dinput_.GetLengths()[0],
arg.dinput_.GetLengths()[1],
arg.dinput_.GetLengths()[2])(
std::thread::hardware_concurrency());
return 0;
}
template <ck::index_t NDimSpatial_,
typename std::enable_if<NDimSpatial_ == 2, bool>::type = false>
float RunAvgPoolBwd(const Argument& arg)
{
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
std::size_t Y = arg.window_spatial_lengths_[0];
std::size_t X = arg.window_spatial_lengths_[1];
std::size_t Ho = arg.doutput_.GetLengths()[2];
std::size_t Wo = arg.doutput_.GetLengths()[3];
float v_acc = 0;
for(std::size_t y = 0; y < Y; ++y)
{
// Out_Position = (In_Position + pad - y * dilation) / stride
auto h_tmp = static_cast<ck::long_index_t>(hi) +
static_cast<ck::long_index_t>(arg.in_left_pads_[0]) -
static_cast<ck::long_index_t>(y * arg.window_dilations_[0]);
// Check whether this input pixel is reachable through the stride
// (i.e. whether it is affected by some doutput pixel)
if(h_tmp % arg.window_strides_[0] == 0)
{
auto ho = static_cast<ck::long_index_t>(h_tmp) /
static_cast<ck::long_index_t>(arg.window_strides_[0]);
// Get the doutput pixel in valid range to accumulate the gradients for this
// input pixel
if(ho >= 0 && ck::type_convert<std::size_t>(ho) < Ho)
{
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp =
static_cast<ck::long_index_t>(wi) +
static_cast<ck::long_index_t>(arg.in_left_pads_[1]) -
static_cast<ck::long_index_t>(x * arg.window_dilations_[1]);
if(w_tmp % arg.window_strides_[1] == 0)
{
auto wo = static_cast<ck::long_index_t>(w_tmp) /
static_cast<ck::long_index_t>(arg.window_strides_[1]);
if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
{
v_acc +=
ck::type_convert<float>(arg.doutput_(n, c, ho, wo));
}
}
}
}
}
}
v_acc /= ck::type_convert<float>(Y * X);
arg.dinput_(n, c, hi, wi) = ck::type_convert<DInDataType>(v_acc);
};
make_ParallelTensorFunctor(f_nchw,
arg.dinput_.GetLengths()[0],
arg.dinput_.GetLengths()[1],
arg.dinput_.GetLengths()[2],
arg.dinput_.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
template <ck::index_t NDimSpatial_,
typename std::enable_if<NDimSpatial_ == 3, bool>::type = false>
float RunAvgPoolBwd(const Argument& arg)
{
auto f_ncdhw = [&](auto n, auto c, auto di, auto hi, auto wi) {
std::size_t Z = arg.window_spatial_lengths_[0];
std::size_t Y = arg.window_spatial_lengths_[1];
std::size_t X = arg.window_spatial_lengths_[2];
std::size_t Do = arg.doutput_.GetLengths()[2];
std::size_t Ho = arg.doutput_.GetLengths()[3];
std::size_t Wo = arg.doutput_.GetLengths()[4];
float v_acc = 0;
for(std::size_t z = 0; z < Z; ++z)
{
// Out_Position = (In_Position + pad - z * dilation) / stride
auto d_tmp = static_cast<ck::long_index_t>(di) +
static_cast<ck::long_index_t>(arg.in_left_pads_[0]) -
static_cast<ck::long_index_t>(z * arg.window_dilations_[0]);
// Check whether this input pixel is reachable through the stride
// (i.e. whether it is affected by some doutput pixel)
if(d_tmp % arg.window_strides_[0] == 0)
{
auto do_ = static_cast<ck::long_index_t>(d_tmp) /
static_cast<ck::long_index_t>(arg.window_strides_[0]);
// Get the doutput pixel in valid range to accumulate the gradients for this
// input pixel
if(do_ >= 0 && ck::type_convert<std::size_t>(do_) < Do)
{
for(std::size_t y = 0; y < Y; ++y)
{
auto h_tmp =
static_cast<ck::long_index_t>(hi) +
static_cast<ck::long_index_t>(arg.in_left_pads_[1]) -
static_cast<ck::long_index_t>(y * arg.window_dilations_[1]);
if(h_tmp % arg.window_strides_[1] == 0)
{
auto ho = static_cast<ck::long_index_t>(h_tmp) /
static_cast<ck::long_index_t>(arg.window_strides_[1]);
if(ho >= 0 && ck::type_convert<std::size_t>(ho) < Ho)
{
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp = static_cast<ck::long_index_t>(wi) +
static_cast<ck::long_index_t>(
arg.in_left_pads_[2]) -
static_cast<ck::long_index_t>(
x * arg.window_dilations_[2]);
if(w_tmp % arg.window_strides_[2] == 0)
{
auto wo = static_cast<ck::long_index_t>(w_tmp) /
static_cast<ck::long_index_t>(
arg.window_strides_[2]);
if(wo >= 0 &&
ck::type_convert<std::size_t>(wo) < Wo)
{
v_acc += ck::type_convert<float>(
arg.doutput_(n, c, do_, ho, wo));
}
}
}
}
}
}
}
}
}
v_acc /= ck::type_convert<float>(Z * Y * X);
arg.dinput_(n, c, di, hi, wi) = ck::type_convert<DInDataType>(v_acc);
};
make_ParallelTensorFunctor(f_ncdhw,
arg.dinput_.GetLengths()[0],
arg.dinput_.GetLengths()[1],
arg.dinput_.GetLengths()[2],
arg.dinput_.GetLengths()[3],
arg.dinput_.GetLengths()[4])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const Argument& arg)
{
if(!(arg.dinput_.GetNumOfDimension() == NDimSpatial + 2 &&
arg.doutput_.GetNumOfDimension() == NDimSpatial + 2))
{
throw std::runtime_error("wrong! inconsistent dimension");
}
return RunAvgPoolBwd<NDimSpatial>(arg);
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(Tensor<DInDataType>& dinput,
const Tensor<DOutDataType>& doutput,
std::vector<ck::index_t> window_spatial_lengths,
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations,
std::vector<ck::index_t> dinput_left_pads,
std::vector<ck::index_t> dinput_right_pads)
{
if(window_spatial_lengths.size() != NDimSpatial || window_strides.size() != NDimSpatial ||
window_dilations.size() != NDimSpatial || dinput_left_pads.size() != NDimSpatial ||
dinput_right_pads.size() != NDimSpatial)
throw std::runtime_error("dimension is incorrect");
return Argument{dinput,
doutput,
window_spatial_lengths,
window_strides,
window_dilations,
dinput_left_pads,
dinput_right_pads};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceAvgPoolBwd"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
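A hedged sketch of how this new host operator might be driven for the 1-D case from the comment block above. The lengths-list Tensor construction is an assumption about CK's host_tensor utility; shapes and pooling parameters mirror the x=[10], y=[6] example:

#include "ck/library/utility/host_tensor.hpp"

void run_reference_avgpool_bwd_1d()
{
    using RefAvgPoolBwd = ck::tensor_operation::host::ReferenceAvgPoolBwd<1, float, float>;

    Tensor<float> dinput({1, 4, 10}); // [N, C, Wi]: gradient w.r.t. the pooling input
    Tensor<float> doutput({1, 4, 6}); // [N, C, Wo]: incoming gradient

    auto arg = RefAvgPoolBwd::MakeArgument(dinput,
                                           doutput,
                                           {5},  // window_spatial_lengths
                                           {1},  // window_strides
                                           {1},  // window_dilations
                                           {0},  // dinput_left_pads
                                           {0}); // dinput_right_pads

    auto invoker = RefAvgPoolBwd::MakeInvoker();
    invoker.Run(arg); // fills dinput with dx exactly as in the 1-D example above
}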
@@ -125,7 +125,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
                 arg.in_element_op_(v_in, v_acc);
-                arg.input_(g, n, c, wi) = ck::type_convert<InDataType>(v_acc);
+                arg.input_(g, n, c, wi) = ck::type_convert<InDataType>(v_in);
             };
             make_ParallelTensorFunctor(f_ncw,
@@ -201,7 +201,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
                 arg.in_element_op_(v_in, v_acc);
-                arg.input_(g, n, c, hi, wi) = ck::type_convert<InDataType>(v_acc);
+                arg.input_(g, n, c, hi, wi) = ck::type_convert<InDataType>(v_in);
             };
             make_ParallelTensorFunctor(f_nchw,
@@ -299,7 +299,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
                 arg.in_element_op_(v_in, v_acc);
-                arg.input_(g, n, c, di, hi, wi) = ck::type_convert<InDataType>(v_acc);
+                arg.input_(g, n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in);
             };
             make_ParallelTensorFunctor(f_ncdhw,
......
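The three hunks above fix one bug repeated per spatial rank: the elementwise op writes its result into its first argument, so after arg.in_element_op_(v_in, v_acc) the transformed value lives in v_in, and storing v_acc silently discarded the op. A toy illustration of that out-parameter-first convention (ScaleBy2 is illustrative, not a CK functor):

#include <iostream>

// Same call shape as the element_wise functors used above: operator()(y, x)
// computes y from x; the caller must store y, not the raw accumulator x.
struct ScaleBy2
{
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const { y = static_cast<Y>(2 * x); }
};

int main()
{
    float v_acc = 3.0f; // accumulated conv-bwd value for one input pixel
    float v_in  = 0.0f;
    ScaleBy2{}(v_in, v_acc);   // writes 6.0f into v_in
    std::cout << v_in << '\n'; // store v_in, not v_acc
}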
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __bf16__
 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
@@ -36,7 +36,8 @@ void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
         instances);
+#endif
+#ifdef __fp16__
 void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -56,7 +57,8 @@ void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
+#endif
+#ifdef __fp32__
 void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
@@ -76,7 +78,8 @@ void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
         instances);
+#endif
+#ifdef __int8__
 void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemm<Col,
                                                   Row,
@@ -120,7 +123,7 @@ void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -151,7 +154,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp32__
         if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                      is_same_v<CDataType, float>)
         {
@@ -176,7 +179,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
-        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+#endif
+#ifdef __fp16__
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<CDataType, half_t>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
@@ -200,7 +205,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
-        else if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+#endif
+#ifdef __bf16__
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
                      is_same_v<CDataType, bhalf_t>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
@@ -224,7 +231,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
-        else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+#endif
+#ifdef __int8__
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                      is_same_v<CDataType, int8_t>)
         {
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
@@ -248,7 +257,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                 add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
......
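With these per-type guards, a factory query only links the instance lists whose macro is defined at build time; the call site itself is unchanged, and unmatched types now yield an empty vector. A hedged sketch of such a query (Row, F16, PassThrough are the aliases used in the declarations above):

#include <iostream>

void list_fp16_batched_gemm_instances()
{
    using namespace ck::tensor_operation::device;

    // Empty when the build does not define __fp16__; populated otherwise.
    auto op_ptrs = instance::DeviceOperationInstanceFactory<
        DeviceBatchedGemm<Row, Row, Row, F16, F16, F16,
                          PassThrough, PassThrough, PassThrough>>::GetInstances();

    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << '\n'; // assumes BaseOperator::GetTypeString()
}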
@@ -14,7 +14,7 @@
 using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu;
 using CDE1ElementOp = ck::tensor_operation::element_wise::Add;
+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -137,3 +137,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
@@ -13,7 +13,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -91,3 +91,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __fp16__
 void add_device_batched_gemm_bias_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -58,7 +58,8 @@ void add_device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_
                                             PassThrough,
                                             MaskingSpecialization::MaskDisabled>>>&
         instances);
+#endif
+#ifdef __bf16__
 void add_device_batched_gemm_bias_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -100,7 +101,7 @@ void add_device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf
                                             PassThrough,
                                             MaskingSpecialization::MaskDisabled>>>&
         instances);
+#endif
 template <typename ADataType,
           typename B0DataType,
           typename B1DataType,
@@ -147,7 +148,7 @@ struct DeviceOperationInstanceFactory<
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp16__
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
                      is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t> &&
                      Acc0BiasDataType::Size() == 1 &&
@@ -164,6 +165,8 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
+#ifdef __bf16__
         else if constexpr(is_same_v<ADataType, BF16> && is_same_v<B0DataType, BF16> &&
                           is_same_v<B1DataType, BF16> && is_same_v<CDataType, BF16> &&
                           Acc0BiasDataType::Size() == 1 &&
@@ -180,6 +183,7 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
......
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __fp16__
 void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
     std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
                                                       Col,
@@ -111,3 +111,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
@@ -19,7 +19,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __fp16__
 void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
                                                         Row,
@@ -123,7 +123,8 @@ void add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instan
                                                         PassThrough,
                                                         PassThrough,
                                                         PassThrough>>>& instances);
+#endif
+#ifdef __int8__
 void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
                                                         Row,
@@ -227,7 +228,7 @@ void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
                                                         PassThrough,
                                                         PassThrough,
                                                         PassThrough>>>& instances);
+#endif
 template <typename ALayout,
           typename BLayout,
           typename ELayout,
@@ -262,7 +263,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp16__
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<EDataType, half_t>)
         {
@@ -295,6 +296,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                     op_ptrs);
             }
         }
+#endif
+#ifdef __int8__
         else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                           is_same_v<EDataType, int8_t>)
         {
@@ -327,7 +330,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
                     op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
......
@@ -11,7 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -119,3 +119,4 @@ struct DeviceOperationInstanceFactory<
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __fp16__
 void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -58,7 +58,8 @@ void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_g
                                             PassThrough,
                                             MaskingSpecialization::MaskDisabled>>>&
         instances);
+#endif
+#ifdef __bf16__
 void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -100,6 +101,7 @@ void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf
                                             PassThrough,
                                             MaskingSpecialization::MaskDisabled>>>&
         instances);
+#endif
 template <typename ADataType,
           typename B0DataType,
@@ -146,7 +148,7 @@ struct DeviceOperationInstanceFactory<
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp16__
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
                      is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t>)
         {
@@ -161,6 +163,8 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
+#ifdef __bf16__
         else if constexpr(is_same_v<ADataType, BF16> && is_same_v<B0DataType, BF16> &&
                           is_same_v<B1DataType, BF16> && is_same_v<CDataType, BF16>)
         {
@@ -175,6 +179,7 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
......
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __fp32__
 // float
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
     std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -65,7 +65,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
                                                            PassThrough,
                                                            PassThrough,
                                                            Bilinear>>>& instances);
+#endif
+#ifdef __fp64__
 // double
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
     std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,7 +115,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn
                                                            PassThrough,
                                                            PassThrough,
                                                            Bilinear>>>& instances);
+#endif
 // Contraction + Bilinear
 template <index_t NumDimM,
           index_t NumDimN,
@@ -149,7 +150,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp32__
         if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                      is_same_v<DDataType, float> && is_same_v<EDataType, float>)
         {
@@ -165,7 +166,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                     op_ptrs);
             }
         }
+#endif
+#ifdef __fp64__
         if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                      is_same_v<DDataType, double> && is_same_v<EDataType, double>)
         {
@@ -181,7 +183,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                     op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
......
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __fp32__
 // float
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
     std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -65,7 +65,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
                                                            PassThrough,
                                                            PassThrough,
                                                            Scale>>>& instances);
+#endif
+#ifdef __fp64__
 // double
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
     std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,7 +115,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instanc
                                                            PassThrough,
                                                            PassThrough,
                                                            Scale>>>& instances);
+#endif
 // Contraction + Scale
 template <index_t NumDimM,
           index_t NumDimN,
@@ -148,7 +149,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
     static auto GetInstances()
    {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#ifdef __fp32__
         if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                      is_same_v<EDataType, float>)
         {
@@ -164,7 +165,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                     op_ptrs);
             }
         }
+#endif
+#ifdef __fp64__
         if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                      is_same_v<EDataType, double>)
         {
@@ -180,7 +182,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                     op_ptrs);
             }
         }
+#endif
         return op_ptrs;
     }
 };
......
@@ -16,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+#ifdef __bf16__
 // conv1d backward data
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<1,
@@ -29,16 +29,19 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
+#ifdef __fp16__
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(
     std::vector<std::unique_ptr<
         DeviceConvBwdData<1, NWC, KXC, NWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
+#endif
+#ifdef __fp32__
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(
     std::vector<std::unique_ptr<
         DeviceConvBwdData<1, NWC, KXC, NWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
         instances);
+#endif
 #ifdef __int8__
 void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<1,
@@ -52,6 +55,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances);
 #endif
+#ifdef __bf16__
 // conv2d backward data
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<2,
@@ -64,7 +68,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
+#ifdef __fp16__
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                   NHWC,
@@ -76,7 +81,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
+#ifdef __fp32__
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                   NHWC,
@@ -88,6 +94,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
 #ifdef __int8__
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<2,
@@ -101,6 +108,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances);
 #endif
+#ifdef DL_KERNELS
+#ifdef __fp16__
 // conv2d dl
 void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<2,
@@ -113,7 +122,8 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
+#ifdef __fp32__
 void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<2,
                                                   NHWC,
@@ -125,6 +135,7 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
 #ifdef __int8__
 void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<2,
@@ -138,6 +149,8 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances);
 #endif
+#endif
+#ifdef __bf16__
 // conv3d backward data
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<3,
@@ -150,7 +163,8 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
+#ifdef __fp16__
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<3,
                                                   NDHWC,
@@ -162,7 +176,8 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
+#ifdef __fp32__
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<3,
                                                   NDHWC,
@@ -174,6 +189,7 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+#endif
 #ifdef __int8__
 void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
     std::vector<std::unique_ptr<DeviceConvBwdData<3,
@@ -229,19 +245,22 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
             {
                 add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(op_ptrs);
             }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+#ifdef __fp16__
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                          is_same_v<OutDataType, half_t>)
             {
                 add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(op_ptrs);
             }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
-                              is_same_v<OutDataType, ck::bhalf_t>)
+#endif
+#ifdef __bf16__
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
             {
                 add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(op_ptrs);
             }
+#endif
 #ifdef __int8__
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<OutDataType, int8_t>)
             {
                 add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(op_ptrs);
@@ -255,26 +274,35 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
                          is_same_v<OutDataType, float>)
             {
                 add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
+#ifdef DL_KERNELS
                 add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
+#endif
             }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+#ifdef __fp16__
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                          is_same_v<OutDataType, half_t>)
             {
                 add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
+#ifdef DL_KERNELS
                 add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
+#endif
             }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
-                              is_same_v<OutDataType, ck::bhalf_t>)
+#endif
+#ifdef __bf16__
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
             {
                 add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
             }
+#endif
 #ifdef __int8__
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<OutDataType, int8_t>)
             {
                 add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
+#ifdef DL_KERNELS
                 add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
+#endif
             }
 #endif
         }
@@ -286,19 +314,22 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
             {
                 add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(op_ptrs);
             }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+#ifdef __fp16__
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                          is_same_v<OutDataType, half_t>)
             {
                 add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(op_ptrs);
             }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
-                              is_same_v<OutDataType, ck::bhalf_t>)
+#endif
+#ifdef __bf16__
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
             {
                 add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(op_ptrs);
             }
+#endif
 #ifdef __int8__
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<OutDataType, int8_t>)
             {
                 add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(op_ptrs);
......
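The backward-data hunks above replace an `else if constexpr` chain with independent `if constexpr` blocks so that each data-type branch can carry its own preprocessor guard. The type predicates are mutually exclusive, so at most one branch is instantiated for a given (InDataType, WeiDataType, OutDataType) triple and dropping `else` does not change behavior; it only makes every branch removable by the preprocessor regardless of its position in the chain, whereas an `else if` chain only tolerates removing non-leading branches. A minimal self-contained sketch of the pattern, with illustrative names rather than the library's (ENABLE_FP16 stands in for the build-defined `__fp16__`):

#include <string>
#include <type_traits>
#include <vector>

struct half_tag; // stand-in for a half-precision type; kept abstract on purpose

template <typename DataType>
std::vector<std::string> get_instance_names()
{
    std::vector<std::string> op_ptrs;
    // Unguarded branch: always compiled, selected only when DataType is float.
    if constexpr(std::is_same_v<DataType, float>)
        op_ptrs.push_back("xdl f32 instance");
#ifdef ENABLE_FP16 // hypothetical stand-in for the __fp16__ guard
    // Independent `if constexpr` (not `else if`): compiling this block out
    // cannot break the control flow of the remaining branches.
    if constexpr(std::is_same_v<DataType, half_tag>)
        op_ptrs.push_back("xdl f16 instance");
#endif
    return op_ptrs;
}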
@@ -18,11 +18,17 @@ namespace device {
 namespace instance {
 // conv2d forward
+#ifdef __fp16__
 void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
     std::vector<std::unique_ptr<
         DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<std::unique_ptr<
+        DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#ifdef __bf16__
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
     std::vector<std::unique_ptr<DeviceConvFwd<2,
                                               NHWC,
@@ -34,17 +40,14 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
                                               PassThrough,
                                               PassThrough,
                                               PassThrough>>>& instances);
+#endif
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<std::unique_ptr<
-        DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
+#ifdef __fp32__
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
     std::vector<std::unique_ptr<
         DeviceConvFwd<2, NHWC, KYXC, NHWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
         instances);
+#endif
+#ifdef __int8__
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
     std::vector<std::unique_ptr<DeviceConvFwd<2,
                                               NHWC,
@@ -56,6 +59,7 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
                                               PassThrough,
                                               PassThrough,
                                               PassThrough>>>& instances);
+#endif
 
 template <ck::index_t NumDimSpatial,
           typename InLayout,
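These declaration hunks mirror the guards used at the call sites in the factory below: a declaration such as add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances is visible only when `__fp16__` is defined, and the factory calls it only under the same condition. If the two sides ever disagree, the failure mode differs: a guarded declaration with an unguarded call fails to compile, while an unguarded declaration whose defining translation unit was compiled out surfaces only as an undefined reference at link time. A small sketch of the invariant, with hypothetical names (ENABLE_INT8 stands in for `__int8__`):

#include <vector>

#ifdef ENABLE_INT8 // hypothetical guard shared by declaration and call site
void add_int8_instances(std::vector<int>& op_ptrs); // defined in a .cpp built under the same guard

inline void collect(std::vector<int>& op_ptrs)
{
    add_int8_instances(op_ptrs); // guarded identically, so it can never dangle at link time
}
#endif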
@@ -99,23 +103,29 @@ struct DeviceOperationInstanceFactory<
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
         }
+#ifdef __fp16__
         else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                           is_same_v<OutDataType, half_t>)
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
             add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
         }
+#endif
+#ifdef __bf16__
         else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                           is_same_v<WeiDataType, ck::bhalf_t> &&
                           is_same_v<OutDataType, ck::bhalf_t>)
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
         }
+#endif
+#ifdef __int8__
         else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                           is_same_v<OutDataType, int8_t>)
         {
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
         }
+#endif
     }
     return op_ptrs;
...
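For callers, these guards are invisible: GetInstances() simply returns fewer op_ptrs when a data type is disabled at build time, and the usual probe-then-run loop is unchanged. Below is a self-contained sketch of that consumer-side pattern with mock types; CK's real interface builds an argument from a full problem description via MakeArgumentPointer and filters with IsSupportedArgument, which the mock reduces to a single size parameter:

#include <memory>
#include <vector>

struct BaseOp
{
    virtual ~BaseOp()                     = default;
    virtual bool IsSupported(int n) const = 0; // stand-in for IsSupportedArgument
    virtual void Run(int) const {}
};

struct XdlF32Op final : BaseOp
{
    bool IsSupported(int n) const override { return n % 4 == 0; }
};

std::vector<std::unique_ptr<BaseOp>> GetInstances()
{
    std::vector<std::unique_ptr<BaseOp>> op_ptrs;
    op_ptrs.push_back(std::make_unique<XdlF32Op>());
#ifdef ENABLE_FP16 // hypothetical guard: f16 instances exist only when built in
    // op_ptrs.push_back(std::make_unique<XdlF16Op>());
#endif
    return op_ptrs;
}

int main()
{
    for(const auto& op : GetInstances())
        if(op->IsSupported(64))
            op->Run(64); // in practice: time every supported instance, keep the fastest
}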
@@ -11,7 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -77,3 +77,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElemen
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
@@ -343,6 +343,7 @@ struct DeviceOperationInstanceFactory<
             add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(op_ptrs);
         }
     }
+#ifdef __fp16__
     else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                       is_same_v<CDataType, half_t>)
     {
@@ -388,6 +389,8 @@ struct DeviceOperationInstanceFactory<
             add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
         }
     }
+#endif
+#ifdef __bf16__
     else if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
                       is_same_v<CDataType, ck::bhalf_t>)
     {
@@ -412,6 +415,7 @@ struct DeviceOperationInstanceFactory<
             add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs);
         }
     }
+#endif
 #ifdef __int8__
     else if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
                       is_same_v<CDataType, int8_t>)
...
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -170,3 +170,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
@@ -11,7 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#ifdef __fp16__
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -144,3 +144,4 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
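The last three factory headers are fp16-only, so instead of guarding individual branches the entire header body, namespaces included, sits inside `#ifdef __fp16__`. Including such a header is then always safe: with the macro undefined it expands to nothing, and any code that names the factory must itself live under the same guard. A minimal sketch with hypothetical file and macro names:

// f16_only_factory.hpp (illustrative)
#ifdef ENABLE_FP16 // hypothetical stand-in for __fp16__
namespace app {
inline int num_f16_instances() { return 3; }
} // namespace app
#endif

// client.cpp (illustrative): the include is unconditional, the use is guarded
// #include "f16_only_factory.hpp"
#ifdef ENABLE_FP16
// const int n = app::num_f16_instances();
#endif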