Merge remote-tracking branch 'origin/develop' into contraction

7a3b49e5 · Chao Liu · e07b3d8e · d3051d75 · 7a3b49e5 · 7a3b49e5
Commit 7a3b49e5 authored Jun 25, 2022 by Chao Liu
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -28,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -27,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -39,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
-#ifndef CHECK_ERR_HPP
+// SPDX-License-Identifier: MIT
-#define CHECK_ERR_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include <algorithm>
+#pragma once
-#include <cmath>
-#include <cstdlib>
+#include <algorithm>
-#include <half.hpp>
+#include <cmath>
-#include <iostream>
+#include <cstdlib>
-#include <iomanip>
+#include <iostream>
-#include <iterator>
+#include <iomanip>
-#include <limits>
+#include <iterator>
-#include <type_traits>
+#include <limits>
-#include <vector>
+#include <type_traits>
+#include <vector>
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-namespace ck {
-namespace utils {
+namespace ck {
+namespace utils {
-template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
+template <typename T>
-                        bool>::type
+typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
-check_err(const std::vector<T>& out,
+                        bool>::type
-          const std::vector<T>& ref,
+check_err(const std::vector<T>& out,
-          const std::string& msg = "Error: Incorrect results!",
+          const std::vector<T>& ref,
-          double rtol            = 1e-5,
+          const std::string& msg = "Error: Incorrect results!",
-          double atol            = 3e-6)
+          double rtol            = 1e-5,
-{
+          double atol            = 3e-6)
-    if(out.size() != ref.size())
+{
-    {
+    if(out.size() != ref.size())
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+    {
-                  << std::endl
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << msg << std::endl;
+                  << std::endl
-        return false;
+                  << msg << std::endl;
-    }
+        return false;
+    }
-    bool res{true};
-    int err_count  = 0;
+    bool res{true};
-    double err     = 0;
+    int err_count  = 0;
-    double max_err = std::numeric_limits<double>::min();
+    double err     = 0;
-    for(std::size_t i = 0; i < ref.size(); ++i)
+    double max_err = std::numeric_limits<double>::min();
-    {
+    for(std::size_t i = 0; i < ref.size(); ++i)
-        err = std::abs(out[i] - ref[i]);
+    {
-        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
+        err = std::abs(out[i] - ref[i]);
-        {
+        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
-            max_err = err > max_err ? err : max_err;
+        {
-            err_count++;
+            max_err = err > max_err ? err : max_err;
-            if(err_count < 5)
+            err_count++;
-            {
+            if(err_count < 5)
-                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+            {
-                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << msg << std::endl;
+                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
-            }
+                          << msg << std::endl;
-            res = false;
+            }
-        }
+            res = false;
-    }
+        }
-    if(!res)
+    }
-    {
+    if(!res)
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    {
-    }
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
-    return res;
+    }
-}
+    return res;
+}
-template <typename T>
-typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
+template <typename T>
-check_err(const std::vector<T>& out,
+typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
-          const std::vector<T>& ref,
+check_err(const std::vector<T>& out,
-          const std::string& msg = "Error: Incorrect results!",
+          const std::vector<T>& ref,
-          double rtol            = 1e-3,
+          const std::string& msg = "Error: Incorrect results!",
-          double atol            = 1e-3)
+          double rtol            = 1e-3,
-{
+          double atol            = 1e-3)
-    if(out.size() != ref.size())
+{
-    {
+    if(out.size() != ref.size())
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+    {
-                  << std::endl
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << msg << std::endl;
+                  << std::endl
-        return false;
+                  << msg << std::endl;
-    }
+        return false;
+    }
-    bool res{true};
-    int err_count = 0;
+    bool res{true};
-    double err    = 0;
+    int err_count = 0;
-    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
+    double err    = 0;
-    double max_err = std::numeric_limits<float>::min();
+    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
-    for(std::size_t i = 0; i < ref.size(); ++i)
+    double max_err = std::numeric_limits<float>::min();
-    {
+    for(std::size_t i = 0; i < ref.size(); ++i)
-        double o = type_convert<float>(out[i]);
+    {
-        double r = type_convert<float>(ref[i]);
+        double o = type_convert<float>(out[i]);
-        err      = std::abs(o - r);
+        double r = type_convert<float>(ref[i]);
-        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
+        err      = std::abs(o - r);
-        {
+        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
-            max_err = err > max_err ? err : max_err;
+        {
-            err_count++;
+            max_err = err > max_err ? err : max_err;
-            if(err_count < 5)
+            err_count++;
-            {
+            if(err_count < 5)
-                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+            {
-                          << i << "]: " << o << " != " << r << std::endl
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << msg << std::endl;
+                          << i << "]: " << o << " != " << r << std::endl
-            }
+                          << msg << std::endl;
-            res = false;
+            }
-        }
+            res = false;
-    }
+        }
-    if(!res)
+    }
-    {
+    if(!res)
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    {
-    }
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
-    return res;
+    }
-}
+    return res;
+}
-template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
+template <typename T>
-                        bool>::type
+typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
 check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
 {
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }
    bool res{true};
    int err_count  = 0;
    double err     = 0;
    double max_err = std::numeric_limits<T>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double o = type_convert<float>(out[i]);
        double r = type_convert<float>(ref[i]);
        err      = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << o << " != " << r << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
 }
 template <typename T>
 typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double                 = 0,
          double                 = 0)
 {
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }
-    for(std::size_t i = 0; i < ref.size(); ++i)
+    bool res{true};
-    {
+    int err_count   = 0;
-        if(out[i] != ref[i])
+    int64_t err     = 0;
-        {
+    int64_t max_err = std::numeric_limits<int64_t>::min();
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
+    for(std::size_t i = 0; i < ref.size(); ++i)
-                      << " != " << static_cast<int>(ref[i]) << std::endl
+    {
-                      << msg << std::endl;
+        int64_t o = out[i];
-            return false;
+        int64_t r = ref[i];
-        }
+        err       = std::abs(o - r);
-    }
-    return true;
+        if(err > 0)
-}
+        {
+            max_err = err > max_err ? err : max_err;
-} // namespace utils
+            err_count++;
-} // namespace ck
+            if(err_count < 5)
+            {
-template <typename T>
+                std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+                          << " != " << static_cast<int>(ref[i]) << std::endl
-{
+                          << msg << std::endl;
-    std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
+            }
-    return os;
+            res = false;
-}
+        }
+    }
-#endif
+    if(!res)
+    {
+        std::cout << "max err: " << max_err << std::endl;
+    }
+    return res;
+}
+} // namespace utils
+} // namespace ck
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
+    return os;
+}
--- a/library/include/ck/library/utility/conv_util.hpp
+++ b/library/include/ck/library/utility/conv_util.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <cstdlib>
@@ -9,17 +12,17 @@
 #include <type_traits>
 #include <vector>
-#include "check_err.hpp"
+#include "ck/ck.hpp"
-#include "config.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "device.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
-#include "device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
-#include "fill.hpp"
+#include "ck/library/utility/fill.hpp"
-#include "host_tensor.hpp"
+#include "ck/library/utility/op_instance_engine.hpp"
-#include "op_instance_engine.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
-#include "reference_conv_fwd.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
-#include "tensor_layout.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -402,8 +405,8 @@ template <typename InDataType,
          typename InElementwiseOp  = ck::tensor_operation::element_wise::PassThrough,
          typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
          typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
-          typename InputInitFun     = FillUniform<InDataType>,
+          typename InputInitFun     = FillUniformDistribution<InDataType>,
-          typename WeightsInitFun   = FillUniform<WeiDataType>>
+          typename WeightsInitFun   = FillUniformDistribution<WeiDataType>>
 class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
 {
    using DeviceConvFwdOp = tensor_operation::device::
@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    ConvFwdOpInstance(const ConvParams& params,
                      bool do_init                         = true,
-                      const InputInitFun& input_init_f     = InputInitFun{},
+                      const InputInitFun& input_init_f     = InputInitFun(),
-                      const WeightsInitFun& weights_init_f = WeightsInitFun{})
+                      const WeightsInitFun& weights_init_f = WeightsInitFun())
        : BaseType(),
          params_{params},
          output_spatial_lengths_{params.GetOutputSpatialLengths()},
@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    const ConvParams& params_;
    const std::vector<ck::index_t> output_spatial_lengths_;
    const bool do_init_;
-    const InputInitFun& input_init_f_;
+    InputInitFun input_init_f_;
-    const WeightsInitFun& weights_init_f_;
+    WeightsInitFun weights_init_f_;
 };
 } // namespace conv

--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <algorithm>
+#include <cmath>
 #include <random>
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 namespace ck {
 namespace utils {
-// template <typename T, class Enable = void>
+template <typename T>
-// struct FillUniform;
+struct FillUniformDistribution
+{
+    float a_{-5.f};
+    float b_{5.f};
-// TODO: what's wrong with this specialization???
+    template <typename ForwardIter>
-// err: segmentation fault in mt19937 - infinite loop like.
+    void operator()(ForwardIter first, ForwardIter last) const
-// template <typename T>
+    {
-// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
+        std::mt19937 gen(11939);
-//                                               !std::is_same<T, bhalf_t>::value>::type>
+        std::uniform_real_distribution<float> dis(a_, b_);
-// {
+        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
-//     int a_{0};
+    }
-//     int b_{5};
+};
-//     // T a_ = T{0};
-//     // T b_ = T{5};
-//     template <typename ForwardIter>
+// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
-//     void operator()(ForwardIter first, ForwardIter last) const
+// However this produces segfaults in std::mt19937 which look like an infinite loop.
-//     {
+//      template <typename T>
-//         std::mt19937 gen{11939};
+//      struct FillUniformDistributionIntegerValue
-//         std::uniform_int_distribution<int> dis(a_, b_);
+//      {
-//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//          int a_{-5};
-//     }
+//          int b_{5};
-// };
+//
+//          template <typename ForwardIter>
+//          void operator()(ForwardIter first, ForwardIter last) const
+//          {
+//              std::mt19937 gen(11939);
+//              std::uniform_int_distribution<int> dis(a_, b_);
+//              std::generate(
+//                  first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//          }
+//      };
-// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
+// Workaround for uniform_int_distribution not working as expected. See note above.
-//                                               std::is_same<T, bhalf_t>::value>::type>
 template <typename T>
-struct FillUniform
+struct FillUniformDistributionIntegerValue
 {
-    float a_{0};
+    float a_{-5.f};
-    float b_{5};
+    float b_{5.f};
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
-        std::mt19937 gen{11939};
+        std::mt19937 gen(11939);
-        std::uniform_real_distribution<> dis(a_, b_);
+        std::uniform_real_distribution<float> dis(a_, b_);
-        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+        std::generate(
+            first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
    }
 };

--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <cstdlib>
+#include <iostream>
 #include <limits>
 #include <memory>
 #include <stdexcept>
@@ -8,9 +12,12 @@
 #include <utility>
 #include <vector>
-#include "check_err.hpp"
+#include "ck/utility/functional2.hpp"
-#include "device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "functional2.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace utils {
@@ -78,7 +85,8 @@ class OpInstanceRunEngine
    template <typename ReferenceOp = std::function<void()>>
    OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{})
+                        const ReferenceOp& reference_op = ReferenceOp{},
+                        bool do_verification            = true)
        : op_instance_{op_instance}
    {
        in_tensors_ = op_instance_.GetInputTensors();
@@ -88,8 +96,11 @@ class OpInstanceRunEngine
                                         const Tensor<InArgTypes>&...,
                                         Tensor<OutDataType>&>)
        {
-            ref_output_ = op_instance_.GetOutputTensor();
+            if(do_verification)
-            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            {
+                ref_output_ = op_instance_.GetOutputTensor();
+                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            }
        }
        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
        out_device_buffer_ =
@@ -110,6 +121,7 @@ class OpInstanceRunEngine
                op_ptr.get(), in_device_buffers_, out_device_buffer_);
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
+                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
                invoker->Run(argument.get());
                out_device_buffer_->FromDevice(out_tensor_->mData.data());
                if(!ref_output_)
@@ -119,9 +131,16 @@ class OpInstanceRunEngine
                        " You have to provide reference function.");
                }
                // TODO: enable flexible use of custom check_error functions
-                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
+                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
+                res = res && inst_res;
                out_device_buffer_->SetZero();
            }
+            else
+            {
+                std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
+                          << op_ptr->GetTypeString() << std::endl;
+            }
        }
        return res;
    }
@@ -132,7 +151,6 @@ class OpInstanceRunEngine
                              bool do_verification = false,
                              bool do_log          = false)
    {
-        bool res{true};
        ProfileBestConfig best_config;
        for(auto& op_ptr : op_ptrs)
@@ -153,7 +171,7 @@ class OpInstanceRunEngine
                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                          << " GB/s, " << op_name << std::endl;
-                if(tflops < best_config.best_tflops)
+                if(avg_time < best_config.best_avg_time)
                {
                    best_config.best_op_name    = op_name;
                    best_config.best_tflops     = tflops;
@@ -171,7 +189,7 @@ class OpInstanceRunEngine
                            " You have to provide reference function.");
                    }
                    // TODO: enable flexible use of custom check_error functions
-                    res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+                    CheckErr(out_tensor_->mData, ref_output_->mData);
                    if(do_log) {}
                }
@@ -223,7 +241,7 @@ class OpInstanceRunEngine
    template <typename T>
    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
    {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
    }
 };

--- a/library/src/host_tensor/CMakeLists.txt
+++ b/library/src/host_tensor/CMakeLists.txt
 ## host_tensor
-include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}/include/ck
-    ${PROJECT_SOURCE_DIR}/include/ck/utility
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
-)
 set(HOST_TENSOR_SOURCE
-    device.cpp
+    device_memory.cpp
    host_tensor.cpp
 )

--- a/library/src/host_tensor/device.cpp
+++ b/library/src/host_tensor/device.cpp
-#include "device.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/device_utility/hip_check_error.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
 DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
 {
@@ -22,49 +26,3 @@ void DeviceMem::FromDevice(void* p)
 void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
 DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
-struct KernelTimerImpl
-{
-    KernelTimerImpl()
-    {
-        hip_check_error(hipEventCreate(&mStart));
-        hip_check_error(hipEventCreate(&mEnd));
-    }
-    ~KernelTimerImpl()
-    {
-        hip_check_error(hipEventDestroy(mStart));
-        hip_check_error(hipEventDestroy(mEnd));
-    }
-    void Start()
-    {
-        hip_check_error(hipDeviceSynchronize());
-        hip_check_error(hipEventRecord(mStart, nullptr));
-    }
-    void End()
-    {
-        hip_check_error(hipEventRecord(mEnd, nullptr));
-        hip_check_error(hipEventSynchronize(mEnd));
-    }
-    float GetElapsedTime() const
-    {
-        float time;
-        hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
-        return time;
-    }
-    hipEvent_t mStart, mEnd;
-};
-KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}
-KernelTimer::~KernelTimer() {}
-void KernelTimer::Start() { impl->Start(); }
-void KernelTimer::End() { impl->End(); }
-float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
--- a/library/src/host_tensor/host_tensor.cpp
+++ b/library/src/host_tensor/host_tensor.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <cassert>
-#include "host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 void HostTensorDescriptor::CalculateStrides()
 {

--- a/library/src/obselete_driver_offline/CMakeLists.txt
+++ b/library/src/obselete_driver_offline/CMakeLists.txt
-include_directories(BEFORE
-    include
-    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
-    ${PROJECT_SOURCE_DIR}/host/device/include
-    ${PROJECT_SOURCE_DIR}/host/solver/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
-    ${PROJECT_SOURCE_DIR}/external/rocm/include
-)
-set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
-set(CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_fwd_driver_offline_nchwc.cpp)
-set(CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_add_fwd_driver_offline_nchwc.cpp)
-set(CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_maxpool_fwd_driver_offline_nchwc.cpp)
-set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
-set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
-set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)
-add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
-add_executable(conv_fwd_driver_offline_nchwc ${CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
-add_executable(conv_add_fwd_driver_offline_nchwc ${CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
-add_executable(conv_maxpool_fwd_driver_offline_nchwc ${CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
-add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
-add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
-add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})
-target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
-target_link_libraries(conv_fwd_driver_offline_nchwc PRIVATE host_tensor)
-target_link_libraries(conv_add_fwd_driver_offline_nchwc PRIVATE host_tensor)
-target_link_libraries(conv_maxpool_fwd_driver_offline_nchwc PRIVATE host_tensor)
-target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
-target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
-target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
--- a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "debug.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "conv_common.hpp"
-#include "device_tensor.hpp"
-#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
-#define USE_DYNAMIC_MODE 0
-#define USE_CONV_FWD_V5R1_NCHWC 1
-enum ConvForwardAlgo
-{
-    V5R1NCHWC // 0
-};
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
-                                       const Tensor<TWei>& wei,
-                                       const Tensor<TOut>& add,
-                                       const Tensor<TOut>& bias,
-                                       Tensor<TOut>& add_host,
-                                       Tensor<TOut>& out_host,
-                                       const ConvStrides& conv_strides,
-                                       const ConvDilations& conv_dilations,
-                                       const InLeftPads& in_left_pads,
-                                       const InRightPads&,
-                                       const ck::ActivTypeEnum activ_type)
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
-        double v = 0;
-        auto k   = k0 * out_host.mDesc.GetLengths()[4] + k1;
-        for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
-                    {
-                        for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-                        {
-                            v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
-                                 static_cast<const double>(wei(k, c0, y, x, c1));
-                        }
-                    }
-                }
-            }
-        }
-        v += bias(k0, k1);
-        v = activ(v, activ_type);
-        const int hox2 = ho * 2;
-        const int wox2 = wo * 2;
-        out_host(n, k0, ho, wo, k1) = v;
-        add_host(n, k0, hox2, wox2, k1)         = v + add(n, k0, hox2, wox2, k1);
-        add_host(n, k0, hox2, wox2 + 1, k1)     = v + add(n, k0, hox2, wox2 + 1, k1);
-        add_host(n, k0, hox2 + 1, wox2, k1)     = v + add(n, k0, hox2 + 1, wox2, k1);
-        add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
-    };
-    make_ParallelTensorFunctor(f_nchw,
-                               out_host.mDesc.GetLengths()[0],
-                               out_host.mDesc.GetLengths()[1],
-                               out_host.mDesc.GetLengths()[2],
-                               out_host.mDesc.GetLengths()[3],
-                               out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
-}
-int main(int argc, char* argv[])
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};
-#if USE_DYNAMIC_MODE
-    // dynamic mode
-    if(argc != 23)
-    {
-        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
-        printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(1);
-    }
-    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
-    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
-    const bool do_verification = std::stoi(argv[2]);
-    const int init_method      = std::stoi(argv[3]);
-    const bool do_log          = std::stoi(argv[4]);
-    const int nrepeat          = std::stoi(argv[5]);
-    const index_t N  = std::stoi(argv[6]);
-    const index_t K0 = std::stoi(argv[7]);
-    const index_t K1 = std::stoi(argv[8]);
-    const index_t C0 = std::stoi(argv[9]);
-    const index_t C1 = std::stoi(argv[10]);
-    const index_t Y  = std::stoi(argv[11]);
-    const index_t X  = std::stoi(argv[12]);
-    const index_t Hi = std::stoi(argv[13]);
-    const index_t Wi = std::stoi(argv[14]);
-    const index_t conv_stride_h   = std::stoi(argv[15]);
-    const index_t conv_stride_w   = std::stoi(argv[16]);
-    const index_t conv_dilation_h = std::stoi(argv[17]);
-    const index_t conv_dilation_w = std::stoi(argv[18]);
-    const index_t in_left_pad_h   = std::stoi(argv[19]);
-    const index_t in_left_pad_w   = std::stoi(argv[20]);
-    const index_t in_right_pad_h  = std::stoi(argv[21]);
-    const index_t in_right_pad_w  = std::stoi(argv[22]);
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-    const auto Hox2 = Ho * 2;
-    const auto Wox2 = Wo * 2;
-#else
-    // static mode
-    if(argc < 6)
-    {
-        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
-        exit(1);
-    }
-    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
-    const bool do_verification = std::stoi(argv[2]);
-    const int init_method      = std::stoi(argv[3]);
-    const bool do_log          = std::stoi(argv[4]);
-    const int nrepeat          = std::stoi(argv[5]);
-    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
-#if 0
-    constexpr auto N             = Number<1>{};
-    constexpr auto Hi            = Number<1080>{};
-    constexpr auto Wi            = Number<1920>{};
-    constexpr auto Y             = Number<3>{};
-    constexpr auto X             = Number<3>{};
-    constexpr auto C0            = Number<2>{};
-    constexpr auto C1            = Number<8>{};
-    constexpr auto K1            = Number<8>{};
-    constexpr auto K0            = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<540>{};
-    constexpr auto Wi = Number<960>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<270>{};
-    constexpr auto Wi = Number<480>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#elif 1
-    constexpr auto N  = Number<128>{};
-    constexpr auto Hi = Number<135>{};
-    constexpr auto Wi = Number<240>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#elif 1
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<32>{};
-    constexpr auto Wi = Number<32>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K1 = Number<8>{};
-    constexpr auto K0 = Number<8>{};
-#endif
-    constexpr auto conv_stride_h   = I1;
-    constexpr auto conv_stride_w   = I1;
-    constexpr auto conv_dilation_h = I1;
-    constexpr auto conv_dilation_w = I1;
-    constexpr auto in_left_pad_h   = I1;
-    constexpr auto in_left_pad_w   = I1;
-    constexpr auto in_right_pad_h  = I1;
-    constexpr auto in_right_pad_w  = I1;
-    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
-    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
-    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
-    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
-    constexpr auto Hox2 = Number<Ho * 2>{};
-    constexpr auto Wox2 = Number<Wo * 2>{};
-#endif
-#if 0
-    using in_data_t  = float;
-    using acc_data_t = float;
-    using out_data_t = float;
-#elif 1
-    using in_data_t     = half_t;
-    using acc_data_t    = float;
-    using out_data_t    = half_t;
-#elif 1
-    using in_data_t  = int8_t;
-    using acc_data_t = int32_t;
-    using out_data_t = int8_t;
-#endif
-    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
-        add_lengths_host(5), bias_lengths_host(2);
-    in_lengths_host[0] = static_cast<std::size_t>(N);
-    in_lengths_host[1] = static_cast<std::size_t>(C0);
-    in_lengths_host[2] = static_cast<std::size_t>(Hi);
-    in_lengths_host[3] = static_cast<std::size_t>(Wi);
-    in_lengths_host[4] = static_cast<std::size_t>(C1);
-    wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
-    wei_lengths_host[1] = static_cast<std::size_t>(C0);
-    wei_lengths_host[2] = static_cast<std::size_t>(Y);
-    wei_lengths_host[3] = static_cast<std::size_t>(X);
-    wei_lengths_host[4] = static_cast<std::size_t>(C1);
-    out_lengths_host[0] = static_cast<std::size_t>(N);
-    out_lengths_host[1] = static_cast<std::size_t>(K0);
-    out_lengths_host[2] = static_cast<std::size_t>(Ho);
-    out_lengths_host[3] = static_cast<std::size_t>(Wo);
-    out_lengths_host[4] = static_cast<std::size_t>(K1);
-    add_lengths_host[0] = static_cast<std::size_t>(N);
-    add_lengths_host[1] = static_cast<std::size_t>(K0);
-    add_lengths_host[2] = static_cast<std::size_t>(Hox2);
-    add_lengths_host[3] = static_cast<std::size_t>(Wox2);
-    add_lengths_host[4] = static_cast<std::size_t>(K1);
-    bias_lengths_host[0] = static_cast<std::size_t>(K0);
-    bias_lengths_host[1] = static_cast<std::size_t>(K1);
-    Tensor<in_data_t> in(in_lengths_host);
-    Tensor<in_data_t> wei(wei_lengths_host);
-    Tensor<in_data_t> add(add_lengths_host);
-    Tensor<in_data_t> add_device(add_lengths_host);
-    Tensor<in_data_t> add_host(add_lengths_host);
-    Tensor<out_data_t> bias(bias_lengths_host);
-    Tensor<out_data_t> out_host(out_lengths_host);
-    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
-    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
-    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
-    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
-    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
-    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-    std::size_t num_thread = 1;
-    switch(init_method)
-    {
-    case 0:
-        // no initialization
-        break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 3:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 4:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
-        auto gen_wei = [](auto... is) {
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        wei.GenerateTensorValue(gen_wei, num_thread);
-    }
-    bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-    add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-    auto f_make_for_device_nchwc = [&]() {
-        const auto in_lengths_dev     = make_tuple(N, C0, Hi, Wi, C1);
-        const auto wei_lengths_dev    = make_tuple(K0 * K1, C0, Y, X, C1);
-        const auto add_lengths_dev    = make_tuple(N, K0, Hox2, Wox2, K1);
-        const auto out_lengths_dev    = make_tuple(N, K0, Ho, Wo, K1);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          add_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-#if USE_CONV_FWD_V5R1_NCHWC
-    if(algo == ConvForwardAlgo::V5R1NCHWC)
-    {
-        const auto tmp = f_make_for_device_nchwc();
-        device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
-                                                                                        acc_data_t,
-                                                                                        out_data_t,
-                                                                                        activ_type>(
-            tmp[I0], // in_lengths_dev
-            tmp[I1], // wei_lengths_dev
-            tmp[I2], // add_lengths_dev
-            tmp[I3], // out_lengths_dev
-            tmp[I4], // conv_strides_dev
-            tmp[I5], // conv_dilations_dev
-            tmp[I6], // in_left_pads_dev
-            tmp[I7], // in_right_pads_dev
-            in,
-            wei,
-            bias,
-            add,
-            add_device,
-            nrepeat);
-    }
-#endif
-    if(do_verification)
-    {
-        host_direct_convolution_add_nchwc(in,
-                                          wei,
-                                          add,
-                                          bias,
-                                          add_host,
-                                          out_host,
-                                          make_tuple(conv_stride_h, conv_stride_w),
-                                          make_tuple(conv_dilation_h, conv_dilation_w),
-                                          make_tuple(in_left_pad_h, in_left_pad_w),
-                                          make_tuple(in_right_pad_h, in_right_pad_w),
-                                          activ_type);
-        ck::utils::check_err(add_device.mData, add_host.mData);
-        if(do_log)
-        {
-            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
-        }
-    }
-}
--- a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "debug.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "conv_common.hpp"
-#include "device_tensor.hpp"
-#include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
-#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
-#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp"
-#define USE_MODE 1
-#define USE_CONV_BWD_V4R1_XDL_NHWC 0
-#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
-enum ConvTensorLayout
-{
-    NCHW,
-    NHWC,
-    CHWN,
-    NCHWc,
-    NHWCc
-};
-enum ConvBackwardDataAlgo
-{
-    V4R1XDLNHWC,   // 0
-    V4R1R2XDLNHWC, // 1
-};
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_convolution_backward_data(Tensor<TIn>& in,
-                                    const Tensor<TWei>& wei,
-                                    const Tensor<TOut>& out,
-                                    const ConvStrides& conv_strides,
-                                    const ConvDilations& conv_dilations,
-                                    const InLeftPads& in_left_pads,
-                                    const InRightPads& /* in_right_pads */,
-                                    const ConvTensorLayout layout = ConvTensorLayout::NCHW)
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
-        std::size_t K = wei.mDesc.GetLengths()[I0];
-        std::size_t Y = wei.mDesc.GetLengths()[I2];
-        std::size_t X = wei.mDesc.GetLengths()[I3];
-        std::size_t Ho = out.mDesc.GetLengths()[I2];
-        std::size_t Wo = out.mDesc.GetLengths()[I3];
-        double v = 0;
-        for(int y = 0; y < Y; ++y)
-        {
-            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
-            if(h_tmp % conv_strides[I0] == 0)
-            {
-                int ho = h_tmp / conv_strides[I0];
-                if(ho >= 0 && ho < Ho)
-                {
-                    for(int x = 0; x < X; ++x)
-                    {
-                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
-                        if(w_tmp % conv_strides[I1] == 0)
-                        {
-                            int wo = w_tmp / conv_strides[I1];
-                            if(wo >= 0 && wo < Wo)
-                            {
-                                for(int k = 0; k < K; ++k)
-                                {
-                                    v += out(n, k, ho, wo) * wei(k, c, y, x);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        in(n, c, hi, wi) = v;
-    };
-    auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
-        std::size_t K = wei.mDesc.GetLengths()[I0];
-        std::size_t Y = wei.mDesc.GetLengths()[I1];
-        std::size_t X = wei.mDesc.GetLengths()[I2];
-        std::size_t Ho = out.mDesc.GetLengths()[I1];
-        std::size_t Wo = out.mDesc.GetLengths()[I2];
-        double v = 0;
-        for(int y = 0; y < Y; ++y)
-        {
-            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
-            if(h_tmp % conv_strides[I0] == 0)
-            {
-                int ho = h_tmp / conv_strides[I0];
-                if(ho >= 0 && ho < Ho)
-                {
-                    for(int x = 0; x < X; ++x)
-                    {
-                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
-                        if(w_tmp % conv_strides[I1] == 0)
-                        {
-                            int wo = w_tmp / conv_strides[I1];
-                            if(wo >= 0 && wo < Wo)
-                            {
-                                for(int k = 0; k < K; ++k)
-                                {
-                                    v += out(n, ho, wo, k) * wei(k, y, x, c);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        in(n, hi, wi, c) = v;
-    };
-    if(layout == ConvTensorLayout::NCHW)
-    {
-        make_ParallelTensorFunctor(f_nchw,
-                                   in.mDesc.GetLengths()[0],
-                                   in.mDesc.GetLengths()[1],
-                                   in.mDesc.GetLengths()[2],
-                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    }
-    else if(layout == ConvTensorLayout::NHWC)
-    {
-        make_ParallelTensorFunctor(f_nhwc,
-                                   in.mDesc.GetLengths()[0],
-                                   in.mDesc.GetLengths()[1],
-                                   in.mDesc.GetLengths()[2],
-                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    }
-    else
-    {
-        throw std::runtime_error("wrong! not supported layout");
-    }
-}
-int main(int argc, char* argv[])
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-#if USE_MODE
-    // dynamic mode
-    if(argc != 22)
-    {
-        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
-        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
-        exit(1);
-    }
-    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
-    const bool do_verification      = std::stoi(argv[3]);
-    const int init_method           = std::stoi(argv[4]);
-    const bool do_log               = std::stoi(argv[5]);
-    const int nrepeat               = std::stoi(argv[6]);
-    const index_t N  = std::stoi(argv[7]);
-    const index_t K  = std::stoi(argv[8]);
-    const index_t C  = std::stoi(argv[9]);
-    const index_t Y  = std::stoi(argv[10]);
-    const index_t X  = std::stoi(argv[11]);
-    const index_t Hi = std::stoi(argv[12]);
-    const index_t Wi = std::stoi(argv[13]);
-    const index_t conv_stride_h   = std::stoi(argv[14]);
-    const index_t conv_stride_w   = std::stoi(argv[15]);
-    const index_t conv_dilation_h = std::stoi(argv[16]);
-    const index_t conv_dilation_w = std::stoi(argv[17]);
-    const index_t in_left_pad_h   = std::stoi(argv[18]);
-    const index_t in_left_pad_w   = std::stoi(argv[19]);
-    const index_t in_right_pad_h  = std::stoi(argv[20]);
-    const index_t in_right_pad_w  = std::stoi(argv[21]);
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-#else
-    // static mode
-    if(argc < 7)
-    {
-        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
-        exit(1);
-    }
-    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
-    const bool do_verification      = std::stoi(argv[3]);
-    const int init_method           = std::stoi(argv[4]);
-    const bool do_log               = std::stoi(argv[5]);
-    const int nrepeat               = std::stoi(argv[6]);
-    constexpr auto N  = Number<128>{};
-    constexpr auto C  = Number<192>{};
-    constexpr auto Hi = Number<71>{};
-    constexpr auto Wi = Number<71>{};
-    constexpr auto K  = Number<256>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto conv_stride_h   = I2;
-    constexpr auto conv_stride_w   = I2;
-    constexpr auto conv_dilation_h = I1;
-    constexpr auto conv_dilation_w = I1;
-    constexpr auto in_left_pad_h   = I1;
-    constexpr auto in_left_pad_w   = I1;
-    constexpr auto in_right_pad_h  = I1;
-    constexpr auto in_right_pad_w  = I1;
-    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
-    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
-    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
-    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
-#endif
-#if 0
-    using in_data_t                  = float;
-    using acc_data_t                 = float;
-    using out_data_t                 = float;
-#elif 1
-    using in_data_t   = half_t;
-    using acc_data_t  = float;
-    using out_data_t  = half_t;
-#endif
-    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
-    if(layout == ConvTensorLayout::NCHW)
-    {
-        in_lengths_host[0]  = static_cast<std::size_t>(N);
-        in_lengths_host[1]  = static_cast<std::size_t>(C);
-        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
-        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(C);
-        wei_lengths_host[2] = static_cast<std::size_t>(Y);
-        wei_lengths_host[3] = static_cast<std::size_t>(X);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(K);
-        out_lengths_host[2] = static_cast<std::size_t>(Ho);
-        out_lengths_host[3] = static_cast<std::size_t>(Wo);
-    }
-    else if(layout == ConvTensorLayout::NHWC)
-    {
-        in_lengths_host[0]  = static_cast<std::size_t>(N);
-        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
-        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
-        in_lengths_host[3]  = static_cast<std::size_t>(C);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(Y);
-        wei_lengths_host[2] = static_cast<std::size_t>(X);
-        wei_lengths_host[3] = static_cast<std::size_t>(C);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(Ho);
-        out_lengths_host[2] = static_cast<std::size_t>(Wo);
-        out_lengths_host[3] = static_cast<std::size_t>(K);
-    }
-    else
-    {
-        throw std::runtime_error("wrong! not implemented");
-    }
-    Tensor<in_data_t> in_host(in_lengths_host);
-    Tensor<in_data_t> in_device(in_lengths_host);
-    Tensor<in_data_t> wei(wei_lengths_host);
-    Tensor<out_data_t> out(out_lengths_host);
-    std::cout << "layout: " << layout << std::endl;
-    ostream_HostTensorDescriptor(in_host.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
-    ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
-    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
-    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
-    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
-    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-    std::size_t num_thread = 1;
-    switch(init_method)
-    {
-    case 0:
-        // no initialization
-        break;
-    case 1:
-        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        break;
-    case 2:
-        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        break;
-    case 3:
-        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        break;
-    case 4:
-        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        break;
-    case 5:
-        out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
-        break;
-    default:
-        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{1, 5}, num_thread);
-        auto gen_wei = [](auto... is) {
-            return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        wei.GenerateTensorValue(gen_wei, num_thread);
-    }
-    auto f_make_for_device_nhwc = [&]() {
-#if USE_MODE
-        const auto in_lengths_dev     = make_tuple(N, Hi, Wi, C);
-        const auto wei_lengths_dev    = make_tuple(K, Y, X, C);
-        const auto out_lengths_dev    = make_tuple(N, Ho, Wo, K);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-#else
-        const auto in_lengths_dev =
-            make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
-        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
-        const auto out_lengths_dev =
-            make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
-        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
-        const auto conv_dilations_dev =
-            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
-        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
-        const auto in_right_pads_dev =
-            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
-#endif
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-#if USE_CONV_BWD_V4R1_XDL_NHWC
-    if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<in_data_t,
-                                                                                  acc_data_t,
-                                                                                  out_data_t>(
-            tmp[I0],
-            tmp[I1],
-            tmp[I2],
-            tmp[I3],
-            tmp[I4],
-            tmp[I5],
-            tmp[I6],
-            in_device,
-            wei,
-            out,
-            nrepeat);
-    }
-#endif
-#if USE_CONV_BWD_V4R1R2_XDL_NHWC
-    if(algo == ConvBackwardDataAlgo::V4R1R2XDLNHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        if(Y == 1 && X == 1 && in_left_pad_h == 0 && in_left_pad_w == 0 && in_right_pad_h == 0 &&
-           in_right_pad_w == 0)
-        {
-            device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1<
-                in_data_t,
-                acc_data_t,
-                out_data_t>(tmp[I0],
-                            tmp[I1],
-                            tmp[I2],
-                            tmp[I3],
-                            tmp[I4],
-                            tmp[I5],
-                            tmp[I6],
-                            in_device,
-                            wei,
-                            out,
-                            nrepeat);
-        }
-        else
-        {
-#if 1
-            device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<in_data_t,
-                                                                                        acc_data_t,
-                                                                                        out_data_t>(
-                tmp[I0],
-                tmp[I1],
-                tmp[I2],
-                tmp[I3],
-                tmp[I4],
-                tmp[I5],
-                tmp[I6],
-                in_device,
-                wei,
-                out,
-                nrepeat);
-#endif
-        }
-    }
-#endif
-    if(do_verification)
-    {
-        host_convolution_backward_data(in_host,
-                                       wei,
-                                       out,
-                                       make_tuple(conv_stride_h, conv_stride_w),
-                                       make_tuple(conv_dilation_h, conv_dilation_w),
-                                       make_tuple(in_left_pad_h, in_left_pad_w),
-                                       make_tuple(in_right_pad_h, in_right_pad_w),
-                                       layout);
-        ck::utils::check_err(in_device.mData, in_host.mData);
-        if(do_log)
-        {
-            LogRangeAsType<float>(std::cout << "out : ", out.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "in_host  : ", in_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "in_device: ", in_device.mData, ",") << std::endl;
-        }
-    }
-}
--- a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "debug.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "conv_common.hpp"
-#include "device_tensor.hpp"
-#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
-#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
-#define USE_DYNAMIC_MODE 1 // 1: main parses the problem sizes from argv; 0: main uses compile-time Number<> constants
-#define USE_CONV_FWD_V4R4_NCHW 0 // non-zero compiles the ConvForwardAlgo::V4R4NCHW dispatch branch into main
-#define USE_CONV_FWD_V4R4R2_NHWC 0 // non-zero compiles the ConvForwardAlgo::V4R4R2NHWC dispatch branch into main
-#define USE_CONV_FWD_V6R1_NCHW 0 // non-zero compiles the ConvForwardAlgo::V6R1NCHW dispatch branch into main
-#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 // non-zero compiles the ConvForwardAlgo::V4R4R2XDLNCHW dispatch branch into main
-#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1 // non-zero compiles the ConvForwardAlgo::V4R4R4XDLNHWC dispatch branch into main
-enum ConvTensorLayout // tensor layout tag; main casts the integer given in argv[1] to this type
-{
-    NCHW,  // 0: supported by host_convolution_forward
-    NHWC,  // 1: supported by host_convolution_forward
-    CHWN,  // 2: host_convolution_forward throws for this value
-    NCHWc, // 3: host_convolution_forward throws for this value
-    NHWCc  // 4: host_convolution_forward throws for this value
-};
-enum ConvForwardAlgo // device kernel selector; main casts the integer given in argv[2] to this type
-{
-    V4R4NCHW,      // 0: its dispatch branch in main requires ConvTensorLayout::NCHW
-    V4R4R2NHWC,    // 1: its dispatch branch in main requires ConvTensorLayout::NHWC
-    V6R1NCHW,      // 2: its dispatch branch in main requires ConvTensorLayout::NCHW
-    V4R4R2XDLNCHW, // 3: its dispatch branch in main requires ConvTensorLayout::NCHW
-    V4R4R4XDLNHWC  // 4: its dispatch branch in main requires ConvTensorLayout::NHWC
-};
-// Host-side reference implementation of forward convolution; main uses it to check the
-// tensor produced by the device kernel.
-//
-// `in` and `wei` are only read; every element of `out` is overwritten. Element indexing
-// depends on `layout`:
-//   NCHW: in(n, c, hi, wi), wei(k, c, y, x), out(n, k, ho, wo)
-//   NHWC: in(n, hi, wi, c), wei(k, y, x, c), out(n, ho, wo, k)
-// Filter taps whose input coordinate falls outside the input extents contribute nothing.
-// The right-pads argument is accepted but never read. The accumulator is a double; when
-// TIn is bhalf_t each product is formed in float first.
-//
-// Throws std::runtime_error for any layout other than NCHW or NHWC.
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_convolution_forward(const Tensor<TIn>& in,
-                              const Tensor<TWei>& wei,
-                              Tensor<TOut>& out,
-                              const ConvStrides& conv_strides,
-                              const ConvDilations& conv_dilations,
-                              const InLeftPads& in_left_pads,
-                              const InRightPads&,
-                              const ConvTensorLayout layout = ConvTensorLayout::NCHW)
-{
-    using namespace ck;
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    // Produces the single output element out(n, k, ho, wo) for the NCHW arrangement.
-    auto reduce_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double acc = 0;
-
-        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-
-                    const bool inside = hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                                        wi < in.mDesc.GetLengths()[3];
-                    if(!inside)
-                    {
-                        continue; // this tap reads from outside the input: nothing to add
-                    }
-
-                    if constexpr(is_same<TIn, bhalf_t>::value)
-                    {
-                        const float in_val  = ck::type_convert<float>(in(n, c, hi, wi));
-                        const float wei_val = ck::type_convert<float>(wei(k, c, y, x));
-                        acc += in_val * wei_val;
-                    }
-                    else
-                    {
-                        const double in_val  = static_cast<double>(in(n, c, hi, wi));
-                        const double wei_val = static_cast<double>(wei(k, c, y, x));
-                        acc += in_val * wei_val;
-                    }
-                }
-            }
-        }
-
-        if constexpr(is_same<TOut, bhalf_t>::value)
-        {
-            out(n, k, ho, wo) = ck::type_convert<bhalf_t>(static_cast<float>(acc));
-        }
-        else
-        {
-            out(n, k, ho, wo) = acc;
-        }
-    };
-
-    // Produces the single output element out(n, ho, wo, k) for the NHWC arrangement.
-    auto reduce_nhwc = [&](auto n, auto ho, auto wo, auto k) {
-        double acc = 0;
-
-        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
-            {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-
-                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
-                {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-
-                    const bool inside = hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
-                                        wi < in.mDesc.GetLengths()[2];
-                    if(!inside)
-                    {
-                        continue; // this tap reads from outside the input: nothing to add
-                    }
-
-                    if constexpr(is_same<TIn, bhalf_t>::value)
-                    {
-                        const float in_val  = ck::type_convert<float>(in(n, hi, wi, c));
-                        const float wei_val = ck::type_convert<float>(wei(k, y, x, c));
-                        acc += in_val * wei_val;
-                    }
-                    else
-                    {
-                        const double in_val  = static_cast<double>(in(n, hi, wi, c));
-                        const double wei_val = static_cast<double>(wei(k, y, x, c));
-                        acc += in_val * wei_val;
-                    }
-                }
-            }
-        }
-
-        if constexpr(is_same<TOut, bhalf_t>::value)
-        {
-            out(n, ho, wo, k) = ck::type_convert<bhalf_t>(static_cast<float>(acc));
-        }
-        else
-        {
-            out(n, ho, wo, k) = acc;
-        }
-    };
-
-    // Run the per-element lambda over all four output dimensions.
-    switch(layout)
-    {
-    case ConvTensorLayout::NCHW: {
-        make_ParallelTensorFunctor(reduce_nchw,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-        break;
-    }
-    case ConvTensorLayout::NHWC: {
-        make_ParallelTensorFunctor(reduce_nhwc,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-        break;
-    }
-    default: throw std::runtime_error("wrong! not supported layout");
-    }
-}
-// Entry point of the offline forward-convolution driver.
-//
-// Command line (all arguments are integers parsed with std::stoi):
-//   argv[1..6]  : layout, algo, do_verification, init_method, do_log, nrepeat
-//   argv[7..21] : N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-//                 (read only when USE_DYNAMIC_MODE is non-zero; otherwise the problem size
-//                 is fixed by the Number<> constants below)
-//
-// Flow: build host tensors for the chosen layout, fill `in` / `wei` according to
-// init_method, run whichever device kernel branch is compiled in and matches `algo`,
-// then optionally compare the device output against host_convolution_forward.
-//
-// Prints usage and calls exit(1) on a wrong argument count. Throws std::runtime_error
-// when the layout is neither NCHW nor NHWC, or when `algo` and `layout` do not match.
-int main(int argc, char* argv[])
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-#if USE_DYNAMIC_MODE
-    // dynamic mode
-    if(argc != 22)
-    {
-        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
-        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
-        exit(1);
-    }
-    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
-    const bool do_verification    = std::stoi(argv[3]);
-    const int init_method         = std::stoi(argv[4]);
-    const bool do_log             = std::stoi(argv[5]);
-    const int nrepeat             = std::stoi(argv[6]);
-    const index_t N  = std::stoi(argv[7]);
-    const index_t K  = std::stoi(argv[8]);
-    const index_t C  = std::stoi(argv[9]);
-    const index_t Y  = std::stoi(argv[10]);
-    const index_t X  = std::stoi(argv[11]);
-    const index_t Hi = std::stoi(argv[12]);
-    const index_t Wi = std::stoi(argv[13]);
-    const index_t conv_stride_h   = std::stoi(argv[14]);
-    const index_t conv_stride_w   = std::stoi(argv[15]);
-    const index_t conv_dilation_h = std::stoi(argv[16]);
-    const index_t conv_dilation_w = std::stoi(argv[17]);
-    const index_t in_left_pad_h   = std::stoi(argv[18]);
-    const index_t in_left_pad_w   = std::stoi(argv[19]);
-    const index_t in_right_pad_h  = std::stoi(argv[20]);
-    const index_t in_right_pad_w  = std::stoi(argv[21]);
-    // effective filter extent once dilation is applied
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
-    // output spatial size derived from padded input, effective filter and stride
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-#else
-    // static mode
-    if(argc < 7)
-    {
-        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
-        exit(1);
-    }
-    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
-    const bool do_verification    = std::stoi(argv[3]);
-    const int init_method         = std::stoi(argv[4]);
-    const bool do_log             = std::stoi(argv[5]);
-    const int nrepeat             = std::stoi(argv[6]);
-    constexpr auto N  = Number<128>{};
-    constexpr auto C  = Number<192>{};
-    constexpr auto Hi = Number<71>{};
-    constexpr auto Wi = Number<71>{};
-    constexpr auto K  = Number<256>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto conv_stride_h   = I1;
-    constexpr auto conv_stride_w   = I1;
-    constexpr auto conv_dilation_h = I1;
-    constexpr auto conv_dilation_w = I1;
-    constexpr auto in_left_pad_h   = I1;
-    constexpr auto in_left_pad_w   = I1;
-    constexpr auto in_right_pad_h  = I1;
-    constexpr auto in_right_pad_w  = I1;
-    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
-    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
-    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
-    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
-#endif
-    // Element types: only the first preprocessor branch whose condition is non-zero is
-    // compiled, so edit these conditions to switch precision.
-#if 1
-    using in_data_t  = float;
-    using acc_data_t = float;
-    using out_data_t = float;
-#elif 1
-    using in_data_t   = half_t;
-    using acc_data_t  = float;
-    using out_data_t  = half_t;
-#elif 0
-    using in_data_t  = bhalf_t;
-    using acc_data_t = float;
-    using out_data_t = bhalf_t;
-#elif 1
-    using in_data_t  = int8_t;
-    using acc_data_t = int32_t;
-    using out_data_t = int8_t;
-#endif
-    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
-    if(layout == ConvTensorLayout::NCHW)
-    {
-        in_lengths_host[0]  = static_cast<std::size_t>(N);
-        in_lengths_host[1]  = static_cast<std::size_t>(C);
-        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
-        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(C);
-        wei_lengths_host[2] = static_cast<std::size_t>(Y);
-        wei_lengths_host[3] = static_cast<std::size_t>(X);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(K);
-        out_lengths_host[2] = static_cast<std::size_t>(Ho);
-        out_lengths_host[3] = static_cast<std::size_t>(Wo);
-    }
-    else if(layout == ConvTensorLayout::NHWC)
-    {
-        in_lengths_host[0]  = static_cast<std::size_t>(N);
-        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
-        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
-        in_lengths_host[3]  = static_cast<std::size_t>(C);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(Y);
-        wei_lengths_host[2] = static_cast<std::size_t>(X);
-        wei_lengths_host[3] = static_cast<std::size_t>(C);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(Ho);
-        out_lengths_host[2] = static_cast<std::size_t>(Wo);
-        out_lengths_host[3] = static_cast<std::size_t>(K);
-    }
-    else
-    {
-        // Must actually throw: without `throw` the exception object is built and
-        // discarded, and execution would continue with all-zero tensor lengths.
-        throw std::runtime_error("wrong! not implemented");
-    }
-    Tensor<in_data_t> in(in_lengths_host);
-    Tensor<in_data_t> wei(wei_lengths_host);
-    Tensor<out_data_t> out_host(out_lengths_host);
-    Tensor<out_data_t> out_device(out_lengths_host);
-    std::cout << "layout: " << layout << std::endl;
-    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
-    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
-    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
-    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
-    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
-    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-    std::size_t num_thread = 1;
-    // init_method picks the generators used to fill `in` and `wei`
-    switch(init_method)
-    {
-    case 0:
-        // no initialization
-        break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        break;
-    case 3:
-        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        break;
-    case 4:
-        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        break;
-    case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
-        auto gen_wei = [](auto... is) {
-            return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        wei.GenerateTensorValue(gen_wei, num_thread);
-    }
-    // Packs lengths, strides, dilations and pads in NCHW order for the device kernels.
-    auto f_make_for_device_nchw = [&]() {
-        const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
-        const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
-        const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-    // Same as above, with the lengths in NHWC order.
-    auto f_make_for_device_nhwc = [&]() {
-        const auto in_lengths_dev     = make_tuple(N, Hi, Wi, C);
-        const auto wei_lengths_dev    = make_tuple(K, Y, X, C);
-        const auto out_lengths_dev    = make_tuple(N, Ho, Wo, K);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-#if USE_CONV_FWD_V4R4_NCHW
-    if(algo == ConvForwardAlgo::V4R4NCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-        device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t,
-                                                                           acc_data_t,
-                                                                           out_data_t>(tmp[I0],
-                                                                                       tmp[I1],
-                                                                                       tmp[I2],
-                                                                                       tmp[I3],
-                                                                                       tmp[I4],
-                                                                                       tmp[I5],
-                                                                                       tmp[I6],
-                                                                                       in,
-                                                                                       wei,
-                                                                                       out_device,
-                                                                                       nrepeat);
-    }
-#endif
-#if USE_CONV_FWD_V4R4R2_NHWC
-    if(algo == ConvForwardAlgo::V4R4R2NHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t,
-                                                                             acc_data_t,
-                                                                             out_data_t>(tmp[I0],
-                                                                                         tmp[I1],
-                                                                                         tmp[I2],
-                                                                                         tmp[I3],
-                                                                                         tmp[I4],
-                                                                                         tmp[I5],
-                                                                                         tmp[I6],
-                                                                                         in,
-                                                                                         wei,
-                                                                                         out_device,
-                                                                                         nrepeat);
-    }
-#endif
-#if USE_CONV_FWD_V6R1_NCHW
-    if(algo == ConvForwardAlgo::V6R1NCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-        device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t,
-                                                                           acc_data_t,
-                                                                           out_data_t>(tmp[I0],
-                                                                                       tmp[I1],
-                                                                                       tmp[I2],
-                                                                                       tmp[I3],
-                                                                                       tmp[I4],
-                                                                                       tmp[I5],
-                                                                                       tmp[I6],
-                                                                                       in,
-                                                                                       wei,
-                                                                                       out_device,
-                                                                                       nrepeat);
-    }
-#endif
-#if USE_CONV_FWD_V4R4R2_XDL_NCHW
-    if(algo == ConvForwardAlgo::V4R4R2XDLNCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-        device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
-                                                                              acc_data_t,
-                                                                              out_data_t>(
-            tmp[I0],
-            tmp[I1],
-            tmp[I2],
-            tmp[I3],
-            tmp[I4],
-            tmp[I5],
-            tmp[I6],
-            in,
-            wei,
-            out_device,
-            nrepeat);
-    }
-#endif
-#if USE_CONV_FWD_V4R4R4_XDL_NHWC
-    if(algo == ConvForwardAlgo::V4R4R4XDLNHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
-                                                                              acc_data_t,
-                                                                              out_data_t>(
-            tmp[I0],
-            tmp[I1],
-            tmp[I2],
-            tmp[I3],
-            tmp[I4],
-            tmp[I5],
-            tmp[I6],
-            in,
-            wei,
-            out_device,
-            nrepeat);
-    }
-#endif
-    if(do_verification)
-    {
-        // compute the reference on the host, then compare it with the device output
-        host_convolution_forward(in,
-                                 wei,
-                                 out_host,
-                                 make_tuple(conv_stride_h, conv_stride_w),
-                                 make_tuple(conv_dilation_h, conv_dilation_w),
-                                 make_tuple(in_left_pad_h, in_left_pad_w),
-                                 make_tuple(in_right_pad_h, in_right_pad_w),
-                                 layout);
-        ck::utils::check_err(out_device.mData, out_host.mData);
-        if(do_log)
-        {
-            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
-        }
-    }
-}
--- a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "debug.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "conv_common.hpp"
-#include "device_tensor.hpp"
-#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
-#define USE_DYNAMIC_MODE 0 // tested by "#if USE_DYNAMIC_MODE" in main; non-zero enables the argv-parsing path that expects argc == 23
-#define USE_CONV_FWD_V5R1_NCHWC 1 // NOTE(review): its use is not visible here; presumably gates the V5R1NCHWC dispatch branch in main - confirm
-enum ConvForwardAlgo // device kernel selector; the dynamic-mode path of main casts the integer in argv[1] to this type
-{
-    V5R1NCHWC // 0
-};
-// Host-side direct convolution over 5-index tensors, followed by a bias add and an
-// activation.
-//
-// Indexing used by the code:
-//   in(n, c0, hi, wi, c1), wei(k, c0, y, x, c1), bias(k0, k1), out(n, k0, ho, wo, k1)
-// where the flat filter index is k = k0 * out.mDesc.GetLengths()[4] + k1.
-// Filter taps whose input coordinate falls outside the input extents contribute nothing.
-// The right-pads argument is accepted but never read. Every element of `out` is
-// overwritten with activ(sum + bias, activ_type).
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_direct_convolution_nchwc(const Tensor<TIn>& in,
-                                   const Tensor<TWei>& wei,
-                                   const Tensor<TOut>& bias,
-                                   Tensor<TOut>& out,
-                                   const ConvStrides& conv_strides,
-                                   const ConvDilations& conv_dilations,
-                                   const InLeftPads& in_left_pads,
-                                   const InRightPads&,
-                                   const ck::ActivTypeEnum activ_type)
-{
-    using namespace ck;
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    // Produces the single output element out(n, k0, ho, wo, k1).
-    auto reduce_one = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
-        double acc  = 0;
-        const int k = k0 * out.mDesc.GetLengths()[4] + k1;
-
-        for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-
-                    const bool inside = hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                                        wi < in.mDesc.GetLengths()[3];
-                    if(!inside)
-                    {
-                        continue; // this tap reads from outside the input: nothing to add
-                    }
-
-                    for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-                    {
-                        const double in_val  = static_cast<double>(in(n, c0, hi, wi, c1));
-                        const double wei_val = static_cast<double>(wei(k, c0, y, x, c1));
-                        acc += in_val * wei_val;
-                    }
-                }
-            }
-        }
-
-        acc += bias(k0, k1);
-        out(n, k0, ho, wo, k1) = activ(acc, activ_type);
-    };
-
-    // Run the per-element lambda over all five output dimensions.
-    make_ParallelTensorFunctor(reduce_one,
-                               out.mDesc.GetLengths()[0],
-                               out.mDesc.GetLengths()[1],
-                               out.mDesc.GetLengths()[2],
-                               out.mDesc.GetLengths()[3],
-                               out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
-}
-int main(int argc, char* argv[]) // Driver: builds in/wei/bias tensors, runs the selected device convolution, optionally checks it against the host reference.
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{}; // I0..I6 index the 7-element tuple returned by f_make_for_device_nchwc below
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-#if USE_DYNAMIC_MODE
-    // dynamic mode
-    if(argc != 23) // program name + 5 control arguments + 17 problem-size arguments
-    {
-        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
-        printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(1);
-    }
-    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
-    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
-    const bool do_verification = std::stoi(argv[2]);
-    const int init_method      = std::stoi(argv[3]);
-    const bool do_log          = std::stoi(argv[4]);
-    const int nrepeat          = std::stoi(argv[5]);
-    const index_t N  = std::stoi(argv[6]);
-    const index_t K0 = std::stoi(argv[7]);
-    const index_t K1 = std::stoi(argv[8]);
-    const index_t C0 = std::stoi(argv[9]);
-    const index_t C1 = std::stoi(argv[10]);
-    const index_t Y  = std::stoi(argv[11]);
-    const index_t X  = std::stoi(argv[12]);
-    const index_t Hi = std::stoi(argv[13]);
-    const index_t Wi = std::stoi(argv[14]);
-    const index_t conv_stride_h   = std::stoi(argv[15]);
-    const index_t conv_stride_w   = std::stoi(argv[16]);
-    const index_t conv_dilation_h = std::stoi(argv[17]);
-    const index_t conv_dilation_w = std::stoi(argv[18]);
-    const index_t in_left_pad_h   = std::stoi(argv[19]);
-    const index_t in_left_pad_w   = std::stoi(argv[20]);
-    const index_t in_right_pad_h  = std::stoi(argv[21]);
-    const index_t in_right_pad_w  = std::stoi(argv[22]);
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1; // filter height after dilation
-    const index_t XEff = (X - 1) * conv_dilation_w + 1; // filter width after dilation
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; // output height from padded input, dilated filter and stride
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; // output width, same formula
-#else
-    // static mode
-    if(argc < 6) // only the 5 control arguments are read; the problem size is fixed at compile time below
-    {
-        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
-        exit(1);
-    }
-    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
-    const bool do_verification = std::stoi(argv[2]);
-    const int init_method      = std::stoi(argv[3]);
-    const bool do_log          = std::stoi(argv[4]);
-    const int nrepeat          = std::stoi(argv[5]);
-    // constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
-    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
-#if 0
-    constexpr auto N              = Number<1>{};
-    constexpr auto Hi             = Number<1080>{};
-    constexpr auto Wi             = Number<1920>{};
-    constexpr auto Y              = Number<3>{};
-    constexpr auto X              = Number<3>{};
-    constexpr auto C0             = Number<2>{};
-    constexpr auto C1             = Number<8>{};
-    constexpr auto K0             = Number<1>{};
-    constexpr auto K1             = Number<4>{};
-#elif 1 // the only enabled shape: 1 x (2*8) x 1080 x 1920 input, 3x3 filter, K0 = 2, K1 = 8
-    constexpr auto N              = Number<1>{};
-    constexpr auto Hi             = Number<1080>{};
-    constexpr auto Wi             = Number<1920>{};
-    constexpr auto Y              = Number<3>{};
-    constexpr auto X              = Number<3>{};
-    constexpr auto C0             = Number<2>{};
-    constexpr auto C1             = Number<8>{};
-    constexpr auto K0             = Number<2>{};
-    constexpr auto K1             = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<1080>{};
-    constexpr auto Wi = Number<1920>{};
-    constexpr auto Y  = Number<1>{};
-    constexpr auto X  = Number<1>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<540>{};
-    constexpr auto Wi = Number<960>{};
-    constexpr auto Y  = Number<1>{};
-    constexpr auto X  = Number<1>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<128>{};
-    constexpr auto Hi = Number<270>{};
-    constexpr auto Wi = Number<480>{};
-    constexpr auto Y  = Number<1>{};
-    constexpr auto X  = Number<1>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#endif
-    constexpr auto conv_stride_h   = I1;
-    constexpr auto conv_stride_w   = I1;
-    constexpr auto conv_dilation_h = I1;
-    constexpr auto conv_dilation_w = I1;
-#if 1 // enabled: padding of 1 on every side
-    constexpr auto in_left_pad_h   = I1;
-    constexpr auto in_left_pad_w   = I1;
-    constexpr auto in_right_pad_h  = I1;
-    constexpr auto in_right_pad_w  = I1;
-#else
-    constexpr auto in_left_pad_h  = I0;
-    constexpr auto in_left_pad_w  = I0;
-    constexpr auto in_right_pad_h = I0;
-    constexpr auto in_right_pad_w = I0;
-#endif
-    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1; // filter height after dilation
-    constexpr auto XEff = (X - I1) * conv_dilation_w + I1; // filter width after dilation
-    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1; // output height, same formula as dynamic mode
-    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1; // output width
-#endif
-#if 0
-    using in_data_t  = float;
-    using acc_data_t = float;
-    using out_data_t = float;
-#elif 1 // enabled: half_t input/output with float accumulation
-    using in_data_t   = half_t;
-    using acc_data_t  = float;
-    using out_data_t  = half_t;
-#elif 1 // never selected: the preceding #elif 1 already matched
-    using in_data_t  = int8_t;
-    using acc_data_t = int32_t;
-    using out_data_t = int8_t;
-#endif
-    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
-        bias_lengths_host(2);
-    in_lengths_host[0] = static_cast<std::size_t>(N); // in: (N, C0, Hi, Wi, C1)
-    in_lengths_host[1] = static_cast<std::size_t>(C0);
-    in_lengths_host[2] = static_cast<std::size_t>(Hi);
-    in_lengths_host[3] = static_cast<std::size_t>(Wi);
-    in_lengths_host[4] = static_cast<std::size_t>(C1);
-    wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1); // wei: (K0 * K1, C0, Y, X, C1)
-    wei_lengths_host[1] = static_cast<std::size_t>(C0);
-    wei_lengths_host[2] = static_cast<std::size_t>(Y);
-    wei_lengths_host[3] = static_cast<std::size_t>(X);
-    wei_lengths_host[4] = static_cast<std::size_t>(C1);
-    out_lengths_host[0] = static_cast<std::size_t>(N); // out: (N, K0, Ho, Wo, K1)
-    out_lengths_host[1] = static_cast<std::size_t>(K0);
-    out_lengths_host[2] = static_cast<std::size_t>(Ho);
-    out_lengths_host[3] = static_cast<std::size_t>(Wo);
-    out_lengths_host[4] = static_cast<std::size_t>(K1);
-    bias_lengths_host[0] = static_cast<std::size_t>(K0); // bias: (K0, K1)
-    bias_lengths_host[1] = static_cast<std::size_t>(K1);
-    Tensor<in_data_t> in(in_lengths_host);
-    Tensor<in_data_t> wei(wei_lengths_host);
-    Tensor<out_data_t> bias(bias_lengths_host);
-    Tensor<out_data_t> out_host(out_lengths_host); // filled by host_direct_convolution_nchwc when verifying
-    Tensor<out_data_t> out_device(out_lengths_host); // passed to the device convolution call
-    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
-    ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
-    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
-    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
-    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
-    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
-    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-    std::size_t num_thread = 1; // passed to every GenerateTensorValue call below
-    switch(init_method) // argv[3] picks which generators fill in / wei / bias
-    {
-    case 0:
-        // no initialization
-        break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 3:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 4:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
-        bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
-        break;
-    default: // NOTE(review): unlike cases 1-5, this branch never calls bias.GenerateTensorValue
-        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
-        auto gen_wei = [](auto... is) { // weight value = GeneratorTensor_2 value times GeneratorTensor_Checkboard value at the same index
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        wei.GenerateTensorValue(gen_wei, num_thread);
-    }
-    auto f_make_for_device_nchwc = [&]() { // packs lengths, strides, dilations and pads into one 7-element tuple for the device call
-        const auto in_lengths_dev     = make_tuple(N, C0, Hi, Wi, C1);
-        const auto wei_lengths_dev    = make_tuple(K0 * K1, C0, Y, X, C1);
-        const auto out_lengths_dev    = make_tuple(N, K0, Ho, Wo, K1);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-#if USE_CONV_FWD_V5R1_NCHWC
-    if(algo == ConvForwardAlgo::V5R1NCHWC)
-    {
-        const auto tmp = f_make_for_device_nchwc();
-        device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
-                                                                                    acc_data_t,
-                                                                                    out_data_t,
-                                                                                    activ_type>(
-            tmp[I0], // in_lengths_dev
-            tmp[I1], // wei_lengths_dev
-            tmp[I2], // out_lengths_dev
-            tmp[I3], // conv_strides_dev
-            tmp[I4], // conv_dilations_dev
-            tmp[I5], // in_left_pads_dev
-            tmp[I6], // in_right_pads_dev
-            in,
-            wei,
-            bias,
-            out_device,
-            nrepeat);
-    }
-#endif
-    if(do_verification)
-    {
-        host_direct_convolution_nchwc(in, // recompute the convolution on the CPU into out_host
-                                      wei,
-                                      bias,
-                                      out_host,
-                                      make_tuple(conv_stride_h, conv_stride_w),
-                                      make_tuple(conv_dilation_h, conv_dilation_w),
-                                      make_tuple(in_left_pad_h, in_left_pad_w),
-                                      make_tuple(in_right_pad_h, in_right_pad_w),
-                                      activ_type);
-        ck::utils::check_err(out_device.mData, out_host.mData); // device buffer vs. host reference; the return value is not used here
-        if(do_log)
-        {
-            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
-        }
-    }
-}
--- a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "debug.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "conv_common.hpp"
-#include "device_tensor.hpp"
-#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
-#define USE_DYNAMIC_MODE 0 // 0: main() uses its compile-time "static mode" branch; non-zero: the argv-driven "dynamic mode" branch
-#define USE_CONV_FWD_V5R1_NCHWC 1 // non-zero compiles the V5R1NCHWC conv+maxpool device call inside main()
-enum ConvForwardAlgo // algorithm selector; main() obtains it by casting std::stoi(argv[1])
-{
-    V5R1NCHWC // 0
-};
-template <typename TIn, // CPU reference for convolution + bias + activation followed by a 2x2 max over the result.
-          typename TWei, // Pass 1 fills out_host exactly like a plain forward convolution;
-          typename TOut, // pass 2 writes into max_host the maximum of each 2x2 group of out_host values.
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in, // input, read as in(n, c0, hi, wi, c1)
-                                           const Tensor<TWei>& wei, // weights, read as wei(k, c0, y, x, c1)
-                                           const Tensor<TOut>& bias, // bias, read as bias(k0, k1)
-                                           Tensor<TOut>& out_host, // convolution result, written in pass 1 and read in pass 2
-                                           Tensor<TOut>& max_host, // pooled result, written as max_host(n, k0, ho, wo, k1)
-                                           const ConvStrides& conv_strides, // [I0] multiplies ho, [I1] multiplies wo
-                                           const ConvDilations& conv_dilations, // [I0] multiplies y, [I1] multiplies x
-                                           const InLeftPads& in_left_pads, // [I0]/[I1] are subtracted when computing hi/wi
-                                           const InRightPads&, // unnamed: the right padding is never read in this function
-                                           const ck::ActivTypeEnum activ_type) // forwarded unchanged to activ()
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { // pass 1: computes one element of out_host
-        double v = 0; // accumulator is double regardless of TIn / TWei
-        auto k   = k0 * out_host.mDesc.GetLengths()[4] + k1; // flattened (k0, k1) index used for wei's first dimension
-        for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0]; // input row for this tap; can be negative
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1]; // input column for this tap; can be negative
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && // taps that fall outside the input are skipped,
-                       wi < in.mDesc.GetLengths()[3]) // i.e. they contribute zero to the sum
-                    {
-                        for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-                        {
-                            v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
-                                 static_cast<const double>(wei(k, c0, y, x, c1));
-                        }
-                    }
-                }
-            }
-        }
-        v += bias(k0, k1); // bias is added before the activation
-        v = activ(v, activ_type);
-        out_host(n, k0, ho, wo, k1) = v;
-    };
-    make_ParallelTensorFunctor(f_nchw, // invoke f_nchw over the five dimensions of out_host
-                               out_host.mDesc.GetLengths()[0],
-                               out_host.mDesc.GetLengths()[1],
-                               out_host.mDesc.GetLengths()[2],
-                               out_host.mDesc.GetLengths()[3],
-                               out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency()); // presumably the worker-thread count — confirm against make_ParallelTensorFunctor
-    auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { // pass 2: computes one element of max_host
-        auto hx = ho * 2; // top-left corner of the 2x2 group in out_host
-        auto wx = wo * 2;
-        auto v0 = out_host(n, k0, hx, wx, k1);
-        auto v1 = out_host(n, k0, hx, wx + 1, k1);
-        auto v2 = out_host(n, k0, hx + 1, wx, k1);
-        auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
-        max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
-    };
-    make_ParallelTensorFunctor(maxpool_nchw, // invoke maxpool_nchw over the five dimensions of max_host
-                               max_host.mDesc.GetLengths()[0],
-                               max_host.mDesc.GetLengths()[1],
-                               max_host.mDesc.GetLengths()[2], // assumes 2 * this length <= out_host's height so hx + 1 stays in range — caller's responsibility
-                               max_host.mDesc.GetLengths()[3], // same assumption for the width and wx + 1
-                               max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
-}
-int main(int argc, char* argv[]) // Driver: builds in/wei/bias tensors, runs the selected device conv+maxpool, optionally checks both outputs against the host reference.
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{}; // I0..I7 index the 8-element tuple returned by f_make_for_device_nchwc below
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};
-#if USE_DYNAMIC_MODE
-    // dynamic mode
-    if(argc != 23) // program name + 5 control arguments + 17 problem-size arguments
-    {
-        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
-        printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(1);
-    }
-    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
-    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
-    const bool do_verification = std::stoi(argv[2]);
-    const int init_method      = std::stoi(argv[3]);
-    const bool do_log          = std::stoi(argv[4]);
-    const int nrepeat          = std::stoi(argv[5]);
-    const index_t N  = std::stoi(argv[6]);
-    const index_t K0 = std::stoi(argv[7]);
-    const index_t K1 = std::stoi(argv[8]);
-    const index_t C0 = std::stoi(argv[9]);
-    const index_t C1 = std::stoi(argv[10]);
-    const index_t Y  = std::stoi(argv[11]);
-    const index_t X  = std::stoi(argv[12]);
-    const index_t Hi = std::stoi(argv[13]);
-    const index_t Wi = std::stoi(argv[14]);
-    const index_t conv_stride_h   = std::stoi(argv[15]);
-    const index_t conv_stride_w   = std::stoi(argv[16]);
-    const index_t conv_dilation_h = std::stoi(argv[17]);
-    const index_t conv_dilation_w = std::stoi(argv[18]);
-    const index_t in_left_pad_h   = std::stoi(argv[19]);
-    const index_t in_left_pad_w   = std::stoi(argv[20]);
-    const index_t in_right_pad_h  = std::stoi(argv[21]);
-    const index_t in_right_pad_w  = std::stoi(argv[22]);
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1; // filter height after dilation
-    const index_t XEff = (X - 1) * conv_dilation_w + 1; // filter width after dilation
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; // convolution output height
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; // convolution output width
-    const index_t Ho_2 = Ho / 2; // pooled height (integer division)
-    const index_t Wo_2 = Wo / 2; // pooled width (integer division)
-#else
-    // static mode
-    if(argc < 6) // only the 5 control arguments are read; the problem size is fixed at compile time below
-    {
-        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
-        exit(1);
-    }
-    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
-    const bool do_verification = std::stoi(argv[2]);
-    const int init_method      = std::stoi(argv[3]);
-    const bool do_log          = std::stoi(argv[4]);
-    const int nrepeat          = std::stoi(argv[5]);
-    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
-#if 1 // the only enabled shape: 1 x (2*8) x 1080 x 1920 input, 3x3 filter, K0 = 2, K1 = 8
-    constexpr auto N                       = Number<1>{};
-    constexpr auto Hi                      = Number<1080>{};
-    constexpr auto Wi                      = Number<1920>{};
-    constexpr auto Y                       = Number<3>{};
-    constexpr auto X                       = Number<3>{};
-    constexpr auto C0                      = Number<2>{};
-    constexpr auto C1                      = Number<8>{};
-    constexpr auto K0                      = Number<2>{};
-    constexpr auto K1                      = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<1080>{};
-    constexpr auto Wi = Number<1920>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<3>{};
-    constexpr auto C1 = Number<4>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<540>{};
-    constexpr auto Wi = Number<960>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#elif 0
-    constexpr auto N  = Number<128>{};
-    constexpr auto Hi = Number<270>{};
-    constexpr auto Wi = Number<480>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
-#endif
-    constexpr auto conv_stride_h   = I1;
-    constexpr auto conv_stride_w   = I1;
-    constexpr auto conv_dilation_h = I1;
-    constexpr auto conv_dilation_w = I1;
-    constexpr auto in_left_pad_h   = I1;
-    constexpr auto in_left_pad_w   = I1;
-    constexpr auto in_right_pad_h  = I1;
-    constexpr auto in_right_pad_w  = I1;
-    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1; // filter height after dilation
-    constexpr auto XEff = (X - I1) * conv_dilation_w + I1; // filter width after dilation
-    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1; // convolution output height
-    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1; // convolution output width
-    constexpr auto Ho_2 = Number<Ho / 2>{}; // pooled height
-    constexpr auto Wo_2 = Number<Wo / 2>{}; // pooled width
-#endif
-#if 0
-    using in_data_t  = float;
-    using acc_data_t = float;
-    using out_data_t = float;
-#elif 1 // enabled: half_t input/output with float accumulation
-    using in_data_t     = half_t;
-    using acc_data_t    = float;
-    using out_data_t    = half_t;
-#elif 1 // never selected: the preceding #elif 1 already matched
-    using in_data_t  = int8_t;
-    using acc_data_t = int32_t;
-    using out_data_t = int8_t;
-#endif
-    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
-        max_lengths_host(5), bias_lengths_host(2);
-    in_lengths_host[0] = static_cast<std::size_t>(N); // in: (N, C0, Hi, Wi, C1)
-    in_lengths_host[1] = static_cast<std::size_t>(C0);
-    in_lengths_host[2] = static_cast<std::size_t>(Hi);
-    in_lengths_host[3] = static_cast<std::size_t>(Wi);
-    in_lengths_host[4] = static_cast<std::size_t>(C1);
-    wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1); // wei: (K0 * K1, C0, Y, X, C1)
-    wei_lengths_host[1] = static_cast<std::size_t>(C0);
-    wei_lengths_host[2] = static_cast<std::size_t>(Y);
-    wei_lengths_host[3] = static_cast<std::size_t>(X);
-    wei_lengths_host[4] = static_cast<std::size_t>(C1);
-    out_lengths_host[0] = static_cast<std::size_t>(N); // out: (N, K0, Ho, Wo, K1)
-    out_lengths_host[1] = static_cast<std::size_t>(K0);
-    out_lengths_host[2] = static_cast<std::size_t>(Ho);
-    out_lengths_host[3] = static_cast<std::size_t>(Wo);
-    out_lengths_host[4] = static_cast<std::size_t>(K1);
-    max_lengths_host[0] = static_cast<std::size_t>(N); // max: (N, K0, Ho / 2, Wo / 2, K1)
-    max_lengths_host[1] = static_cast<std::size_t>(K0);
-    max_lengths_host[2] = static_cast<std::size_t>(Ho_2);
-    max_lengths_host[3] = static_cast<std::size_t>(Wo_2);
-    max_lengths_host[4] = static_cast<std::size_t>(K1);
-    bias_lengths_host[0] = static_cast<std::size_t>(K0); // bias: (K0, K1)
-    bias_lengths_host[1] = static_cast<std::size_t>(K1);
-    Tensor<in_data_t> in(in_lengths_host);
-    Tensor<in_data_t> wei(wei_lengths_host);
-    Tensor<out_data_t> bias(bias_lengths_host);
-    Tensor<out_data_t> out_device(out_lengths_host);
-    Tensor<out_data_t> out_host(out_lengths_host);
-    Tensor<in_data_t> max_device(max_lengths_host); // NOTE(review): declared with in_data_t, yet the host reference takes max_host as Tensor<TOut>&;
-    Tensor<in_data_t> max_host(max_lengths_host); // this only lines up because in_data_t and out_data_t are the same type in every type branch above
-    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
-    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
-    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
-    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
-    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-    std::size_t num_thread = 1; // passed to every GenerateTensorValue call below
-    switch(init_method) // argv[3] picks which generators fill in / wei; bias is filled after the switch
-    {
-    case 0:
-        // no initialization
-        break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 3:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 4:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
-        auto gen_wei = [](auto... is) { // weight value = GeneratorTensor_2 value times GeneratorTensor_Checkboard value at the same index
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        wei.GenerateTensorValue(gen_wei, num_thread);
-    }
-    bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread); // bias uses the same generator for every init_method, including 0
-    auto f_make_for_device_nchwc = [&]() { // packs lengths, strides, dilations and pads into one 8-element tuple for the device call
-        const auto in_lengths_dev     = make_tuple(N, C0, Hi, Wi, C1);
-        const auto wei_lengths_dev    = make_tuple(K0 * K1, C0, Y, X, C1);
-        const auto max_lengths_dev    = make_tuple(N, K0, Ho_2, Wo_2, K1);
-        const auto out_lengths_dev    = make_tuple(N, K0, Ho, Wo, K1);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          max_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-#if USE_CONV_FWD_V5R1_NCHWC
-    if(algo == ConvForwardAlgo::V5R1NCHWC)
-    {
-        const auto tmp = f_make_for_device_nchwc();
-        device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<
-            in_data_t,
-            acc_data_t,
-            out_data_t,
-            activ_type>(tmp[I0], // in_lengths_dev
-                        tmp[I1], // wei_lengths_dev
-                        tmp[I2], // max_lengths_dev
-                        tmp[I3], // out_lengths_dev
-                        tmp[I4], // conv_strides_dev
-                        tmp[I5], // conv_dilations_dev
-                        tmp[I6], // in_left_pads_dev
-                        tmp[I7], // in_right_pads_dev
-                        in,
-                        wei,
-                        bias,
-                        out_device,
-                        max_device,
-                        nrepeat);
-    }
-#endif
-    if(do_verification)
-    {
-        host_direct_convolution_maxpool_nchwc(in, // recompute convolution and 2x2 max on the CPU into out_host / max_host
-                                              wei,
-                                              bias,
-                                              out_host,
-                                              max_host,
-                                              make_tuple(conv_stride_h, conv_stride_w),
-                                              make_tuple(conv_dilation_h, conv_dilation_w),
-                                              make_tuple(in_left_pad_h, in_left_pad_w),
-                                              make_tuple(in_right_pad_h, in_right_pad_w),
-                                              activ_type);
-        ck::utils::check_err(out_device.mData, out_host.mData); // convolution output: device vs. host; return value not used here
-        ck::utils::check_err(max_device.mData, max_host.mData); // pooled output: device vs. host; return value not used here
-        if(do_log)
-        {
-            // LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-            // LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-            // LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
-            // std::endl;
-            LogRangeAsType<float>(std::cout << "max_host: ", max_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "max_device: ", max_device.mData, ",") << std::endl;
-        }
-    }
-}
--- a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "debug.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "conv_common.hpp"
-#include "device_tensor.hpp"
-#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
-#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
-#include "device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk.hpp"
-enum ConvTensorLayout // tensor layout tag; type of the `layout` parameter of host_convolution_backward_weight, which defaults to NCHW
-{
-    NCHW,
-    NHWC,
-    CHWN,
-    NCHWc,
-    NHWCc
-};
-#define USE_DYNAMIC_MODE 1 // compile-time switch; its uses lie outside this excerpt
-#define USE_CONV_WRW_V4R4R2_XDL_NCHW 0 // one USE_CONV_WRW_* switch exists per ConvBackwardWeightAlgo entry below;
-#define USE_CONV_WRW_V4R4R4_XDL_NHWC 0 // presumably each gates the matching device call — confirm at the use sites
-#define USE_CONV_WRW_V4R4R2_XDL_ATOMIC_NCHW 0
-#define USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC 0
-#define USE_CONV_WRW_V4R4R5_XDL_ATOMIC_NHWC 1 // the only switch set to 1
-enum ConvBackwardWeightAlgo // backward-weight algorithm selector; the trailing comments give each enumerator's integer value
-{
-    V4R4R2XDLNCHW,       // 0
-    V4R4R4XDLNHWC,       // 1
-    V4R4R2XDLATOMICNCHW, // 2
-    V4R4R4XDLATOMICNHWC, // 3
-    V4R4R5XDLATOMICNHWC, // 4
-};
-// Host-side reference for the convolution backward-weight pass.
-//
-// For every weight element the function sums in(...) * out(...) over the batch
-// dimension and over all output positions whose matching input position lies
-// inside the input tensor; positions that fall outside are skipped. The sum is
-// accumulated in double and then assigned to the weight element.
-//
-//   out, in          read-only tensors, indexed in the order implied by `layout`
-//   wei              result tensor; every element is overwritten
-//   conv_strides     (h, w) strides, indexed with Number<0> / Number<1>
-//   conv_dilations   (h, w) dilations, indexed the same way
-//   in_left_pads     (h, w) left paddings, indexed the same way
-//   (unnamed)        right paddings; accepted but never read
-//   layout           NCHW or NHWC; any other value throws std::runtime_error
-//
-// The work is dispatched through make_ParallelTensorFunctor over the four weight
-// dimensions, with std::thread::hardware_concurrency() passed as its argument.
-template <typename TOut,
-          typename TIn,
-          typename TWei,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void host_convolution_backward_weight(const Tensor<TOut>& out,
-                                      const Tensor<TIn>& in,
-                                      Tensor<TWei>& wei,
-                                      const ConvStrides& conv_strides,
-                                      const ConvDilations& conv_dilations,
-                                      const InLeftPads& in_left_pads,
-                                      const InRightPads&,
-                                      const ConvTensorLayout layout = ConvTensorLayout::NCHW)
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    // Fetch the length containers once instead of on every loop iteration.
-    const auto& out_lengths = out.mDesc.GetLengths();
-    const auto& in_lengths  = in.mDesc.GetLengths();
-    // NCHW: wei(k, c, y, x), in(n, c, hi, wi), out(n, k, ho, wo).
-    auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
-        // Loop bounds as int so they compare against the int counters without
-        // a signed/unsigned mix.
-        const int n_len  = static_cast<int>(out_lengths[0]);
-        const int ho_len = static_cast<int>(out_lengths[2]);
-        const int wo_len = static_cast<int>(out_lengths[3]);
-        const int hi_len = static_cast<int>(in_lengths[2]);
-        const int wi_len = static_cast<int>(in_lengths[3]);
-        double v = 0;
-        for(int n = 0; n < n_len; ++n)
-        {
-            for(int ho = 0; ho < ho_len; ++ho)
-            {
-                const int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                if(hi < 0 || hi >= hi_len)
-                {
-                    // The whole output row maps outside the input: nothing to add.
-                    continue;
-                }
-                for(int wo = 0; wo < wo_len; ++wo)
-                {
-                    const int wi =
-                        wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                    if(wi >= 0 && wi < wi_len)
-                    {
-                        v += static_cast<double>(in(n, c, hi, wi)) *
-                             static_cast<double>(out(n, k, ho, wo));
-                    }
-                }
-            }
-        }
-        wei(k, c, y, x) = v;
-    };
-    // NHWC: wei(k, y, x, c), in(n, hi, wi, c), out(n, ho, wo, k).
-    auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
-        const int n_len  = static_cast<int>(out_lengths[0]);
-        const int ho_len = static_cast<int>(out_lengths[1]);
-        const int wo_len = static_cast<int>(out_lengths[2]);
-        const int hi_len = static_cast<int>(in_lengths[1]);
-        const int wi_len = static_cast<int>(in_lengths[2]);
-        double v = 0;
-        for(int n = 0; n < n_len; ++n)
-        {
-            for(int ho = 0; ho < ho_len; ++ho)
-            {
-                const int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                if(hi < 0 || hi >= hi_len)
-                {
-                    // The whole output row maps outside the input: nothing to add.
-                    continue;
-                }
-                for(int wo = 0; wo < wo_len; ++wo)
-                {
-                    const int wi =
-                        wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                    if(wi >= 0 && wi < wi_len)
-                    {
-                        v += static_cast<double>(in(n, hi, wi, c)) *
-                             static_cast<double>(out(n, ho, wo, k));
-                    }
-                }
-            }
-        }
-        wei(k, y, x, c) = v;
-    };
-    if(layout == ConvTensorLayout::NCHW)
-    {
-        make_ParallelTensorFunctor(f_kcyx,
-                                   wei.mDesc.GetLengths()[0],
-                                   wei.mDesc.GetLengths()[1],
-                                   wei.mDesc.GetLengths()[2],
-                                   wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    }
-    else if(layout == ConvTensorLayout::NHWC)
-    {
-        make_ParallelTensorFunctor(f_kyxc,
-                                   wei.mDesc.GetLengths()[0],
-                                   wei.mDesc.GetLengths()[1],
-                                   wei.mDesc.GetLengths()[2],
-                                   wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-    }
-    else
-    {
-        throw std::runtime_error("wrong! not supported layout");
-    }
-}
-// Driver for the convolution backward-weight kernels.
-//
-// Command line (USE_DYNAMIC_MODE == 1, exactly 22 arguments):
-//   argv[1..6]   layout, algo, do_verification, init_method, do_log, nrepeat
-//   argv[7..21]  N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-//   argv[22]     desired_grid_size
-// With USE_DYNAMIC_MODE == 0 only argv[1..6] are read and the problem size is a
-// set of compile-time constants.
-//
-// The program prints a usage message and exits with status 1 when the argument
-// count is wrong. All numbers are parsed with std::stoi, which throws on
-// non-numeric input. An unsupported layout, or a layout that does not match the
-// selected algorithm, raises std::runtime_error.
-//
-// Flow: size the host tensors, fill `in` and `out` according to init_method,
-// zero `wei_device`, run the device kernel whose USE_CONV_WRW_* flag is enabled
-// and whose enumerator equals `algo`, then optionally compare against the host
-// reference and log the tensors.
-int main(int argc, char* argv[])
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-#if USE_DYNAMIC_MODE
-    // dynamic mode
-    if(argc != 23)
-    {
-        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
-        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
-        printf("additional: desired_grid_size\n");
-        exit(1);
-    }
-    const ConvTensorLayout layout     = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvBackwardWeightAlgo algo = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
-    const bool do_verification        = std::stoi(argv[3]);
-    const int init_method             = std::stoi(argv[4]);
-    const bool do_log                 = std::stoi(argv[5]);
-    const int nrepeat                 = std::stoi(argv[6]);
-    const index_t N  = std::stoi(argv[7]);
-    const index_t K  = std::stoi(argv[8]);
-    const index_t C  = std::stoi(argv[9]);
-    const index_t Y  = std::stoi(argv[10]);
-    const index_t X  = std::stoi(argv[11]);
-    const index_t Hi = std::stoi(argv[12]);
-    const index_t Wi = std::stoi(argv[13]);
-    const index_t conv_stride_h   = std::stoi(argv[14]);
-    const index_t conv_stride_w   = std::stoi(argv[15]);
-    const index_t conv_dilation_h = std::stoi(argv[16]);
-    const index_t conv_dilation_w = std::stoi(argv[17]);
-    const index_t in_left_pad_h   = std::stoi(argv[18]);
-    const index_t in_left_pad_w   = std::stoi(argv[19]);
-    const index_t in_right_pad_h  = std::stoi(argv[20]);
-    const index_t in_right_pad_w  = std::stoi(argv[21]);
-    const index_t desired_grid_size = std::stoi(argv[22]);
-    // Effective filter extent once dilation is applied.
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
-    // Output spatial size derived from padded input, effective filter and stride.
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-#else
-    // static mode
-    // NOTE(review): desired_grid_size is not defined in this branch, yet the
-    // atomic kernel calls below pass it; confirm the intended flag combinations.
-    if(argc < 7)
-    {
-        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
-        exit(1);
-    }
-    const ConvTensorLayout layout     = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvBackwardWeightAlgo algo = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
-    const bool do_verification        = std::stoi(argv[3]);
-    const int init_method             = std::stoi(argv[4]);
-    const bool do_log                 = std::stoi(argv[5]);
-    const int nrepeat                 = std::stoi(argv[6]);
-    constexpr auto N  = Number<128>{};
-    constexpr auto C  = Number<128>{};
-    constexpr auto Hi = Number<14>{};
-    constexpr auto Wi = Number<14>{};
-    constexpr auto K  = Number<256>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto conv_stride_h   = I1;
-    constexpr auto conv_stride_w   = I1;
-    constexpr auto conv_dilation_h = I1;
-    constexpr auto conv_dilation_w = I1;
-    constexpr auto in_left_pad_h   = I1;
-    constexpr auto in_left_pad_w   = I1;
-    constexpr auto in_right_pad_h  = I1;
-    constexpr auto in_right_pad_w  = I1;
-    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
-    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
-    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
-    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
-#endif
-    // Element types; the first branch whose condition is non-zero wins
-    // (currently half_t input/output with float accumulation and weights).
-#if 0
-    using in_data_t  = float;
-    using wei_data_t = float;
-    using acc_data_t = float;
-    using out_data_t = float;
-#elif 1
-    using in_data_t   = half_t;
-    using out_data_t  = half_t;
-    using acc_data_t  = float;
-    using wei_data_t  = float;
-#elif 1
-    using in_data_t  = int8_t;
-    using out_data_t = int8_t;
-    using acc_data_t = int32_t;
-    using wei_data_t = int8_t;
-#endif
-    // Host tensor extents, ordered according to the requested layout.
-    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
-    if(layout == ConvTensorLayout::NCHW)
-    {
-        in_lengths_host[0]  = static_cast<std::size_t>(N);
-        in_lengths_host[1]  = static_cast<std::size_t>(C);
-        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
-        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(C);
-        wei_lengths_host[2] = static_cast<std::size_t>(Y);
-        wei_lengths_host[3] = static_cast<std::size_t>(X);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(K);
-        out_lengths_host[2] = static_cast<std::size_t>(Ho);
-        out_lengths_host[3] = static_cast<std::size_t>(Wo);
-    }
-    else if(layout == ConvTensorLayout::NHWC)
-    {
-        in_lengths_host[0]  = static_cast<std::size_t>(N);
-        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
-        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
-        in_lengths_host[3]  = static_cast<std::size_t>(C);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(Y);
-        wei_lengths_host[2] = static_cast<std::size_t>(X);
-        wei_lengths_host[3] = static_cast<std::size_t>(C);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(Ho);
-        out_lengths_host[2] = static_cast<std::size_t>(Wo);
-        out_lengths_host[3] = static_cast<std::size_t>(K);
-    }
-    else
-    {
-        // The exception must actually be thrown; merely constructing it would let
-        // execution continue with all-zero tensor extents.
-        throw std::runtime_error("wrong! not implemented");
-    }
-    Tensor<in_data_t> in(in_lengths_host);
-    Tensor<wei_data_t> wei_device(wei_lengths_host);
-    Tensor<wei_data_t> wei_host(wei_lengths_host);
-    Tensor<out_data_t> out(out_lengths_host);
-    std::cout << "layout: " << layout << std::endl;
-    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei_host.mDesc, std::cout << "wei: ");
-    ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
-    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
-    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
-    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
-    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-    // Fill `in` and `out`; init_method picks the generator pair.
-    std::size_t num_thread = 1;
-    switch(init_method)
-    {
-    case 0:
-        // no initialization
-        break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
-        break;
-    case 3:
-        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
-        break;
-    case 4:
-        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
-        break;
-    case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.1, 0.1}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{-0.1, 0.1}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
-        // `out` gets the product of two generators evaluated at the same index.
-        auto gen_out = [](auto... is) {
-            return GeneratorTensor_2<out_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        out.GenerateTensorValue(gen_out, num_thread);
-    }
-    // Bundle lengths, strides, dilations and pads in NCHW order for the device calls.
-    auto f_make_for_device_nchw = [&]() {
-        const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
-        const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
-        const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-    // Same bundle with the lengths in NHWC order.
-    auto f_make_for_device_nhwc = [&]() {
-        const auto in_lengths_dev     = make_tuple(N, Hi, Wi, C);
-        const auto wei_lengths_dev    = make_tuple(K, Y, X, C);
-        const auto out_lengths_dev    = make_tuple(N, Ho, Wo, K);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-    // set zero to wei_device
-    wei_device.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
-    // Each block below exists only when its USE_CONV_WRW_* flag is non-zero and
-    // runs only when `algo` names it. If no block matches, wei_device keeps the
-    // zeros written above.
-#if USE_CONV_WRW_V4R4R2_XDL_NCHW
-    if(algo == ConvBackwardWeightAlgo::V4R4R2XDLNCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-        device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
-                                                                                      wei_data_t,
-                                                                                      acc_data_t,
-                                                                                      out_data_t>(
-            tmp[I0],
-            tmp[I1],
-            tmp[I2],
-            tmp[I3],
-            tmp[I4],
-            tmp[I5],
-            tmp[I6],
-            in,
-            wei_device,
-            out,
-            nrepeat);
-    }
-#endif
-#if USE_CONV_WRW_V4R4R4_XDL_NHWC
-    if(algo == ConvBackwardWeightAlgo::V4R4R4XDLNHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
-                                                                                      wei_data_t,
-                                                                                      acc_data_t,
-                                                                                      out_data_t>(
-            tmp[I0],
-            tmp[I1],
-            tmp[I2],
-            tmp[I3],
-            tmp[I4],
-            tmp[I5],
-            tmp[I6],
-            in,
-            wei_device,
-            out,
-            nrepeat);
-    }
-#endif
-#if USE_CONV_WRW_V4R4R2_XDL_ATOMIC_NCHW
-    if(algo == ConvBackwardWeightAlgo::V4R4R2XDLATOMICNCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-        device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw<
-            in_data_t,
-            wei_data_t,
-            acc_data_t,
-            out_data_t>(tmp[I0],
-                        tmp[I1],
-                        tmp[I2],
-                        tmp[I3],
-                        tmp[I4],
-                        tmp[I5],
-                        tmp[I6],
-                        in,
-                        wei_device,
-                        out,
-                        desired_grid_size,
-                        nrepeat);
-    }
-#endif
-#if USE_CONV_WRW_V4R4R4_XDL_ATOMIC_NHWC
-    if(algo == ConvBackwardWeightAlgo::V4R4R4XDLATOMICNHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk<
-            in_data_t,
-            wei_data_t,
-            acc_data_t,
-            out_data_t>(tmp[I0],
-                        tmp[I1],
-                        tmp[I2],
-                        tmp[I3],
-                        tmp[I4],
-                        tmp[I5],
-                        tmp[I6],
-                        in,
-                        wei_device,
-                        out,
-                        desired_grid_size,
-                        nrepeat);
-    }
-#endif
-#if USE_CONV_WRW_V4R4R5_XDL_ATOMIC_NHWC
-    if(algo == ConvBackwardWeightAlgo::V4R4R5XDLATOMICNHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk<
-            in_data_t,
-            wei_data_t,
-            acc_data_t,
-            out_data_t>(tmp[I0],
-                        tmp[I1],
-                        tmp[I2],
-                        tmp[I3],
-                        tmp[I4],
-                        tmp[I5],
-                        tmp[I6],
-                        in,
-                        wei_device,
-                        out,
-                        desired_grid_size,
-                        nrepeat);
-    }
-#endif
-    if(do_verification)
-    {
-        // Recompute the weights on the host and compare them with the device result.
-        host_convolution_backward_weight(out,
-                                         in,
-                                         wei_host,
-                                         make_tuple(conv_stride_h, conv_stride_w),
-                                         make_tuple(conv_dilation_h, conv_dilation_w),
-                                         make_tuple(in_left_pad_h, in_left_pad_w),
-                                         make_tuple(in_right_pad_h, in_right_pad_w),
-                                         layout);
-        // NOTE(review): any result returned by check_err is not used here, so the
-        // process exit status does not reflect a mismatch; confirm that is intended.
-        ck::utils::check_err(wei_device.mData, wei_host.mData);
-        if(do_log)
-        {
-            LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei_device: ", wei_device.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei_host  : ", wei_host.mData, ",") << std::endl;
-        }
-    }
-}