gaoqiong / composable_kernel / Commits / 88833bd9

Commit 88833bd9 (unverified), authored Sep 21, 2021 by Chao Liu, committed by GitHub on Sep 21, 2021.

Merge pull request #32 from ROCmSoftwarePlatform/develop

Merge develop into master

Parents: 31b40352, f3acd251
Changes: 46. Showing 6 changed files with 503 additions and 10 deletions (+503, -10).
- host/driver_offline/src/gemm_driver_offline.cpp (+294, -0)
- host/host_tensor/include/gemm_common.hpp (+12, -0)
- host/host_tensor/include/host_conv_bwd_weight.hpp (+89, -0)
- host/host_tensor/include/host_gemm.hpp (+87, -0)
- host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp (+4, -4)
- script/run.sh (+17, -6)
host/driver_offline/src/gemm_driver_offline.cpp (new file, mode 100644)
```cpp
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "gemm_common.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdlops_mk_kn_mn.hpp"
#include "device_gemm_xdlops_mk_nk_mn.hpp"
#include "device_gemm_xdlops_km_kn_mn.hpp"
#include "device_gemm_xdlops_km_nk_mn.hpp"

#define USE_GEMM_XDL_MK_KN_MN 1
#define USE_GEMM_XDL_MK_NK_MN 1
#define USE_GEMM_XDL_KM_KN_MN 1
#define USE_GEMM_XDL_KM_NK_MN 1

enum GemmAlgo
{
    Xdl_MK_KN_MN, // 0
    Xdl_MK_NK_MN, // 1
    Xdl_KM_KN_MN, // 2
    Xdl_KM_NK_MN, // 3
};

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    // dynamic mode
    if(argc != 10)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: M, N, K\n");
        exit(1);
    }

    const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
    const auto algo   = static_cast<GemmAlgo>(std::stoi(argv[2]));

    const bool do_verification = std::stoi(argv[3]);
    const int init_method      = std::stoi(argv[4]);
    const bool do_log          = std::stoi(argv[5]);
    const int nrepeat          = std::stoi(argv[6]);

    const index_t M = std::stoi(argv[7]);
    const index_t N = std::stoi(argv[8]);
    const index_t K = std::stoi(argv[9]);

#if 0
    using ab_data_t  = float;
    using acc_data_t = float;
    using c_data_t   = float;
#elif 1
    using ab_data_t  = half_t;
    using acc_data_t = float;
    using c_data_t   = half_t;
#elif 1
    using ab_data_t  = int8_t;
    using acc_data_t = int32_t;
    using c_data_t   = int8_t;
#endif

    std::vector<std::size_t> a_lengths_host(2), b_lengths_host(2), c_lengths_host(2);
    std::vector<std::size_t> a_strides_host(2), b_strides_host(2), c_strides_host(2);

    if(layout == GemmMatrixLayout::MK_KN_MN)
    {
        a_lengths_host[0] = static_cast<std::size_t>(M);
        a_lengths_host[1] = static_cast<std::size_t>(K);
        a_strides_host[0] = static_cast<std::size_t>(K);
        a_strides_host[1] = static_cast<std::size_t>(1);

        b_lengths_host[0] = static_cast<std::size_t>(K);
        b_lengths_host[1] = static_cast<std::size_t>(N);
        b_strides_host[0] = static_cast<std::size_t>(N);
        b_strides_host[1] = static_cast<std::size_t>(1);

        c_lengths_host[0] = static_cast<std::size_t>(M);
        c_lengths_host[1] = static_cast<std::size_t>(N);
        c_strides_host[0] = static_cast<std::size_t>(N);
        c_strides_host[1] = static_cast<std::size_t>(1);
    }
    else if(layout == GemmMatrixLayout::MK_NK_MN)
    {
        a_lengths_host[0] = static_cast<std::size_t>(M);
        a_lengths_host[1] = static_cast<std::size_t>(K);
        a_strides_host[0] = static_cast<std::size_t>(K);
        a_strides_host[1] = static_cast<std::size_t>(1);

        b_lengths_host[0] = static_cast<std::size_t>(N);
        b_lengths_host[1] = static_cast<std::size_t>(K);
        b_strides_host[0] = static_cast<std::size_t>(K);
        b_strides_host[1] = static_cast<std::size_t>(1);

        c_lengths_host[0] = static_cast<std::size_t>(M);
        c_lengths_host[1] = static_cast<std::size_t>(N);
        c_strides_host[0] = static_cast<std::size_t>(N);
        c_strides_host[1] = static_cast<std::size_t>(1);
    }
    else if(layout == GemmMatrixLayout::KM_KN_MN)
    {
        a_lengths_host[0] = static_cast<std::size_t>(K);
        a_lengths_host[1] = static_cast<std::size_t>(M);
        a_strides_host[0] = static_cast<std::size_t>(M);
        a_strides_host[1] = static_cast<std::size_t>(1);

        b_lengths_host[0] = static_cast<std::size_t>(K);
        b_lengths_host[1] = static_cast<std::size_t>(N);
        b_strides_host[0] = static_cast<std::size_t>(N);
        b_strides_host[1] = static_cast<std::size_t>(1);

        c_lengths_host[0] = static_cast<std::size_t>(M);
        c_lengths_host[1] = static_cast<std::size_t>(N);
        c_strides_host[0] = static_cast<std::size_t>(N);
        c_strides_host[1] = static_cast<std::size_t>(1);
    }
    else if(layout == GemmMatrixLayout::KM_NK_MN)
    {
        a_lengths_host[0] = static_cast<std::size_t>(K);
        a_lengths_host[1] = static_cast<std::size_t>(M);
        a_strides_host[0] = static_cast<std::size_t>(M);
        a_strides_host[1] = static_cast<std::size_t>(1);

        b_lengths_host[0] = static_cast<std::size_t>(N);
        b_lengths_host[1] = static_cast<std::size_t>(K);
        b_strides_host[0] = static_cast<std::size_t>(K);
        b_strides_host[1] = static_cast<std::size_t>(1);

        c_lengths_host[0] = static_cast<std::size_t>(M);
        c_lengths_host[1] = static_cast<std::size_t>(N);
        c_strides_host[0] = static_cast<std::size_t>(N);
        c_strides_host[1] = static_cast<std::size_t>(1);
    }
    else
    {
        throw std::runtime_error("wrong! not implemented");
    }
    Tensor<ab_data_t> a(a_lengths_host, a_strides_host);
    Tensor<ab_data_t> b(b_lengths_host, b_strides_host);
    Tensor<c_data_t> c_host(c_lengths_host, c_strides_host);
    Tensor<c_data_t> c_device(c_lengths_host, c_strides_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(a.mDesc, std::cout << "a: ");
    ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
    ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");

    std::size_t num_thread = std::thread::hardware_concurrency();

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        a.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        a.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        a.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        a.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    default:
        a.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        b.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
    }

    auto f_make_for_device_mk_kn_mn = [&]() {
        const auto a_desc = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1));
        const auto b_desc = make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(N, I1));
        const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));

        return make_tuple(a_desc, b_desc, c_desc);
    };

    auto f_make_for_device_mk_nk_mn = [&]() {
        const auto a_desc = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1));
        const auto b_desc = make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(K, I1));
        const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));

        return make_tuple(a_desc, b_desc, c_desc);
    };

    auto f_make_for_device_km_kn_mn = [&]() {
        const auto a_desc = make_naive_tensor_descriptor(make_tuple(K, M), make_tuple(M, I1));
        const auto b_desc = make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(N, I1));
        const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));

        return make_tuple(a_desc, b_desc, c_desc);
    };

    auto f_make_for_device_km_nk_mn = [&]() {
        const auto a_desc = make_naive_tensor_descriptor(make_tuple(K, M), make_tuple(M, I1));
        const auto b_desc = make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(K, I1));
        const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));

        return make_tuple(a_desc, b_desc, c_desc);
    };

#if USE_GEMM_XDL_MK_KN_MN
    if(algo == GemmAlgo::Xdl_MK_KN_MN)
    {
        if(layout != GemmMatrixLayout::MK_KN_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto descs = f_make_for_device_mk_kn_mn();

        device_gemm_xdlops_mk_kn_mn<ab_data_t, acc_data_t, c_data_t>(
            descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_MK_NK_MN
    if(algo == GemmAlgo::Xdl_MK_NK_MN)
    {
        if(layout != GemmMatrixLayout::MK_NK_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto descs = f_make_for_device_mk_nk_mn();

        device_gemm_xdlops_mk_nk_mn<ab_data_t, acc_data_t, c_data_t>(
            descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_KM_KN_MN
    if(algo == GemmAlgo::Xdl_KM_KN_MN)
    {
        if(layout != GemmMatrixLayout::KM_KN_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto descs = f_make_for_device_km_kn_mn();

        device_gemm_xdlops_km_kn_mn<ab_data_t, acc_data_t, c_data_t>(
            descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
    }
#endif

#if USE_GEMM_XDL_KM_NK_MN
    if(algo == GemmAlgo::Xdl_KM_NK_MN)
    {
        if(layout != GemmMatrixLayout::KM_NK_MN)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto descs = f_make_for_device_km_nk_mn();

        device_gemm_xdlops_km_nk_mn<ab_data_t, acc_data_t, c_data_t>(
            descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
    }
#endif

    if(do_verification)
    {
        host_gemm(a, b, c_host, layout);

        check_error(c_host, c_device);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "c_host : ", c_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "c_device: ", c_device.mData, ",") << std::endl;
        }
    }
}
```
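The four layout branches above differ only in which of M, N, K feed each tensor's lengths and strides; every matrix is stored row-major, so a (rows, cols) matrix always gets strides (cols, 1). A minimal sketch of the mapping the branches encode (illustration only, not part of the commit; `row_major_2d` is a hypothetical helper):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Hypothetical helper, illustration only: a row-major (rows, cols) matrix
// has lengths {rows, cols} and strides {cols, 1}.
std::pair<std::vector<std::size_t>, std::vector<std::size_t>> row_major_2d(std::size_t rows,
                                                                           std::size_t cols)
{
    return {{rows, cols}, {cols, 1}};
}

// What each branch of the layout if/else encodes (C is always M x N row-major):
//   MK_KN_MN: A = row_major_2d(M, K), B = row_major_2d(K, N)
//   MK_NK_MN: A = row_major_2d(M, K), B = row_major_2d(N, K)
//   KM_KN_MN: A = row_major_2d(K, M), B = row_major_2d(K, N)
//   KM_NK_MN: A = row_major_2d(K, M), B = row_major_2d(N, K)
```

Given the argument check above (argc must be 10) and the conventions in script/run.sh below, an invocation passes layout, algo, do_verification, init_method, do_log, nrepeat, then M, N, K, e.g. `./host/driver_offline/gemm_driver_offline 0 0 1 4 0 5 3840 4096 4096` (example values only; the GEMM size matches the one used in run.sh).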
host/host_tensor/include/gemm_common.hpp (new file, mode 100644)
```cpp
#ifndef GEMM_COMMON_HPP
#define GEMM_COMMON_HPP

enum GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
};

#endif
```
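Read positionally, each enumerator names the storage of A, B, and C in turn: MK_KN_MN stores A as M x K, B as K x N, and C as M x N, all row-major, which corresponds to BLAS "NN"; the other three are NT, TN, and TT. A small helper along these lines (illustration only, not part of the commit) makes the mapping explicit:

```cpp
// Illustration only: map each layout to its row-major shapes and the
// equivalent BLAS transpose pair. Assumes the GemmMatrixLayout enum above.
inline const char* gemm_layout_name(GemmMatrixLayout layout)
{
    switch(layout)
    {
    case MK_KN_MN: return "A: MxK, B: KxN (NN)";
    case MK_NK_MN: return "A: MxK, B: NxK (NT)";
    case KM_KN_MN: return "A: KxM, B: KxN (TN)";
    case KM_NK_MN: return "A: KxM, B: NxK (TT)";
    default: return "unknown";
    }
}
```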
host/host_tensor/include/host_conv_bwd_weight.hpp (new file, mode 100644)
```cpp
#pragma once
#include "host_tensor.hpp"

template <typename TOut,
          typename TIn,
          typename TWei,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_backward_weights(const Tensor<TOut>& out,
                                              const Tensor<TIn>& in,
                                              Tensor<TWei>& wei,
                                              const ConvStrides& conv_strides,
                                              const ConvDilations& conv_dilations,
                                              const InLeftPads& in_left_pads,
                                              const InRightPads&,
                                              const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
        double v = 0;
        for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
        {
            for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        v += static_cast<const double>(in(n, c, hi, wi)) *
                             static_cast<const double>(out(n, k, ho, wo));
                    }
                }
            }
        }
        wei(k, c, y, x) = v;
    };

    auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
        double v = 0;
        for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
        {
            for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[2])
                    {
                        v += static_cast<const double>(in(n, hi, wi, c)) *
                             static_cast<const double>(out(n, ho, wo, k));
                    }
                }
            }
        }
        wei(k, y, x, c) = v;
    };

    if(layout == ConvTensorLayout::NCHW)
    {
        make_ParallelTensorFunctor(f_kcyx,
                                   wei.mDesc.GetLengths()[0],
                                   wei.mDesc.GetLengths()[1],
                                   wei.mDesc.GetLengths()[2],
                                   wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        make_ParallelTensorFunctor(f_kyxc,
                                   wei.mDesc.GetLengths()[0],
                                   wei.mDesc.GetLengths()[1],
                                   wei.mDesc.GetLengths()[2],
                                   wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
```
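In formula form, both functors compute wei(k, c, y, x) = sum over n, ho, wo of in(n, c, hi, wi) * out(n, k, ho, wo), where hi = ho * stride_h + y * dilation_h - pad_h and likewise for wi, skipping terms where (hi, wi) falls outside the input; the in_right_pads argument is accepted but unused, since the loop bounds come from the output size. A standalone sketch of the same reduction on plain row-major buffers (illustration only; the commit's version runs on Tensor<> and parallelizes over the weight indices, and stride = dilation = 1 with no padding is assumed here for brevity):

```cpp
#include <vector>

// Minimal sketch, NCHW input / KCYX weights / NKHW output, stride = 1,
// dilation = 1, no padding. Mirrors the reduction in f_kcyx above.
void naive_conv_wrw(const std::vector<double>& in,  // N*C*Hi*Wi
                    const std::vector<double>& out, // N*K*Ho*Wo
                    std::vector<double>& wei,       // K*C*Y*X
                    int N, int C, int Hi, int Wi, int K, int Y, int X)
{
    const int Ho = Hi - Y + 1, Wo = Wi - X + 1;
    for(int k = 0; k < K; ++k)
        for(int c = 0; c < C; ++c)
            for(int y = 0; y < Y; ++y)
                for(int x = 0; x < X; ++x)
                {
                    double v = 0;
                    for(int n = 0; n < N; ++n)
                        for(int ho = 0; ho < Ho; ++ho)
                            for(int wo = 0; wo < Wo; ++wo)
                            {
                                const int hi = ho + y, wi = wo + x;
                                v += in[((n * C + c) * Hi + hi) * Wi + wi] *
                                     out[((n * K + k) * Ho + ho) * Wo + wo];
                            }
                    wei[((k * C + c) * Y + y) * X + x] = v;
                }
}
```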
host/host_tensor/include/host_gemm.hpp (new file, mode 100644)
```cpp
#pragma once
#include "host_tensor.hpp"
#include "gemm_common.hpp"

template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
               const Tensor<BType>& b,
               Tensor<CType>& c,
               const GemmMatrixLayout layout)
{
    if(layout == GemmMatrixLayout::MK_KN_MN)
    {
        auto f_mk_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_mk_kn_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_NK_MN)
    {
        auto f_mk_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_mk_nk_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_KN_MN)
    {
        auto f_km_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_km_kn_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_NK_MN)
    {
        auto f_km_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];

            double v = 0;

            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
            }

            c(m, n) = v;
        };

        make_ParallelTensorFunctor(f_km_nk_mn,
                                   c.mDesc.GetLengths()[0],
                                   c.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
```
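All four branches share one pattern: read K from the appropriate axis of a, accumulate the (m, n) dot product in double regardless of the stored element type, write it to c(m, n), and let make_ParallelTensorFunctor run the M x N output loop across std::thread::hardware_concurrency() threads. A standalone sketch of the MK_KN_MN case on plain row-major buffers (a sketch under those assumptions, not the commit's Tensor<> API):

```cpp
#include <vector>

// Minimal reference GEMM for the MK_KN_MN (row-major "NN") case, mirroring
// f_mk_kn_mn above: the dot product accumulates in double for accuracy.
template <typename AType, typename BType, typename CType>
void naive_gemm_mk_kn_mn(const std::vector<AType>& a, // M x K
                         const std::vector<BType>& b, // K x N
                         std::vector<CType>& c,       // M x N
                         int M, int N, int K)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            double v = 0;
            for(int k = 0; k < K; ++k)
                v += static_cast<double>(a[m * K + k]) * static_cast<double>(b[k * N + n]);
            c[m * N + n] = static_cast<CType>(v);
        }
}
```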
host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp
```diff
@@ -9,8 +9,8 @@ struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
     int NPerBlock;
     int KPerBlock;

-    int MPerWave;
-    int NPerWave;
+    int MPerXDL;
+    int NPerXDL;

     int K1;
     int MRepeat;
@@ -45,8 +45,8 @@ static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
     128, // MPerBlock,
     128, // NPerBlock,
     4,   // KPerBlock,
-    32,  // MPerWave,
-    32,  // NPerWave,
+    32,  // MPerXDL,
+    32,  // NPerXDL,
     4,   // K1,
     2,   // MRepeat,
     2,   // NRepeat,
```
script/run.sh
```diff
@@ -12,13 +12,16 @@
 #export OLC_DEBUG_HIP_DUMP=1
 #export OLC_DEBUG_SAVE_TEMP_DIR=1

-make -j conv_fwd_driver_offline
-make -j conv_bwd_driver_offline
-make -j conv_fwd_driver_online

 #rm -rf /root/_hip_binary_kernels_/
 #rm -rf /tmp/olCompile*

+#make -j conv_fwd_driver_offline
+#make -j conv_bwd_driver_offline
+#make -j conv_wrw_driver_offline
+#make -j conv_fwd_driver_online
+make -j gemm_driver_offline

 LAYOUT=$1
 ALGO=$2
 VERIFY=$3
@@ -30,7 +33,7 @@ REPEAT=$6
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 1024 1 7 17 17 1 1 1 1 0 3 0 3
-./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 128 3 3 14 14 1 1 1 1 1 1 1 1
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 512 512 3 3 7 7 1 1 1 1 1 1 1 1
@@ -44,4 +47,12 @@ REPEAT=$6
 #./host/driver_offline/conv_bwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
 #./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
 #./host/driver_offline/conv_wrw_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 128 3 3 14 14 1 1 1 1 1 1 1 1
 #./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
+
+################################################ layout algo verify init log repeat M___ N___ K___
+#./host/driver_offline/gemm_driver_offline     $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 960  1024 1024
+#./host/driver_offline/gemm_driver_offline     $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1920 2048 2048
+./host/driver_offline/gemm_driver_offline      $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 3840 4096 4096
+#./host/driver_offline/gemm_driver_offline     $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 7680 8192 8192
```
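Taken together with gemm_driver_offline.cpp above: run.sh forwards its first six arguments as layout, algo, verify, init, log, and repeat, and hardcodes the problem size, so (as a hypothetical example) `sh script/run.sh 0 0 1 4 0 5` would build gemm_driver_offline and run the 3840x4096x4096 fp16 GEMM with layout MK_KN_MN, algorithm Xdl_MK_KN_MN, verification against the host reference enabled, both matrices initialized with random integers in [-5, 5], logging off, and 5 timed repetitions.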