tidy up

268d1c71 · Chao Liu · c9fa46af · 268d1c71 · 268d1c71 · 268d1c71
Commit 268d1c71 authored Apr 08, 2019 by Chao Liu
14 changed files
--- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -52,7 +52,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
        in_nchw_vec(n, c, h, w) =
            vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
 #elif 1
-        in_nchw_vec(n, c, h, w)  = vector_t::Pack(in_nchw(n, 4 * c, h, w),
+        in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
                                                 in_nchw(n, 4 * c + 1, h, w),
                                                 in_nchw(n, 4 * c + 2, h, w),
                                                 in_nchw(n, 4 * c + 3, h, w));
@@ -114,37 +114,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
    constexpr index_t BlockSize = 128;
 #elif 0
    // 3x3, 34x34, 128 thread, fp32, vector = 2
-    constexpr index_t NPerBlock = 2;
+    constexpr index_t NPerBlock  = 2;
-    constexpr index_t KPerBlock = 32;
+    constexpr index_t KPerBlock  = 32;
-    constexpr index_t CPerBlock = 2;
+    constexpr index_t CPerBlock  = 2;
    constexpr index_t HoPerBlock = 2;
    constexpr index_t WoPerBlock = 32;
-    constexpr index_t NPerThread = 2;
+    constexpr index_t NPerThread  = 2;
-    constexpr index_t KPerThread = 4;
+    constexpr index_t KPerThread  = 4;
-    constexpr index_t CPerThread = 1;
+    constexpr index_t CPerThread  = 1;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
-    constexpr index_t InBlockCopyDataPerRead = 2;
+    constexpr index_t InBlockCopyDataPerRead  = 2;
    constexpr index_t WeiBlockCopyDataPerRead = 2;
    constexpr index_t BlockSize = 128;
 #elif 0
    // 3x3, 34x34, 128 thread, int8, vector = 4
-    constexpr index_t NPerBlock = 2;
+    constexpr index_t NPerBlock  = 2;
-    constexpr index_t KPerBlock = 32;
+    constexpr index_t KPerBlock  = 32;
-    constexpr index_t CPerBlock = 8;
+    constexpr index_t CPerBlock  = 8;
    constexpr index_t HoPerBlock = 4;
    constexpr index_t WoPerBlock = 32;
-    constexpr index_t NPerThread = 1;
+    constexpr index_t NPerThread  = 1;
-    constexpr index_t KPerThread = 8;
+    constexpr index_t KPerThread  = 8;
-    constexpr index_t CPerThread = 2;
+    constexpr index_t CPerThread  = 2;
    constexpr index_t HoPerThread = 4;
    constexpr index_t WoPerThread = 2;
-    constexpr index_t InBlockCopyDataPerRead = 2;
+    constexpr index_t InBlockCopyDataPerRead  = 2;
    constexpr index_t WeiBlockCopyDataPerRead = 2;
    constexpr index_t BlockSize = 128;

--- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
+++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
@@ -69,7 +69,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
-#if 1
+#if 0
    // 3x3, 34x34
    // need to use register double buffer for GEMM
    constexpr index_t BPerBlock = 128;
@@ -189,7 +189,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
    constexpr index_t WeiBlockCopyDataPerRead = 4;
    constexpr index_t BlockSize = 256;
-#elif 1
+#elif 0
    // 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer
    constexpr index_t BPerBlock = 64;
    constexpr index_t KPerBlock = 128;
@@ -217,7 +217,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
    constexpr index_t OutThreadCopyDataPerWrite = 4;
    constexpr index_t BlockSize = 128;
-#elif 0
+#elif 1
    // 1x1, 14x14, Vega 20, enable lds_double_buffer, disable register_double_buffer
    constexpr index_t BPerBlock = 128;
    constexpr index_t KPerBlock = 128;

--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
            std::size_t ho = HoPerTile * htile + j;
            for(int i = 0; i < WoPerTile; ++i)
            {
-                std::size_t wo         = WoPerTile * wtile + i;
+                std::size_t wo = WoPerTile * wtile + i;
                out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
            }
        }
@@ -409,7 +409,7 @@ int main(int argc, char* argv[])
    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
-#elif 1
+#elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
-#elif 0
+#elif 1
    // 1x1 filter, 14x14 image, C = 2048
    constexpr index_t N  = 128;
    constexpr index_t C  = 2048;
@@ -661,7 +661,7 @@ int main(int argc, char* argv[])
    device_direct_convolution_2_nchw_kcyx_nkhw
 #elif 0
    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
-#elif 1
+#elif 0
    device_implicit_gemm_convolution_1_chwn_cyxk_khwn
 #elif 1
    device_implicit_gemm_convolution_2_chwn_cyxk_khwn

--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -340,10 +340,11 @@ struct BlockwiseChwnTensorCopyPadded
        constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
        const Float* p_src_tmp =
-            p_src + src_desc.Get1dIndex(c_block_data_begin,
+            p_src +
-                                        (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
+            src_desc.Get1dIndex(c_block_data_begin,
-                                        (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
+                                (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
-                                        n_block_data_begin);
+                                (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
+                                n_block_data_begin);
 #if 0
        if(get_thread_local_1d_id() == 0)

--- a/src/include/blockwise_batched_gemm.hip.hpp
+++ b/src/include/blockwise_batched_gemm.hip.hpp
@@ -329,8 +329,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
            {
                threadwise_matrix_copy(
                    c_thread_sub_mtx,
-                    p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
+                    p_c_thread +
-                                                             n_repeat * NPerLevel1Cluster),
+                        c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
+                                                    n_repeat * NPerLevel1Cluster),
                    c_block_mtx,
                    p_c_block +
                        c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,

--- a/src/include/blockwise_direct_convolution.hip.hpp
+++ b/src/include/blockwise_direct_convolution.hip.hpp
@@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        Float p_out_thread[out_thread_desc.GetElementSpace()];
        threadwise_4d_tensor_copy(out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
+                                  p_out_block +
-                                                                          k_thread_data_begin,
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          ho_thread_data_begin,
+                                                                k_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                  out_thread_desc,
                                  p_out_thread,
                                  out_thread_desc.GetLengths());
@@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
            // threadwise convolution
            threadwise_direct_convolution_2(
                in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                p_in_block +
-                                                      c_thread_data_begin,
+                    in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                      hi_thread_data_begin,
+                                             c_thread_data_begin,
-                                                      wi_thread_data_begin),
+                                             hi_thread_data_begin,
+                                             wi_thread_data_begin),
                wei_thread_block_desc,
                p_wei_block +
                    wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
@@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        threadwise_4d_tensor_copy(out_thread_desc,
                                  p_out_thread,
                                  out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
+                                  p_out_block +
-                                                                          k_thread_data_begin,
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          ho_thread_data_begin,
+                                                                k_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                  out_thread_desc.GetLengths());
    }
 }
--- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp
@@ -183,8 +183,9 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
        threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);
        const Float* p_in_global_block_begin =
-            p_in_global + in_chwn_global_desc.Get1dIndex(
+            p_in_global +
-                              0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
+            in_chwn_global_desc.Get1dIndex(
+                0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
        const Float* p_wei_global_block_begin =
            p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
@@ -267,7 +268,8 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
            constexpr index_t N2 = GemmNPerThreadSubC;
            constexpr index_t N1 = NPerBlock / N2;
-            constexpr index_t W2 = (GemmNLevel0Cluster * GemmNLevel1Cluster) / (NPerBlock / GemmNPerThreadSubC);
+            constexpr index_t W2 =
+                (GemmNLevel0Cluster * GemmNLevel1Cluster) / (NPerBlock / GemmNPerThreadSubC);
            constexpr index_t W1 = WoPerBlock / W2;
            constexpr index_t K2 = GemmMPerThreadSubC;

--- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
@@ -238,9 +238,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
 #elif 1
                    blockwise_gemm.Run_asm
 #endif
-                    (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                        (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
-                     p_in_block + y * Wi + x,
+                         p_in_block + y * Wi + x,
-                     p_out_thread);
+                         p_out_thread);
                }
            }
        }

--- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
@@ -387,13 +387,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
            constexpr auto out_kb_global_desc = make_ConstantTensorDescriptor(Sequence<K, B>{});
-            threadwise_6d_tensor_copy(out_6d_thread_desc,
+            threadwise_6d_tensor_copy(
-                                      p_out_thread,
+                out_6d_thread_desc,
-                                      out_6d_global_desc,
+                p_out_thread,
-                                      p_out_global + out_kb_global_desc.Get1dIndex(
+                out_6d_global_desc,
-                                                         k_thread_data_begin, b_thread_data_begin),
+                p_out_global +
-                                      out_6d_thread_desc.GetLengths(),
+                    out_kb_global_desc.Get1dIndex(k_thread_data_begin, b_thread_data_begin),
-                                      Number<OutThreadCopyDataPerWrite>{});
+                out_6d_thread_desc.GetLengths(),
+                Number<OutThreadCopyDataPerWrite>{});
        }
        else
 #endif

--- a/src/include/gridwise_direct_convolution_1.hip.hpp
+++ b/src/include/gridwise_direct_convolution_1.hip.hpp
@@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
        c_block_work_begin += CPerBlock)
    {
        // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
+        blockwise_in_copy.Run(p_in_global +
-                                                                      c_block_work_begin,
+                                  in_global_desc.Get1dIndex(n_block_work_begin,
-                                                                      hi_block_work_begin,
+                                                            c_block_work_begin,
-                                                                      wi_block_work_begin),
+                                                            hi_block_work_begin,
+                                                            wi_block_work_begin),
                              p_in_block);
        // copy weight tensor to LDS
@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
    }
    // copy output tensor from LDS to device mem
-    blockwise_out_copy.Run(p_out_block,
+    blockwise_out_copy.Run(
-                           p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
+        p_out_block,
-                                                                     k_block_work_begin,
+        p_out_global +
-                                                                     ho_block_work_begin,
+            out_global_desc.Get1dIndex(
-                                                                     wo_block_work_begin));
+                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
 }
--- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
@@ -175,16 +175,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
+        blockwise_in_copy.Run(p_in_global +
-                                                                           c_block_data_begin,
+                                  in_nchw_global_desc.Get1dIndex(n_block_data_begin,
-                                                                           hi_block_data_begin,
+                                                                 c_block_data_begin,
-                                                                           wi_block_data_begin),
+                                                                 hi_block_data_begin,
+                                                                 wi_block_data_begin),
                              p_in_block);
        // copy weight tensor to LDS
-        blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex(
+        blockwise_wei_copy.Run(
-                                                  k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_global +
-                               p_wei_block);
+                wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_block);
        __syncthreads();
@@ -194,10 +196,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
 #if 1
            threadwise_direct_convolution_2(
                in_nchw_thread_block_desc,
-                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                p_in_block +
-                                                           c_thread_data,
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                           hi_thread_data_begin,
+                                                  c_thread_data,
-                                                           wi_thread_data_begin),
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
                wei_kcyx_thread_block_desc,
                p_wei_block +
                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -206,10 +209,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
 #elif 0
            threadwise_direct_convolution_3(
                in_nchw_thread_block_desc,
-                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                p_in_block +
-                                                           c_thread_data,
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                           hi_thread_data_begin,
+                                                  c_thread_data,
-                                                           wi_thread_data_begin),
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
                wei_kcyx_thread_block_desc,
                p_wei_block +
                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -224,9 +228,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
        out_nkhw_thread_desc,
        p_out_thread,
        out_nkhw_global_desc,
-        p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+        p_out_global +
-                                                       k_block_data_begin + k_thread_data_begin,
+            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                            k_block_data_begin + k_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin),
        out_nkhw_thread_desc.GetLengths());
 }
--- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
@@ -198,9 +198,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
                              p_in_vec_block);
        // copy weight tensor to LDS
-        blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex(
+        blockwise_wei_copy.Run(
-                                                      k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_vec_global +
-                               p_wei_vec_block);
+                wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_vec_block);
        __syncthreads();
@@ -210,10 +211,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
 #if 1
            threadwise_direct_convolution_2(
                in_nchw_vec_thread_block_desc,
-                p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                p_in_vec_block +
-                                                                   c_thread_data,
+                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                   hi_thread_data_begin,
+                                                      c_thread_data,
-                                                                   wi_thread_data_begin),
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -222,10 +224,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
 #elif 0
            threadwise_direct_convolution_3(
                in_nchw_vec_thread_block_desc,
-                p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                p_in_vec_block +
-                                                                   c_thread_data,
+                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                   hi_thread_data_begin,
+                                                      c_thread_data,
-                                                                   wi_thread_data_begin),
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -240,9 +243,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
        out_nkhw_thread_desc,
        p_out_thread,
        out_nkhw_global_desc,
-        p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+        p_out_global +
-                                                       k_block_data_begin + k_thread_data_begin,
+            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
+                                            k_block_data_begin + k_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin),
        out_nkhw_thread_desc.GetLengths());
 }
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
@@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
        out_hkwn_thread_desc,
        p_out_thread,
        out_khwn_global_desc,
-        p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+        p_out_global +
-                                                       ho_block_data_begin + ho_thread_data_begin,
+            out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
+                                            ho_block_data_begin + ho_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+                                            wo_block_data_begin + wo_thread_data_begin,
+                                            n_block_data_begin + n_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_khwn_from_hkwn);
 }
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
    return os;
 }
-typedef enum
+typedef enum {
-{
    Half  = 0,
    Float = 1,
 } DataType_t;