Commit 401d0f68 authored by Shucai Xiao

clang format

parent 992f57ba
...
@@ -122,11 +122,13 @@ struct find_nop_reshapes
         auto ins = mr.result;
         // // output of reshape and contiguous is standard, so no need to add another contiguous
         // // if the output is used an a ret value
-        // if(ins->name() == "contiguous" and ins->name() != "contiguous" and ins->name() != "reshape")
+        // if(ins->name() == "contiguous" and ins->name() != "contiguous" and ins->name() !=
+        // "reshape")
         // {
         //     auto& outputs = ins->outputs();
         //     if(std::any_of(
-        //         outputs.begin(), outputs.end(), [&](auto o) { return o->name() == "@return"; }))
+        //         outputs.begin(), outputs.end(), [&](auto o) { return o->name() == "@return";
+        //         }))
         //     {
         //         return;
         //     }
...
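Note on the commented-out guard above: per the comment, the intent is to skip adding another contiguous when the reshape/contiguous output already feeds the program's "@return" instruction, since that output is already standard. A minimal, self-contained sketch of the std::any_of check, using a hypothetical stand-in for MIGraphX's instruction type; only the name() and outputs() calls are mirrored from the code above, everything else is made up for illustration:

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Hypothetical stand-in for a MIGraphX instruction node; the real
// instruction_ref API is richer, this mirrors only the two calls used above.
struct instruction
{
    std::string op_name;
    std::vector<const instruction*> outs;

    const std::string& name() const { return op_name; }
    const std::vector<const instruction*>& outputs() const { return outs; }
};

// True when any consumer of ins is the program's return instruction,
// i.e. the value feeds "@return" directly.
bool feeds_return(const instruction& ins)
{
    const auto& outputs = ins.outputs();
    return std::any_of(outputs.begin(), outputs.end(), [](const instruction* o) {
        return o->name() == "@return";
    });
}

int main()
{
    instruction ret{"@return", {}};
    instruction contig{"contiguous", {&ret}};
    return feeds_return(contig) ? 0 : 1; // exits 0: contiguous feeds @return
}
```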
...
@@ -35,8 +35,8 @@ struct half2_max
 // in_data is in shared memory
 template <class Op>
-__device__ __half2
-block_reduce_half2(__half2* buffer, index_int batch_item_num, index_int tid, index_int block_size, Op op)
+__device__ __half2 block_reduce_half2(
+    __half2* buffer, index_int batch_item_num, index_int tid, index_int block_size, Op op)
 {
     __syncthreads();
     for(index_int s = block_size; s > 0; s >>= 1)
...
@@ -92,8 +92,8 @@ softmax_kernel_half2(void* data_in, index_int batch_item_num, index_int block_si
 // in_data is in shared memory
 template <class Op>
-__device__ __half
-block_reduce_half(__half* data, index_int batch_item_num, index_int tid, index_int block_size, Op op)
+__device__ __half block_reduce_half(
+    __half* data, index_int batch_item_num, index_int tid, index_int block_size, Op op)
 {
     __syncthreads();
     for(index_int s = block_size / 2; s > 0; s >>= 1)
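Both block_reduce_half2 and block_reduce_half above implement a shared-memory tree reduction; only the element type and the starting stride differ. Since the loop bodies are collapsed out of this view, here is a minimal sketch of the pattern the loop headers imply; the bounds guard and the buffer[0] return convention are assumptions, not the committed body:

```cuda
#include <cuda_fp16.h>

using index_int = unsigned int; // assumption: index_int is an unsigned integer type

// Shared-memory tree reduction: each round folds the upper half of the live
// range into the lower half, so after log2(block_size) rounds buffer[0]
// holds op applied across all batch_item_num elements.
template <class Op>
__device__ __half block_reduce_sketch(
    __half* buffer, index_int batch_item_num, index_int tid, index_int block_size, Op op)
{
    __syncthreads(); // order the caller's shared-memory writes before the first fold
    for(index_int s = block_size / 2; s > 0; s >>= 1)
    {
        // The second condition keeps threads from folding elements past the
        // end of the row when batch_item_num < block_size.
        if(tid < s and tid + s < batch_item_num)
        {
            buffer[tid] = op(buffer[tid], buffer[tid + s]);
        }
        __syncthreads(); // every fold round must complete before strides shrink
    }
    return buffer[0];
}
```

With an Op like the max{} and sum{} functors used below, every thread reads the same buffer[0] after the final barrier, so the whole block agrees on the reduced value.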
...
@@ -125,14 +125,16 @@ softmax_kernel_half(void* data_in, index_int batch_item_num, index_int block_siz
         in_data_reduce[i] = d;
     }
-    auto batch_max = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, max{});
+    auto batch_max =
+        block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, max{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
         in_data[i] = __float2half(::exp(__half2float(in_data[i]) - __half2float(batch_max)));
         in_data_reduce[i] = in_data[i];
     }
-    auto batch_sum = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, sum{});
+    auto batch_sum =
+        block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, sum{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
         output[i + start] = __float2half(__half2float(in_data[i]) / __half2float(batch_sum));
...
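For context, softmax_kernel_half is the standard numerically stable softmax visible in the hunk above: reduce the row max, exponentiate the shifted values, reduce the sum, then divide. A plain-float, one-row-per-block toy sketch of that structure (the serial thread-0 reductions stand in for block_reduce_half; names and launch shape are illustrative, not the committed kernel):

```cuda
// Numerically stable softmax of one n-element row per block:
// softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)).
__global__ void softmax_row_sketch(float* row, int n)
{
    extern __shared__ float scratch[]; // n floats, sized at launch
    __shared__ float row_max, row_sum;
    int tid = threadIdx.x;

    // Pass 1: row maximum (serial here; the real kernel block-reduces it).
    if(tid == 0)
    {
        row_max = row[0];
        for(int i = 1; i < n; ++i)
            row_max = fmaxf(row_max, row[i]);
    }
    __syncthreads();

    // Pass 2: shift by the max so exp never overflows, as in the diff above.
    for(int i = tid; i < n; i += blockDim.x)
        scratch[i] = expf(row[i] - row_max);
    __syncthreads();

    // Pass 3: sum, then normalize.
    if(tid == 0)
    {
        row_sum = 0.0f;
        for(int i = 0; i < n; ++i)
            row_sum += scratch[i];
    }
    __syncthreads();
    for(int i = tid; i < n; i += blockDim.x)
        row[i] = scratch[i] / row_sum;
}

// Example launch for one row with a 256-thread block and n floats of scratch:
// softmax_row_sketch<<<1, 256, n * sizeof(float)>>>(d_row, n);
```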