Commit 3272b22e authored by Shucai Xiao

clang format

parent 94e3a2e4
@@ -150,9 +150,9 @@ struct memory_coloring_impl
    // Priority queue for coloring.
    std::priority_queue<interval_ptr, std::vector<interval_ptr>, ordering> alloc_queue{};
    int num_of_lives = 0;
    int max_value_number = -1;
    int required_bytes = 0;
    // The earliest program point where a live interval ends.
    int earliest_end_point = -1;
    // The latest program point where a live interval ends.
...
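Note: alloc_queue above relies on a user-supplied comparator to decide which live interval to color next. A minimal sketch of that pattern, with an assumed ordering (MIGraphX's actual criteria may differ):

#include <queue>
#include <vector>

struct interval
{
    int begin = 0;
    int end   = 0;
};
using interval_ptr = interval*;

// Assumed ordering for illustration: pop the interval that ends earliest.
struct ordering
{
    bool operator()(interval_ptr a, interval_ptr b) const { return a->end > b->end; }
};

std::priority_queue<interval_ptr, std::vector<interval_ptr>, ordering> alloc_queue{};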
@@ -565,8 +565,8 @@ void program::from_value(const value& v)
double common_average(const std::vector<double>& v)
{
    int n = v.size() / 4;
    double total = std::accumulate(v.begin() + n, v.end() - n, 0.0);
    return total / std::distance(v.begin() + n, v.end() - n);
}
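Note: common_average drops the lowest-index and highest-index quarter of the samples and averages the middle half, i.e. a 25% trimmed mean (assuming the caller passes sorted timings). A standalone sketch of the same computation:

#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

// Same logic as common_average: trim v.size() / 4 samples from each
// end, then average what remains (v is assumed to be sorted).
double trimmed_average(const std::vector<double>& v)
{
    int n = v.size() / 4;
    double total = std::accumulate(v.begin() + n, v.end() - n, 0.0);
    return total / std::distance(v.begin() + n, v.end() - n);
}

int main()
{
    std::vector<double> times = {1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 50.0};
    std::cout << trimmed_average(times) << "\n"; // outliers trimmed, prints 2
}

The trimming makes reported perf numbers robust to warm-up and scheduling outliers at either end of the sorted run times.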
...
@@ -596,10 +596,7 @@ void program::mark(const parameter_map& params, marker&& m)
    m.mark_stop(*this);
}

-void program::perf_report(std::ostream& os,
-                          int n,
-                          parameter_map params,
-                          int batch) const
+void program::perf_report(std::ostream& os, int n, parameter_map params, int batch) const
{
    auto& ctx = this->impl->ctx;
    // Run once by itself
...
@@ -337,8 +337,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
        },
        "Parse onnx file",
        py::arg("filename"),
        py::arg("default_dim_value") = 1,
        py::arg("map_input_dims") = std::unordered_map<std::string, std::vector<int>>(),
        py::arg("skip_unknown_operators") = false,
        py::arg("print_program_on_error") = false,
        py::arg("max_loop_iterations") = 10);
...
@@ -358,8 +358,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
        },
        "Parse onnx file",
        py::arg("filename"),
        py::arg("default_dim_value") = 1,
        py::arg("map_input_dims") = std::unordered_map<std::string, std::vector<int>>(),
        py::arg("skip_unknown_operators") = false,
        py::arg("print_program_on_error") = false);
...
@@ -578,8 +578,8 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
    auto trh = prog.insert_instruction(ins, make_op("transpose", {{"permutation", perm}}), rh);
    // initial states
    auto sih = prog.insert_instruction(ins, make_op("squeeze", {{"axes", {0}}}), ih);
    int bs = ih->get_shape().lens()[1];
    // bias
    instruction_ref bwb{};
...
@@ -50,7 +50,7 @@ struct stream_info
        if(not contains(weights, ins))
        {
            int weight = 0;
            auto&& op = ins->get_operator();
            if(not is_context_free(op) and op.name()[0] != '@')
                weight = model.weight(op);
            // This will ensure a stream will be assigned to return
...
@@ -29,10 +29,10 @@ struct cpu_gather : auto_register_op<cpu_gather>
    // cppcheck-suppress constParameter
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
        int nelements = output_shape.elements();
        auto lens = args[0].get_shape().lens();
        auto axis_dim_size = lens[op.axis];
        lens[op.axis] = args[1].get_shape().elements();
        shape out_comp{output_shape.type(), lens};
        visit_all(args.back(), args[0])([&](auto output, auto input) {
...
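Note: the shape arithmetic in cpu_gather mirrors gather semantics: the output replaces the input's axis dimension with the number of indices. A worked example with assumed sizes:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // Assumed for illustration: input lens {3, 4, 5}, axis = 1, 6 indices.
    std::vector<std::size_t> lens = {3, 4, 5};
    std::size_t axis = 1;
    std::size_t axis_dim_size = lens[axis]; // 4: the valid range for each index
    lens[axis] = 6;                         // replaced by the index count
    for(auto d : lens)
        std::cout << d << " "; // prints 3 6 5, the computation shape
    std::cout << "(axis dim was " << axis_dim_size << ")\n";
}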
@@ -103,9 +103,7 @@ struct cpu_im2col
                // compute linear index for output
                int ldx = ioutput * col_width + joutput;
                int p = 0;
-               dfor(channels,
-                    kernel_h,
-                    kernel_w)([&](int c, int koffset, int loffset) {
+               dfor(channels, kernel_h, kernel_w)([&](int c, int koffset, int loffset) {
                    auto idx = iinput + long(koffset) - kdiv2_h;
                    auto jdx = jinput + long(loffset) - kdiv2_w;
                    col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
...
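Note: dfor collapses a triple nested loop over channels, kernel_h and kernel_w. A minimal sketch of a dfor-like helper with the assumed semantics (the real utility is variadic and curried):

#include <iostream>

// Assumed semantics: run f(i, j, k) for every i in [0, a), j in [0, b),
// k in [0, c), innermost index varying fastest.
template <class F>
void dfor3(int a, int b, int c, F f)
{
    for(int i = 0; i < a; i++)
        for(int j = 0; j < b; j++)
            for(int k = 0; k < c; k++)
                f(i, j, k);
}

int main()
{
    dfor3(1, 2, 2, [](int c, int koffset, int loffset) {
        std::cout << c << " " << koffset << " " << loffset << "\n";
    });
}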
@@ -12,24 +12,24 @@ namespace device {
void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto comp_shape = arg.get_shape();
    auto out_lens = comp_shape.lens();
    auto dim_0 = out_lens.size() - 2;
    auto dim_1 = out_lens.size() - 1;
    int lda = comp_shape.strides()[dim_0];
    int m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        int nelements = comp_shape.elements();
        auto* out_ptr = device_cast(output.data());
        auto* in_ptr = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
                const int nb = 4;
                auto idx = desc.multi(ii);
                int i_m = idx[dim_1];
                int i_k = idx[dim_0];
                int offset = ii / m_size * m_size;
                out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
                    in_ptr[i_m + i_k * lda + offset];
            });
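Note: the indexing in int8_gemm_pack_a interleaves groups of nb = 4 consecutive k-values so they sit contiguously, which is the packed layout the int8 GEMM path consumes (this reading is inferred from the index expression above). A host-side sketch applying the same mapping:

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // Assumed sizes for illustration: 8 x 4 (k x m) int8 data, lda = 4.
    const int k = 8, m = 4, lda = 4, nb = 4;
    std::vector<int8_t> in(k * lda), out(k * lda, 0);
    for(int i = 0; i < k * lda; i++)
        in[i] = static_cast<int8_t>(i);

    // Same index mapping as the gs_launch body above (offset = 0, one matrix).
    for(int i_k = 0; i_k < k; i_k++)
        for(int i_m = 0; i_m < m; i_m++)
            out[i_k % nb + (i_m + (i_k / nb) * lda) * nb] = in[i_m + i_k * lda];

    for(int i = 0; i < nb; i++)
        std::cout << int(out[i]) << " "; // prints 0 4 8 12: k = 0..3 of column 0
    std::cout << "\n";
}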
...
@@ -43,7 +43,7 @@ void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument
    auto out_lens = trans_shape.lens();
    auto dim_0 = trans_shape.lens().size() - 2;
    auto dim_1 = trans_shape.lens().size() - 1;
    int ldb = trans_shape.strides()[dim_1];
    auto wrap_lens = out_lens;
    std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
...
@@ -51,16 +51,16 @@ void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument
    int m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        int nelements = comp_shape.elements();
        auto* out_ptr = device_cast(output.data());
        auto* in_ptr = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
                const int nb = 4;
                auto idx = desc.multi(ii);
                int i_n = idx[dim_1];
                int i_k = idx[dim_0];
                int offset = ii / m_size * m_size;
                out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
                    in_ptr[i_n + i_k * ldb + offset];
            });
...
@@ -79,12 +79,8 @@ __device__ auto auto_block_reduce(index idx, Op op, T init, index_int n, F f)
}

template <index_int MaxBlockSize, class Input, class Output>
-__device__ void layernorm(index_int i,
-                          index idx,
-                          int block_size_div,
-                          index_int relements,
-                          Input input,
-                          Output output)
+__device__ void layernorm(
+    index_int i, index idx, int block_size_div, index_int relements, Input input, Output output)
{
    using value_type = decltype(input(idx.local));
    const auto relements_v = relements / vector_size<value_type>{};
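Note: for reference, the reduction machinery above evaluates the standard layer normalization over the relements fastest-varying elements (assuming the usual definition; scale/bias and epsilon handling live elsewhere in the file):

\mu = \frac{1}{n}\sum_{i=1}^{n} x_i, \qquad
\sigma^2 = \frac{1}{n}\sum_{i=1}^{n} (x_i - \mu)^2, \qquad
y_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}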
...
@@ -128,7 +124,7 @@ void layernorm_vec_impl(hipStream_t stream,
                        const Arguments&... args)
{
    hip_vec_visit_all<N>(result, args...)([&](auto output, auto... inputs) {
        const auto relements_v = relements / N;
        const int max_block_size = 256;
        const int block_size = compute_block_size(relements_v, max_block_size);
        const int block_size_div = encode_divisor(block_size);
...
@@ -597,7 +597,7 @@ struct miopen_fusion
        // Compensate for allocation
        inputs.pop_back();
        int i = 0;
        f = fusion(inputs[i]);
        i++;
        std::vector<std::function<void(const fused_operator_args&, const std::vector<argument>&)>>
            invokers;
...
@@ -89,8 +89,8 @@ void gemm_impl(context& ctx,
        MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be multiple of 4!");
    }

-    auto num_matrices = std::accumulate(
-        out_lens.rbegin() + 2, out_lens.rend(), int{1}, std::multiplies<int>());
+    auto num_matrices =
+        std::accumulate(out_lens.rbegin() + 2, out_lens.rend(), int{1}, std::multiplies<int>());
    if(num_matrices == 1)
    {
        // the rocblas_gemm API handles inputs and output matrices as
...
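Note: the reflowed accumulate computes the batch count as the product of every output dimension except the trailing two matrix dimensions. A standalone check with an assumed shape:

#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // Assumed batched GEMM output lens {2, 3, 4, 5}: the trailing 4 x 5 is
    // the matrix, so the batch holds 2 * 3 = 6 matrices.
    std::vector<int> out_lens = {2, 3, 4, 5};
    auto num_matrices =
        std::accumulate(out_lens.rbegin() + 2, out_lens.rend(), int{1}, std::multiplies<int>());
    std::cout << num_matrices << "\n"; // prints 6
}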
@@ -124,7 +124,7 @@ struct hip_device
    }

    private:
    int id = 0;
    shared<hip_stream_ptr> s = nullptr;
    shared<miopen_handle> mihandle = nullptr;
    shared<rocblas_handle_ptr> rbhandle = nullptr;
...
@@ -187,10 +187,7 @@ struct context
    hip_device::stream& get_stream(int n) { return get_current_device().get_stream(n); }
    const hip_device::stream& get_stream() const { return get_current_device().get_stream(); }
-    const hip_device::stream& get_stream(int n) const
-    {
-        return get_current_device().get_stream(n);
-    }
+    const hip_device::stream& get_stream(int n) const { return get_current_device().get_stream(n); }

    void set_stream(int n) { get_current_device().set_stream(n); }
...
@@ -225,12 +222,12 @@ struct context
    void from_value(const value& v)
    {
        auto v_events = v.at("events");
        int n_events = v_events.without_key().to<int>();
        this->create_events(n_events - 1);
        auto v_streams = v.at("streams");
        int n_streams = v_streams.without_key().to<int>();
        this->current_device = std::make_shared<hip_device>(0, n_streams);
    }
...
@@ -71,10 +71,10 @@ struct argmin_op
template <class Op>
void arg_op(Op op, hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
    auto arg_shape = arg.get_shape();
    auto batch_lens = arg_shape.lens();
    int batch_item_num = batch_lens[axis];
    batch_lens[axis] = 1;
    migraphx::shape batch_shape{arg_shape.type(), batch_lens};
    migraphx::shape std_arg_shape{arg_shape.type(), arg_shape.lens()};
...
@@ -82,8 +82,8 @@ void arg_op(Op op, hipStream_t stream, const argument& result, const argument& a
        auto* output = device_cast(result.get<int64_t>().data());
        using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
        // use one block for items in one batch.
        const int max_block_size = 256;
        const int block_size = compute_block_size(batch_item_num, max_block_size);
        gs_launch(stream,
                  batch_shape.elements() * block_size,
                  block_size)([=](auto i, auto idx) __device__ {
...
@@ -29,10 +29,7 @@ struct kernel
                int local,
                const std::vector<kernel_argument>& args) const;

-    void launch(hipStream_t stream,
-                int global,
-                int local,
-                std::vector<void*> args) const;
+    void launch(hipStream_t stream, int global, int local, std::vector<void*> args) const;

    auto launch(hipStream_t stream, int global, int local) const
    {
...
@@ -52,12 +52,8 @@ kernel::kernel(const char* image, const std::string& name) : impl(std::make_shar
        MIGRAPHX_THROW("Failed to get function: " + name + ": " + hip_error(status));
}

-void launch_kernel(hipFunction_t fun,
-                   hipStream_t stream,
-                   int global,
-                   int local,
-                   void* kernargs,
-                   int size)
+void launch_kernel(
+    hipFunction_t fun, hipStream_t stream, int global, int local, void* kernargs, int size)
{
    void* config[] = {
        // HIP_LAUNCH_PARAM_* are macros that do horrible things
...
@@ -78,14 +74,11 @@ void launch_kernel(hipFunction_t fun,
        MIGRAPHX_THROW("Failed to launch kernel: " + hip_error(status));
}

-void kernel::launch(hipStream_t stream,
-                    int global,
-                    int local,
-                    std::vector<void*> args) const
+void kernel::launch(hipStream_t stream, int global, int local, std::vector<void*> args) const
{
    assert(impl != nullptr);
    void* kernargs = args.data();
    int size = args.size() * sizeof(void*);
    launch_kernel(impl->fun, stream, global, local, kernargs, size);
}
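Note: kernel::launch forwards a pre-packed argument buffer through HIP's extra-launch-config mechanism rather than the kernelParams array. A minimal sketch of that launch path (the wrapper name and grid math are assumptions; HIP_LAUNCH_PARAM_* and hipModuleLaunchKernel are the real HIP module API):

#include <cstddef>
#include <hip/hip_runtime.h>

// Hypothetical wrapper: launch fun with an already-packed kernarg blob.
void launch_packed(
    hipFunction_t fun, hipStream_t stream, int global, int local, void* kernargs, std::size_t size)
{
    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
                      kernargs,
                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
                      &size,
                      HIP_LAUNCH_PARAM_END};
    // The grid is given in work-items here, so convert to blocks for the module API.
    hipModuleLaunchKernel(fun, global / local, 1, 1, local, 1, 1, 0, stream, nullptr, config);
}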
...
@@ -97,7 +90,7 @@ void kernel::launch(hipStream_t stream,
{
    assert(impl != nullptr);
    std::vector<char> kernargs = pack_args(args);
    int size = kernargs.size();
    launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
}
...
@@ -43,10 +43,8 @@ struct avg_pool
};

template <class T, class Op>
-MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
-                                                 const array<int, 2>& dims,
-                                                 array<float, 2> xy,
-                                                 Op pooling)
+MIGRAPHX_DEVICE_CONSTEXPR T
+bilinear_interpolate(const T* data, const array<int, 2>& dims, array<float, 2> xy, Op pooling)
{
    array<int, 2> low{};
    array<int, 2> high{};
...
@@ -66,9 +64,9 @@ MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
        }
    }
    array<int, 4> locs = {low[0] * dims[1] + low[1],
                          low[0] * dims[1] + high[1],
                          high[0] * dims[1] + low[1],
                          high[0] * dims[1] + high[1]};
    float ly = xy[0] - low[0];
    float lx = xy[1] - low[1];
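Note: ly and lx are the fractional offsets within the cell; the four corner values loaded via locs are blended with the usual bilinear weights. A scalar sketch of that blend:

#include <iostream>

// Bilinear blend of corner values v00..v11 given fractional offsets
// ly (row direction) and lx (column direction).
float bilinear(float v00, float v01, float v10, float v11, float ly, float lx)
{
    float hy = 1.0f - ly;
    float hx = 1.0f - lx;
    return v00 * hy * hx + v01 * hy * lx + v10 * ly * hx + v11 * ly * lx;
}

int main()
{
    // Sampling the exact center yields the average of the four corners.
    std::cout << bilinear(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f) << "\n"; // prints 1.5
}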
...
@@ -142,7 +140,7 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
    // output dims of height and width, in all 2-dim arrays, the first dim
    // is for height and second dim is for width
    const auto& out_lens = out_s.lens;
    array<int, 2> out_dims = {out_lens[2], out_lens[3]};
    for(index_int i = index.global; i < out_s.elements(); i += stride)
...
@@ -225,7 +225,7 @@ struct mlir_apply
    void add_memref_descriptor(std::vector<instruction_ref>& refs, instruction_ref inst)
    {
        const int offset = 0;
        auto inst_t = inst->get_shape();
        refs.push_back(inst);
        refs.push_back(inst);
        refs.push_back(get_literal(offset)); // offset
...
@@ -10,7 +10,7 @@ std::vector<char> pack_args(const std::vector<kernel_argument>& args)
    std::vector<char> kernargs;
    for(auto&& arg : args)
    {
        int n = arg.size;
        const auto* p = static_cast<const char*>(arg.data);
        // Insert padding
        int padding = (arg.align - (kernargs.size() % arg.align)) % arg.align;
...
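Note: the padding expression rounds the running kernarg size up to the next multiple of the argument's alignment. A quick check of the formula:

#include <cstddef>
#include <iostream>

int main()
{
    std::size_t align = 8;
    // padding = (align - (size % align)) % align
    std::cout << (align - (12 % align)) % align << "\n"; // prints 4: 12 + 4 = 16
    std::cout << (align - (16 % align)) % align << "\n"; // prints 0: already aligned
}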
@@ -20,8 +20,14 @@ static auto make_mat(tensor_view<T> x)
    int dim_0 = n_dims - 2;
    int dim_1 = n_dims - 1;
    if(s.transposed())
-        return matrix<T>{x.data(), static_cast<std::size_t>(s.lens()[dim_1]), static_cast<std::size_t>(s.lens()[dim_0]), static_cast<std::size_t>(s.strides()[dim_1])};
-    return matrix<T>{x.data(), static_cast<std::size_t>(s.lens()[dim_0]), static_cast<std::size_t>(s.lens()[dim_1]), static_cast<std::size_t>(s.strides()[dim_0])};
+        return matrix<T>{x.data(),
+                         static_cast<std::size_t>(s.lens()[dim_1]),
+                         static_cast<std::size_t>(s.lens()[dim_0]),
+                         static_cast<std::size_t>(s.strides()[dim_1])};
+    return matrix<T>{x.data(),
+                     static_cast<std::size_t>(s.lens()[dim_0]),
+                     static_cast<std::size_t>(s.lens()[dim_1]),
+                     static_cast<std::size_t>(s.strides()[dim_0])};
}

template <class T, class F>
...
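Note: for a transposed shape, make_mat swaps the two trailing lens and takes the leading dimension from the opposite stride, so the GEMM still sees a plain row-major view. A minimal sketch of the kind of view type assumed here (the real matrix<T> may differ):

#include <cstddef>

// Assumed view: pointer plus rows, cols and leading dimension (the
// stride, in elements, between consecutive rows).
template <class T>
struct matrix
{
    T* data;
    std::size_t rows;
    std::size_t cols;
    std::size_t ld;

    T& operator()(std::size_t i, std::size_t j) const { return data[i * ld + j]; }
};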
@@ -317,9 +317,7 @@ struct ref_im2col
                // compute linear index for output
                int ldx = ioutput * col_width + joutput;
                int p = 0;
-               dfor(channels,
-                    kernel_h,
-                    kernel_w)([&](int c, int koffset, int loffset) {
+               dfor(channels, kernel_h, kernel_w)([&](int c, int koffset, int loffset) {
                    auto idx = iinput + long(koffset) - kdiv2_h;
                    auto jdx = jinput + long(loffset) - kdiv2_w;
                    col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
...
@@ -650,7 +648,7 @@ struct ref_softmax : auto_register_op<ref_softmax<Op>>
        argument result{output_shape};
        auto batch_lens = output_shape.lens();
        int64_t tuned_axis = tune_axis(args[0].get_shape().lens().size(), op.axis, op.name());
        int n_dims = batch_lens[tuned_axis];
        batch_lens[tuned_axis] = 1;

        shape batch_shape{shape::int32_type, batch_lens};
...
@@ -670,9 +668,9 @@ struct ref_softmax : auto_register_op<ref_softmax<Op>>
            for(int j = 0; j < n_dims; ++j)
            {
                idx[tuned_axis] = j;
                int index = output_shape.index(idx);
                output[index] = std::exp(input[index] - batch_max[i]);
            }

            for(int j = 0; j < n_dims; ++j)
...
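Note: subtracting batch_max before exponentiating is the standard max-shift for numerical stability; the shift cancels once the exponentials are normalized. A scalar sketch of the same two-pass softmax:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// Numerically stable softmax: exp(x - max) cannot overflow, and the
// shift divides out of the normalized result.
std::vector<double> softmax(std::vector<double> x)
{
    double m   = *std::max_element(x.begin(), x.end());
    double sum = 0.0;
    for(auto& v : x)
    {
        v = std::exp(v - m);
        sum += v;
    }
    for(auto& v : x)
        v /= sum;
    return x;
}

int main()
{
    for(auto v : softmax({1000.0, 1001.0, 1002.0})) // overflows without the shift
        std::cout << v << " ";
    std::cout << "\n";
}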