Merge pull request #293 from ROCmSoftwarePlatform/opt_log_softmax

Opt log softmax

Merge pull request #293 from ROCmSoftwarePlatform/opt_log_softmax
Opt log softmax
51f264a6 · mvermeulen · GitHub · 5af04e80 · c98b06a1 · 51f264a6
Unverified Commit 51f264a6 authored Jul 02, 2019 by mvermeulen Committed by GitHub Jul 02, 2019
12 changed files
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -99,6 +99,8 @@ struct shape
    /// Map element index to space index
    std::size_t index(std::size_t i) const;

+    std::vector<std::size_t> multi(std::size_t i) const;
+
    /// Returns true if the shape is packed with no padding
    bool packed() const;
    /// Returns true is the shape has been transposed. That is the strides are not in descending

--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -138,6 +138,24 @@ std::size_t shape::index(std::size_t i) const
        return result;
    }
 }
+
+std::vector<std::size_t> shape::multi(std::size_t i) const
+{
+    assert(this->standard());
+
+    std::vector<std::size_t> indices(lens().size());
+    std::transform(strides().begin(),
+                   strides().end(),
+                   lens().begin(),
+                   indices.begin(),
+                   [&](std::size_t stride, std::size_t len) {
+                       assert(len > 0 and stride > 0);
+                       return (i / stride) % len;
+                   });
+
+    return indices;
+}
+
 bool shape::packed() const { return this->elements() == this->element_space(); }

 bool shape::transposed() const

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -539,18 +539,11 @@ struct cpu_softmax

    std::string name() const { return "cpu::softmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-
-    template <typename T>
-    std::size_t compute_batch_index(T idx, shape& batch_shape, int axis) const
-    {
-        idx[axis] = 0;
-        return batch_shape.index(idx);
-    }
-
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto batch_lens     = output_shape.lens();
+        std::size_t n_dims  = batch_lens[op.axis];
        batch_lens[op.axis] = 1;
        shape batch_shape{shape::int32_type, batch_lens};

@@ -558,26 +551,33 @@ struct cpu_softmax
            using value_type = typename decltype(input)::value_type;
            std::vector<value_type> batch_max(batch_shape.elements(),
                                              std::numeric_limits<value_type>::lowest());
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
-            });
+            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+            par_for(batch_shape.elements(), [&](auto i) {
+                auto idx = batch_shape.multi(i);
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
+                }

-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) =
-                    std::exp(input(idx.begin(), idx.end()) - batch_max[index]);
-            });
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis]      = j;
+                    std::size_t index = output_shape.index(idx);
+                    output[index]     = std::exp(input[index] - batch_max[i]);
+                }

-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_sum[index] += output(idx.begin(), idx.end());
-            });
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_sum[i] += output(idx.begin(), idx.end());
+                }

-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) /= batch_sum[index];
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    output(idx.begin(), idx.end()) /= batch_sum[i];
+                }
            });
        });

@@ -597,49 +597,50 @@ struct cpu_logsoftmax

    std::string name() const { return "cpu::logsoftmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-
-    template <typename T>
-    std::size_t compute_batch_index(T idx, const shape& batch_shape, int axis) const
-    {
-        idx[axis] = 0;
-        return batch_shape.index(idx);
-    }
-
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto batch_lens     = output_shape.lens();
+        std::size_t n_dims  = batch_lens[op.axis];
        batch_lens[op.axis] = 1;
        shape batch_shape{shape::int32_type, batch_lens};

+        // use a parallel implementation to acheive better performance
+        // one thread for one batch
        visit_all(result, args[0])([&](auto output, auto input) {
            using value_type = typename decltype(input)::value_type;
            std::vector<value_type> batch_max(batch_shape.elements(),
                                              std::numeric_limits<value_type>::lowest());
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
-            });
+            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));

-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
-            });
+            par_for(batch_shape.elements(), [&](auto i) {
+                auto idx = batch_shape.multi(i);
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
+                }

-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
-            });
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis]      = j;
+                    std::size_t index = output_shape.index(idx);
+                    output[index]     = input[index] - batch_max[i];
+                }
+
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_sum[i] += std::exp(output(idx.begin(), idx.end()));
+                }

-            for(std::size_t i = 0; i < batch_sum.size(); ++i)
-            {
                batch_sum[i] = std::log(batch_sum[i]);
-            }

-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) -= batch_sum[index];
+                for(std::size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    output(idx.begin(), idx.end()) -= batch_sum[i];
+                }
            });
        });


--- a/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
@@ -73,7 +73,7 @@ __host__ __device__ auto gs_invoke(F&& f, std::size_t i, index) -> decltype(f(i)

 inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
 {
-    std::size_t groups  = 1 + n / local;
+    std::size_t groups  = (n + local - 1) / local;
    std::size_t nglobal = std::min<std::size_t>(256, groups) * local;

    return [=](auto f) {

--- a/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
@@ -119,13 +119,13 @@ tensor_view<device_type<T>> device_cast(tensor_view<T> x)
 }

 template <class T>
-T to_hip_type(T x)
+__device__ __host__ T to_hip_type(T x)
 {
    return x;
 }

 // Hip doens't support __fp16
-inline float to_hip_type(gpu_half x) { return x; }
+inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/device/logsoftmax.cpp
+++ b/src/targets/gpu/device/logsoftmax.cpp
 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -11,53 +12,45 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument logsoftmax(hipStream_t stream, argument result, argument arg, int axis)
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
 {
-
-    auto lens         = result.get_shape().lens();
-    auto num_in_batch = lens[axis];
-    auto batch_lens   = lens;
-    batch_lens[axis]  = 1;
-    shape batch_shape{result.get_shape().type(), batch_lens};
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis];
+    batch_lens[axis]           = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};

    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-
-        // each thread is for one item in the batch
-        gs_launch(stream, batch_shape.elements())([=](auto i) {
-            auto batch_idx = batch.multi(i);
-            auto data_idx  = batch_idx;
-
-            // get max
-            auto batch_max = input[batch_idx];
-            for(std::size_t j = 1; j < num_in_batch; ++j)
-            {
-                data_idx[axis] = j;
-                batch_max      = std::max(to_hip_type(batch_max), to_hip_type(input[data_idx]));
-            }
-
-            for(std::size_t j = 0; j < num_in_batch; ++j)
-            {
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            auto data_idx = batch.multi(i / block_size);
+            using type    = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init     = lowest();
+
+            auto batch_max = block_reduce<max_block_size>(
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    return input[data_idx];
+                });
+
+            auto batch_sum =
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    auto val       = input[data_idx] - batch_max;
+                    return ::exp(to_hip_type(val));
+                });
+
+            auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max;
+
+            idx.local_stride(batch_item_num, [&](auto j) {
                data_idx[axis]   = j;
-                output[data_idx] = input[data_idx] - batch_max;
-            }
-
-            auto batch_sum = ::exp(to_hip_type(output[batch_idx]));
-            for(std::size_t j = 1; j < num_in_batch; ++j)
-            {
-                data_idx[axis] = j;
-                batch_sum += ::exp(to_hip_type(output[data_idx]));
-            }
-            batch_sum = ::log(to_hip_type(batch_sum));
-
-            for(std::size_t j = 0; j < num_in_batch; ++j)
-            {
-                data_idx[axis] = j;
-                output[data_idx] -= batch_sum;
-            }
+                output[data_idx] = input[data_idx] - log_batch_sum;
+            });
        });
    });
-
-    return result;
 }

 } // namespace device

--- a/src/targets/gpu/device/softmax.cpp
+++ b/src/targets/gpu/device/softmax.cpp
@@ -2,6 +2,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/gpu/device/softmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -12,51 +13,44 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument softmax(hipStream_t stream, argument result, argument arg, int axis)
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
 {
-    auto lens        = result.get_shape().lens();
-    auto batch_lens  = lens;
-    size_t n_dims    = lens[axis];
-    batch_lens[axis] = 1;
-    shape batch_shape{result.get_shape().type(), batch_lens};
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis];
+    batch_lens[axis]           = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};

    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-
-        // each thread is for one item in the batch
-        gs_launch(stream, batch_shape.elements())([=](auto i) {
-            auto batch_idx = batch.multi(i);
-            auto data_idx  = batch_idx;
-
-            // get max
-            auto batch_max = input[batch_idx];
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                data_idx[axis] = j;
-                batch_max      = std::max(to_hip_type(batch_max), to_hip_type(input[data_idx]));
-            }
-
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            auto data_idx = batch.multi(i / block_size);
+            using type    = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init     = lowest();
+
+            auto batch_max = block_reduce<max_block_size>(
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    return input[data_idx];
+                });
+
+            auto batch_sum =
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    auto val       = input[data_idx] - batch_max;
+                    return ::exp(to_hip_type(val));
+                });
+
+            idx.local_stride(batch_item_num, [&](auto j) {
                data_idx[axis]   = j;
-                output[data_idx] = exp(to_hip_type(input[data_idx] - batch_max));
-            }
-
-            auto batch_sum = output[batch_idx];
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                data_idx[axis] = j;
-                batch_sum += output[data_idx];
-            }
-
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
-                data_idx[axis]   = j;
-                output[data_idx] = output[data_idx] / batch_sum;
-            }
+                auto val         = input[data_idx] - batch_max;
+                output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
+            });
        });
    });
-
-    return result;
 }

 } // namespace device

--- a/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
@@ -10,7 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument logsoftmax(hipStream_t stream, argument result, argument arg, int axis);
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis);

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
@@ -10,7 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument softmax(hipStream_t stream, argument result, argument arg, int axis);
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis);

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/logsoftmax.cpp
+++ b/src/targets/gpu/logsoftmax.cpp
@@ -18,7 +18,8 @@ shape hip_logsoftmax::compute_shape(const std::vector<shape>& inputs) const
 argument
 hip_logsoftmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::logsoftmax(ctx.get_stream().get(), args[1], args[0], op.axis);
+    device::logsoftmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    return args.back();
 }

 } // namespace gpu

--- a/src/targets/gpu/softmax.cpp
+++ b/src/targets/gpu/softmax.cpp
@@ -39,7 +39,8 @@ shape hip_softmax::compute_shape(const std::vector<shape>& inputs) const

 argument hip_softmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::softmax(ctx.get_stream().get(), args[1], args[0], op.axis);
+    device::softmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    return args.back();
 }

 } // namespace gpu

--- a/test/gpu/miopen.cpp
+++ b/test/gpu/miopen.cpp
@@ -592,13 +592,13 @@ struct test_softmax2 : verify_program<test_softmax2>
    }
 };

-template <int Axis>
-struct test_softmax : verify_program<test_softmax<Axis>>
+template <int Axis, migraphx::shape::type_t T>
+struct test_softmax : verify_program<test_softmax<Axis, T>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
-        migraphx::shape s{migraphx::shape::float_type, {3, 4, 5, 6}};
+        migraphx::shape s{T, {512, 4, 1067, 6}};
        auto param = p.add_parameter("0", s);
        p.add_instruction(migraphx::op::softmax{Axis}, param);

@@ -606,10 +606,14 @@ struct test_softmax : verify_program<test_softmax<Axis>>
    }
 };

-template struct test_softmax<0>;
-template struct test_softmax<1>;
-template struct test_softmax<2>;
-template struct test_softmax<3>;
+template struct test_softmax<0, migraphx::shape::float_type>;
+template struct test_softmax<2, migraphx::shape::float_type>;
+template struct test_softmax<1, migraphx::shape::double_type>;
+template struct test_softmax<3, migraphx::shape::double_type>;
+template struct test_softmax<0, migraphx::shape::half_type>;
+template struct test_softmax<1, migraphx::shape::half_type>;
+template struct test_softmax<2, migraphx::shape::half_type>;
+template struct test_softmax<3, migraphx::shape::half_type>;

 struct test_conv : verify_program<test_conv>
 {
@@ -3345,12 +3349,12 @@ struct test_lstm_bidirct_default_actv2 : verify_program<test_lstm_bidirct_defaul
 };

 template <int Axis>
-struct test_logsoftmax : verify_program<test_logsoftmax<Axis>>
+struct test_logsoftmax_1 : verify_program<test_logsoftmax_1<Axis>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
-        migraphx::shape s{migraphx::shape::float_type, {3, 4, 5, 6}};
+        migraphx::shape s{migraphx::shape::float_type, {3}};
        auto param = p.add_parameter("0", s);
        p.add_instruction(migraphx::op::logsoftmax{Axis}, param);

@@ -3358,18 +3362,15 @@ struct test_logsoftmax : verify_program<test_logsoftmax<Axis>>
    }
 };

-template struct test_logsoftmax<0>;
-template struct test_logsoftmax<1>;
-template struct test_logsoftmax<2>;
-template struct test_logsoftmax<3>;
+template struct test_logsoftmax_1<0>;

-template <int Axis>
-struct test_logsoftmax_1 : verify_program<test_logsoftmax_1<Axis>>
+template <int Axis, migraphx::shape::type_t T>
+struct test_logsoftmax : verify_program<test_logsoftmax<Axis, T>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
-        migraphx::shape s{migraphx::shape::float_type, {3}};
+        migraphx::shape s{T, {10, 4, 2080, 6}};
        auto param = p.add_parameter("0", s);
        p.add_instruction(migraphx::op::logsoftmax{Axis}, param);

@@ -3377,7 +3378,16 @@ struct test_logsoftmax_1 : verify_program<test_logsoftmax_1<Axis>>
    }
 };

-template struct test_logsoftmax_1<0>;
+template struct test_logsoftmax<0, migraphx::shape::float_type>;
+template struct test_logsoftmax<1, migraphx::shape::float_type>;
+template struct test_logsoftmax<2, migraphx::shape::float_type>;
+template struct test_logsoftmax<3, migraphx::shape::float_type>;
+template struct test_logsoftmax<1, migraphx::shape::double_type>;
+template struct test_logsoftmax<3, migraphx::shape::double_type>;
+template struct test_logsoftmax<1, migraphx::shape::half_type>;
+template struct test_logsoftmax<0, migraphx::shape::half_type>;
+template struct test_logsoftmax<2, migraphx::shape::half_type>;
+template struct test_logsoftmax<3, migraphx::shape::half_type>;

 struct test_fp32_fp16_lall : verify_program<test_fp32_fp16_lall>
 {