Commit 97056d33 authored by Paul

Add gpu reduce_sum

parent d0c53a98
@@ -35,6 +35,7 @@ add_library(migraphx_device
    device/gather.cpp
    device/sub.cpp
    device/clip.cpp
    device/reduce_sum.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
@@ -70,6 +71,7 @@ add_library(migraphx_gpu
    schedule_model.cpp
    adjust_allocation.cpp
    clip.cpp
    reduce_sum.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
@@ -35,6 +35,18 @@ inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
};
}
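// gs_invoke lets grid-stride kernels opt in to receiving the launch index:
// the first overload is selected when the functor is callable as f(i, idx),
// the second when it only accepts the linear index f(i).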
template<class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index idx) -> decltype(f(i, idx))
{
    return f(i, idx);
}

template<class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index) -> decltype(f(i))
{
    return f(i);
}
inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
{
    std::size_t groups = 1 + n / local;
@@ -44,7 +56,7 @@ inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 102
    launch(stream, nglobal, local)([=](auto idx) {
        for(size_t i = idx.global; i < n; i += nglobal)
        {
            // previously f(i); gs_invoke now also forwards the launch index
            gs_invoke(f, i, idx);
        }
    });
};
#include <migraphx/gpu/device/reduce_sum.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <algorithm>
#include <iterator>
#include <vector>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
struct sum
{
    template<class T>
    MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y) const
    {
        return x + y;
    }
};
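// Reduce the n values produced by f across one workgroup of N threads with op.
// Each thread folds every N-th value into a private partial result; the partials
// are then combined in shared memory and every thread returns the block total.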
template<std::size_t N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
{
    using type = decltype(f(idx.local));
    MIGRAPHX_DEVICE_SHARED type buffer[N];
    type x = init;
    for(std::size_t i = idx.local; i < n; i += N)
    {
        x = op(x, f(i));
    }
    buffer[idx.local] = x;
    __syncthreads();
    // Interleaved-addressing tree reduction: each pass halves the number of
    // active partials until buffer[0] holds the reduction of the whole block.
    for(std::size_t s = 1; s < N; s *= 2)
    {
        const std::size_t index = 2 * s * idx.local;
        if(index < N)
        {
            buffer[index] = op(buffer[index], buffer[index + s]);
        }
        __syncthreads();
    }
    return buffer[0];
}
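// Sum-reduce arg into result. The result shape has extent 1 along the reduced
// axes; each output element is computed by one workgroup reducing its input slice.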
void reduce_sum(hipStream_t stream, const argument& result, const argument& arg)
{
    auto&& output_shape = result.get_shape();
    auto&& input_shape  = arg.get_shape();
    // Shape of a single reduction slice: kept dimensions collapse to 1, reduced
    // dimensions keep the input extent, and the input strides are reused so that
    // slice element indices map directly to offsets into the input buffer.
    std::vector<std::size_t> reduce_lens;
    std::transform(output_shape.lens().begin(),
                   output_shape.lens().end(),
                   input_shape.lens().begin(),
                   std::back_inserter(reduce_lens),
                   [](auto x, auto y) -> std::size_t {
                       if(x == y)
                           return 1;
                       else
                           return y;
                   });
    shape reduce_slice{output_shape.type(), reduce_lens, input_shape.strides()};
    hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) {
        auto nelements = result.get_shape().elements();
        auto relements = reduce_slice.elements();
        const std::size_t block_size = 1024;
        // One workgroup of block_size threads per output element; block_reduce
        // sums the relements values of that element's slice.
        gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
            auto base_idx = output.get_shape().multi(i / block_size);
            auto offset   = input.get_shape().index(base_idx);
            auto r = block_reduce<block_size>(idx, sum{}, 0, relements, [&](auto j) __device__ {
                return input.data()[reduce_shape.index(j) + offset];
            });
            if(idx.local == 0)
                output.data()[i / block_size] = r;
        });
    });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_SUM_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_SUM_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void reduce_sum(hipStream_t stream, const argument& result, const argument& arg);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_REDUCE_SUM_HPP
#define MIGRAPHX_GUARD_RTGLIB_REDUCE_SUM_HPP
#include <migraphx/shape.hpp>
#include <migraphx/op/reduce_sum.hpp>
#include <migraphx/reflect.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_reduce_sum
{
    op::reduce_sum op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "gpu::reduce_sum"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    // The last argument is the preallocated output buffer, which this op aliases.
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
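The translation unit that defines hip_reduce_sum::compute_shape and hip_reduce_sum::compute is not shown in this diff view. As a rough sketch only, assuming the usual MIGraphX GPU-op pattern (the preallocated output buffer arrives as the last argument and the HIP stream comes from the context), the forwarding code would look roughly like the following; the exact member names such as ctx.get_stream().get() and the shape-validation details are assumptions, not the commit's actual code:

// Sketch only (assumed, not part of this commit view): gpu-side reduce_sum.cpp
// forwarding hip_reduce_sum to the device kernel shown above.
#include <migraphx/gpu/reduce_sum.hpp>
#include <migraphx/gpu/device/reduce_sum.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_reduce_sum::compute_shape(std::vector<shape> inputs) const
{
    inputs.pop_back(); // drop the output allocation before validating the op's inputs
    return op.compute_shape(inputs);
}

argument hip_reduce_sum::compute(context& ctx,
                                 const shape&,
                                 const std::vector<argument>& args) const
{
    // args.front() is the input tensor, args.back() the preallocated output.
    device::reduce_sum(ctx.get_stream().get(), args.back(), args.front());
    return args.back();
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx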
@@ -47,6 +47,7 @@
#include <migraphx/gpu/lrn.hpp>
#include <migraphx/gpu/convert.hpp>
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/reduce_sum.hpp>
#include <utility>
#include <functional>
#include <algorithm>
@@ -105,6 +106,7 @@ struct miopen_apply
        add_extend_op<hip_pad, op::pad>("pad");
        add_extend_op<hip_convert, op::convert>("convert");
        add_extend_op<hip_clip, op::clip>("clip");
        add_extend_op<hip_reduce_sum, op::reduce_sum>("reduce_sum");
        add_lrn_op();
        add_convolution_op();
@@ -3445,4 +3445,16 @@ struct test_fp32_fp16_sub : verify_program<test_fp32_fp16_sub>
    };
};
struct test_reduce_sum : verify_program<test_reduce_sum>
{
    migraphx::program create_program() const
    {
        migraphx::program p;
        // Sum over axis 1: {3, 4, 8, 8} -> {3, 1, 8, 8}
        migraphx::shape s{migraphx::shape::float_type, {3, 4, 8, 8}};
        auto x = p.add_parameter("x", s);
        p.add_instruction(migraphx::op::reduce_sum{{1}}, x);
        return p;
    };
};
int main(int argc, const char* argv[]) { test::run(argc, argv); }