Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
@@ -3,31 +3,52 @@
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/math.hpp>
#include <migraphx/kernels/preload.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <migraphx/kernels/args.hpp>

namespace migraphx {
template <class T>
struct implicit_conversion_op
{
T x;
template <index_int N, class U>
constexpr operator vec<U, N>() const
{
        static_assert(vec_size<T>() == N, "Vector size mismatch");
return __builtin_convertvector(x, vec<U, N>);
}
template <class U>
constexpr operator U() const
{
return x;
}
};
template <class T>
constexpr implicit_conversion_op<T> implicit_conversion(T x)
{
return {x};
}
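// Usage sketch (illustrative, not part of this commit): implicit_conversion
// defers the result conversion to the assignment site, so a lambda returning
// vec<float, 4> can be written into a half-typed output element:
//   out[i] = implicit_conversion(f(xs[i]...)); // float4 -> half4 via
//                                              // __builtin_convertvector
// Scalar results fall through to the plain conversion operator instead.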
template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
    idx.global_stride(out.get_shape().elements(),
                      [&](auto i) { out[i] = implicit_conversion(f(xs[i]...)); });
}

template <class... Transforms>
__device__ auto pointwise(index idx, Transforms... transforms)
{
    return [=](auto f, auto*... ps) {
        auto t = transform_args(make_tensors(), rotate_last(), transforms...);
        t(ps...)([&](auto... xs) { pointwise_tensor(idx, f, xs...); });
    };
}
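// Call sketch (illustrative): a generated kernel would invoke the returned
// function with the functor and raw device pointers (output last, which
// rotate_last moves to the front before pointwise_tensor runs):
//   pointwise(idx, auto_vectorize())(op, x_ptr, y_ptr, out_ptr);
// transform_args wraps the pointers as tensor_views and applies the extra
// transforms (vectorize, preload, ...) before the element-wise loop.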
} // namespace migraphx
@@ -3,16 +3,25 @@
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/vec.hpp>

namespace migraphx {

template <class T>
struct remove_vec_impl
{
    using type = T;
};

template <class T, index_int N>
struct remove_vec_impl<vec<T, N>>
{
    using type = T;
};

template <class T>
using remove_vec = typename remove_vec_impl<T>::type;

template <class T, class... Shapes>
constexpr auto traverse_preload(Shapes... ss)
@@ -20,11 +29,11 @@ constexpr auto traverse_preload(Shapes... ss)
    return [=](auto f, auto... g) {
        index_int offset = 0;
        auto each        = [&](auto x) {
            using type          = remove_vec<typename decltype(x)::type>;
            constexpr auto s    = decltype(x.get_shape()){};
            constexpr auto size = s.element_space();
            if constexpr(not s.broadcasted() or (s.elements() - size) < 64 or
                         not is_same<T, type>{})
                return f(x, offset, false_type{});
            else
            {
@@ -39,7 +48,7 @@ constexpr auto traverse_preload(Shapes... ss)
}

template <class T, class... Shapes>
constexpr index_int compute_preload_size_c(Shapes...)
{
    index_int size = 0;
    traverse_preload<T>(Shapes{}...)(
@@ -47,6 +56,12 @@ constexpr index_int compute_preload_size(Shapes...)
    return size;
}

template <class T, class... Shapes>
constexpr auto compute_preload_size(Shapes...)
{
    return _c<compute_preload_size_c<T>(Shapes{}...)>;
}

template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
@@ -58,11 +73,21 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        [&](auto x, auto offset, auto copy) {
            if constexpr(copy)
            {
                if constexpr(decltype(tensor_vec_size(x)){} == 0)
                {
                    auto v = auto_vectorize(x);
                    auto b = as_vec(tensor_vec_size(v), buffer + offset);
                    idx.local_stride(v.get_shape().element_space(),
                                     [&](auto i) { b[i] = v.data()[i]; });
                    return x.with(buffer + offset);
                }
                else
                {
                    auto b = as_vec(tensor_vec_size(x), buffer + offset);
                    idx.local_stride(x.get_shape().element_space(),
                                     [&](auto i) { b[i] = x.data()[i]; });
                    return x.with(b);
                }
            }
            else
            {
@@ -72,23 +97,23 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        invoke);
}

template <class T, class Shape>
struct shape_type : Shape
{
    using type = T;
};

template <class T>
constexpr auto make_shape_type(T)
{
    return shape_type<typename T::type, typename T::shape_type>{};
}

template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
    using type               = remove_vec<T>;
    constexpr auto size      = decltype(compute_preload_size<type>(make_shape_type(xs)...)){};
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)
@@ -103,5 +128,47 @@ __device__ auto preload(index idx, Ts... xs)
    };
}
inline __device__ auto auto_preload(index idx)
{
return make_transform([=](auto f, auto out, auto... xs) {
preload<typename decltype(out)::type>(idx, xs...)([&](auto... ys) { f(out, ys...); });
});
}
template <bool B, class T>
__device__ auto preload_copy(index idx, T x)
{
return [=](auto f) {
if constexpr(B)
{
using type = typename T::type;
constexpr auto size = get_shape_c<T>{}.element_space();
__shared__ type buffer[size];
        // TODO: Always vectorize when size > 4, and then use a second loop for the remainder
constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; });
auto input = as_vec<n>(remove_bool(x.data()));
auto b = as_vec<n>(remove_bool(buffer));
idx.local_stride(size / n, [&](auto i) { b[i] = input[i]; });
return f(x.with(buffer));
}
else
{
return f(x);
}
};
}
template <bool... Bs>
__device__ auto auto_preload(index idx)
{
return make_transform([=](auto f, auto... xs) {
auto invoke = [=](auto... ys) {
__syncthreads();
f(ys...);
};
join(invoke, preload_copy<Bs>(idx, xs)...);
});
}
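// Usage sketch (illustrative): the Bs... flags choose per-argument whether to
// stage that tensor in LDS, e.g. preloading only the second of three arguments:
//   transform_args(make_tensors(), rotate_last(), auto_preload<false, true, false>(idx));
// Each flagged tensor is copied into a __shared__ buffer with a vectorized
// local_stride loop, and __syncthreads() runs before the kernel body reads it.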
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#define MIGRAPHX_GUARD_KERNELS_PRINT_HPP

#include <migraphx/kernels/hip.hpp>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/algorithm.hpp>
@@ -140,6 +140,10 @@ struct basic_printer
    {
        return print_ulong(value);
    }
__host__ __device__ const basic_printer& operator<<(migraphx::half value) const
{
return print_double(value);
}
    __host__ __device__ const basic_printer& operator<<(float value) const
    {
        return print_double(value);
#ifndef MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#define MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#include <migraphx/kernels/dpp.hpp>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/ops.hpp>
namespace migraphx {
#if MIGRAPHX_HAS_DPP
template <class T, class Op>
__device__ void dpp_reduce(T& in, Op op)
{
T out{};
out = dpp_mov<dpp_row_shr(1)>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(2)>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out);
#if __AMDGCN_WAVEFRONT_SIZE == 64
out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out);
#endif
}
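// A host-side sketch (illustrative, not part of this commit) of the pattern the
// DPP row-shift moves above implement: each step combines a lane with the lane
// `shift` below it, so after shifts of 1, 2, 4 and 8 the last lane of a 16-lane
// row holds the reduction of the whole row. The bank/row masks in the real code
// only skip lanes whose partial results are never read.
template <class Op>
constexpr float dpp_row_reduce_sketch(const float (&in)[16], Op op)
{
    float lanes[16];
    for(int i = 0; i < 16; i++)
        lanes[i] = in[i];
    const unsigned shifts[4] = {1, 2, 4, 8};
    for(unsigned shift : shifts)
    {
        float prev[16];
        for(int i = 0; i < 16; i++)
            prev[i] = lanes[i];
        for(unsigned i = shift; i < 16; i++)
            lanes[i] = op(prev[i], prev[i - shift]);
    }
    return lanes[15]; // the lane whose value block_reduce stores to LDS
}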
#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK)
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) x = 1
#elif __AMDGCN_WAVEFRONT_SIZE == 64
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) \
__asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_bcast:15 row_mask:0xa\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_bcast:31 row_mask:0xc\n" \
"s_nop 1\n" \
: "=v"(x) \
: "0"(x))
#else
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) \
__asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" \
"s_nop 1\n" \
"s_nop 1\n" \
: "=v"(x) \
: "0"(x))
#endif
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE(op, prefix) \
__device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); } \
__device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); } \
__device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); } \
__device__ inline void dpp_reduce(int32_t& x, op) \
{ \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); \
} \
__device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }
MIGRAPHX_DPP_REDUCE(op::sum, v_add)
MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul)
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
{
#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr index_int lanes_per_thread = 16;
#else
constexpr index_int lanes_per_thread = 64;
#endif
using type = decltype(f(0));
__shared__ type buffer[idx.nlocal() / lanes_per_thread];
type x = init;
idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op);
const auto ldsidx = idx.local / lanes_per_thread;
if((idx.local % lanes_per_thread) == lanes_per_thread - 1)
{
buffer[ldsidx] = x;
}
__syncthreads();
type y = init;
for(index_int i = 0; i < idx.nlocal() / lanes_per_thread; i++)
{
y = op(y, buffer[i]);
}
return y;
}
#else
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
{
using type = decltype(f(0));
__shared__ type buffer[idx.nlocal()];
type x = init;
idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
buffer[idx.local] = x;
__syncthreads();
for(index_int s = 1; s < idx.nlocal(); s *= 2)
{
const index_int index = 2 * s * idx.local;
if(index + s < idx.nlocal())
{
buffer[index] = op(buffer[index], buffer[index + s]);
}
__syncthreads();
}
return buffer[0];
}
#endif
template <class Output, class Input, class T>
constexpr auto reduce_slice(Input input, T i)
{
constexpr auto lens = transform(get_shape_c<Input>{}.lens,
get_shape_c<Output>{}.lens,
[](index_int x, index_int y) -> index_int {
if(x == y)
return 1;
return x;
});
constexpr auto s = make_shape(lens, get_shape_c<Input>{}.strides);
MIGRAPHX_ASSERT((input.get_shape().index(i) + s.element_space()) <=
input.get_shape().element_space());
return make_tensor_view(&input[i], s);
}
namespace reduce {
template <class Slicer, class F>
constexpr auto sliced(Slicer slicer, F f)
{
return [=](auto x, auto... xs) {
// TODO: assert all elements are the same
return f(slicer(x), slicer(xs)...);
};
}
struct block
{
template <class Slicer>
struct reducer
{
index idx;
Slicer slicer;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
return sliced(slicer, [=](auto x, auto... xs) {
return vec_reduce(block_reduce(idx,
op,
init,
x.get_shape().elements(),
[&](auto j) { return read(x[j], xs[j]...); }),
op);
});
}
template <class F>
__device__ void outer(F f) const
{
if(idx.local == 0)
f();
}
};
template <class Slicer>
static __device__ auto make(index idx, Slicer slicer)
{
return reducer<Slicer>{idx, slicer};
}
template <class Output, class F>
static __device__ void run(F f)
{
auto idx = make_index();
constexpr auto nelements = get_shape_c<Output>{}.elements();
idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
});
}
};
struct lane
{
template <class Slicer>
struct reducer
{
index idx;
Slicer slicer;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
return sliced(slicer, [=](auto x, auto... xs) {
using type = typename decltype(x)::type;
type r = init;
for(index_int j = 0; j < x.get_shape().elements(); j++)
{
r = op(r, read(x[j], xs[j]...));
}
return r;
});
}
template <class F>
__device__ void outer(F f) const
{
f();
}
};
template <class Slicer>
static __device__ auto make(index idx, Slicer slicer)
{
return reducer<Slicer>{idx, slicer};
}
template <class Output, class F>
static __device__ void run(F f)
{
auto idx = make_index();
constexpr auto nelements = get_shape_c<Output>{}.elements();
idx.global_stride(nelements, [&](auto i) {
const auto out_idx = get_shape_c<Output>{}.multi(i);
f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
});
}
};
} // namespace reduce
template <class Algo,
class Op,
class T,
class Input,
class Output,
class ReadInput,
          class WriteOutput>
__device__ void
simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOutput write)
{
Algo::template run<Output>([&](auto out_idx, auto r) {
auto x = r.reduce(op, init, read)(input);
r.outer([&] { output[out_idx] = write(x); });
});
}
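// Usage sketch (illustrative, not part of this commit): reducing each row of an
// [N, K] input into an [N, 1] output, one block per output element, assuming
// `in` and `out` are tensor_views of those shapes:
template <class Input, class Output>
__device__ void row_sum_sketch(Input in, Output out)
{
    simple_reduce<reduce::block>(op::sum{},
                                 0.0f,
                                 in,
                                 out,
                                 [](auto x) { return x; },  // read each element unchanged
                                 [](auto x) { return x; }); // write the reduced value unchanged
}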
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/dfor.hpp>
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/math.hpp>
#include <migraphx/kernels/array.hpp>
namespace migraphx {
struct max_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest{}; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return max(x, y);
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int)
{
return (x);
}
};
struct avg_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return 0.0; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return x + y;
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y)
{
return (y == 0) ? 0.0 : (x / y);
}
};
template <class Iterator, class Op>
MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
const Iterator data, const array<index_int, 2>& dims, array<float, 2> xy, Op pooling)
{
array<int, 2> low{};
array<int, 2> high{};
for(index_int ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
return 0;
}
xy[ii] = migraphx::max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
array<index_int, 4> locs = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
array<typename Iterator::value_type, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
return pooling(v01, v23);
}
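// Worked example (illustrative): for a sample point offset (ly, lx) = (0.25, 0.75)
// inside a cell, the corner weights above are
//   ws = {hy*hx, hy*lx, ly*hx, ly*lx}
//      = {0.75*0.25, 0.75*0.75, 0.25*0.25, 0.25*0.75}
//      = {0.1875, 0.5625, 0.0625, 0.1875}   // sums to 1
// so avg_pool accumulates a standard bilinear sample, while max_pool keeps the
// largest weighted corner instead.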
template <class Iterator, class Op>
MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
const array<float, 2>& roi_starts,
const array<float, 2>& bin_size,
const array<int, 2>& idx,
const array<index_int, 2>& bin_grid_size,
const array<index_int, 2>& dims,
float roi_offset,
Op op)
{
typename Iterator::value_type output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
array<index_int, 2> id = {iy, ix};
array<float, 2> locs =
roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
auto val = bilinear_interpolate(data, dims, locs, op);
output_val = op(output_val, val);
});
return op.final(output_val, count);
}
template <class T1, class T2, class T3, class T4>
struct roalign_settings
{
T1 roi_offset{};
T2 is_avg_pooling{};
T3 sampling_ratio{};
T4 spatial_scale{};
};
template <class... Ts>
constexpr roalign_settings<Ts...> make_roalign_settings(Ts... xs)
{
return {xs...};
}
template <class T, class U, class V, class W, class Settings>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t, Settings s)
{
auto index = make_index();
const auto x = x_t.begin();
const auto rois = rois_t.begin();
const auto ind = ind_t.begin();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
// input dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
array<index_int, 2> in_dims = {x_lens[2], x_lens[3]};
const auto stride = index.nglobal();
auto out_s = y_t.get_shape();
auto roi_column_num = rois_t.get_shape().lens[1];
// output dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
const auto& out_lens = out_s.lens;
array<index_int, 2> out_dims = {out_lens[2], out_lens[3]};
for(index_int i = index.global; i < out_s.elements(); i += stride)
{
auto idx = out_s.multi(i);
int n = idx[0];
int c = idx[1];
int ph = idx[2];
int pw = idx[3];
const auto offset_rois = rois + (n * roi_column_num);
const int batch_ind = ind[n];
array<float, 2> roi_starts = {offset_rois[1] * s.spatial_scale,
offset_rois[0] * s.spatial_scale};
array<float, 2> roi_ends = {offset_rois[3] * s.spatial_scale,
offset_rois[2] * s.spatial_scale};
array<float, 2> roi_size{};
array<float, 2> bin_size{};
array<index_int, 2> bin_grid_size{};
for(index_int ii = 0; ii < roi_size.size(); ++ii)
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = migraphx::max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] = (s.sampling_ratio > 0)
? s.sampling_ratio
: migraphx::ceil(roi_size[ii] / out_dims[ii]);
}
const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(s.is_avg_pooling)
{
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
avg_pool{});
}
else
{
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
max_pool{});
}
}
}
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_KERNELS_SCATTERND_HPP
#define MIGRAPHX_GUARD_KERNELS_SCATTERND_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/algorithm.hpp>
namespace migraphx {
struct assign_none
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
{
x = y;
}
};
struct assign_add
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
{
x += y;
}
};
struct assign_mul
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
{
x *= y;
}
};
template <class T, class U, class V, class F>
__device__ void scatternd(const T& indices_t, const U& updates_t, const V& output_t, F f)
{
auto index = make_index();
auto updates_shape = updates_t.get_shape();
index.global_stride(updates_shape.elements(), [&](auto i) {
auto output_shape = output_t.get_shape();
auto indices_shape = indices_t.get_shape();
auto k = indices_shape.lens.back();
auto q = indices_shape.lens.size();
auto updates_idx = updates_shape.multi(i);
auto indices_idx = indices_shape.multi(0);
copy(updates_idx.begin(), updates_idx.begin() + q - 1, indices_idx.begin());
auto index_start = indices_t.begin() + indices_shape.index(indices_idx);
auto index_end = index_start + k;
auto out_idx = output_shape.multi(0);
copy(index_start, index_end, out_idx.begin());
copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);
f(output_t[out_idx], updates_t[i]);
});
}
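// Worked example (illustrative): with output shape [4, 4], indices shape [2, 1]
// holding {{1}, {3}} (so q = 2, k = 1) and updates shape [2, 4], each update row
// is routed to the output row named by its index tuple:
//   output[1][:] = updates[0][:]
//   output[3][:] = updates[1][:]
// With assign_add or assign_mul the rows are accumulated instead of overwritten.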
} // namespace migraphx
#endif
@@ -17,35 +17,38 @@ struct shape
    constexpr shape(Lens l, Strides s) : lens(l), strides(s) {}

    constexpr auto elements() const { return _c<Lens{}.product()>; }
    constexpr auto element_space() const { return _c<Strides{}.dot(Lens{} - 1) + 1>; }
    constexpr auto packed() const { return elements() == element_space(); }
    constexpr auto broadcasted() const { return _c<Strides{}.product() == 0>; }
    constexpr auto transposed() const
    {
        return return_c([] {
            auto lstrides = Strides{};
            if(shape{}.broadcasted())
            {
                index_array s{};
                index_int j = 0;
                for(index_int i = 0; i < s.size(); i++)
                {
                    if(lstrides[i] != 0)
                    {
                        s[j] = lstrides[i];
                        j++;
                    }
                }
                return not is_sorted(s.begin(), s.begin() + j, greater{});
            }
            else
            {
                return not is_sorted(lstrides.begin(), lstrides.end(), greater{});
            }
        });
    }

    constexpr auto standard() const { return packed() and not transposed(); }

    constexpr index_int index(index_array x) const { return x.dot(strides); }
@@ -63,10 +66,10 @@ struct shape
            return i;
        else
        {
            const auto rank  = this->lens.size();
            index_int s      = 1;
            index_int result = 0;
            for(index_int j = 0; j < rank; j++)
            {
                const index_int k      = rank - j - 1;
                const index_int stride = this->strides[k];
@@ -80,11 +83,12 @@ struct shape
            }
        }
    }

    /// Convert single index into a multi-index
    constexpr index_array multi(index_int idx) const
    {
        index_array result;
        index_int tidx = idx;
        for(diff_int is = result.size() - 1; is > 0; is--)
        {
            result[is] = tidx % lens[is];
            tidx       = tidx / lens[is];
@@ -92,6 +96,13 @@ struct shape
        result[0] = tidx;
        return result;
    }
/// Convert multi-index into a single index
constexpr index_int single(index_array idx) const
{
if(idx.empty())
return 0;
return inner_product(lens.begin() + 1, lens.end(), idx.begin(), idx.back());
}
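    // Worked example (illustrative): with lens = {2, 3, 4}, multi(17) yields
    // {1, 1, 1} since 17 = 1*(3*4) + 1*4 + 1, and single() maps that
    // multi-index back to the flat element index.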
    constexpr shape get_shape() const { return *this; }
@@ -3,28 +3,62 @@
#include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
#include <migraphx/kernels/iota_iterator.hpp>

namespace migraphx {
template <class T>
struct tensor_view_iterator_read
{
T* view;
constexpr auto& operator()(index_int n) const
{
MIGRAPHX_ASSERT(view != nullptr);
return (*view)[n];
}
};
template <class T, class Shape>
struct tensor_view
{
    using type        = T;
    using shape_type  = Shape;
    using index_array = typename Shape::index_array;
    using iterator = basic_iota_iterator<tensor_view_iterator_read<const tensor_view>, index_int>;

    constexpr Shape get_shape() const { return Shape{}; }
    constexpr auto size() const { return get_shape().elements(); }

    struct index_to_offset
    {
        index_int offset;
        template <class U>
        constexpr index_to_offset(U i) : offset(Shape{}.index(i))
        {
        }
    };

    constexpr T& operator[](MIGRAPHX_CAPTURE_SOURCE_LOCATION(index_to_offset) i) const
    {
        index_to_offset ito = i;
        MIGRAPHX_WARN(ito.offset < get_shape().element_space(),
                      i,
                      "Out of bounds access at offset: ",
                      ito.offset);
        return x[ito.offset];
    }

    constexpr T* data() const { return x; }
    constexpr auto begin() const { return iterator{0, {this}}; }
    constexpr auto end() const { return iterator{this->size(), {this}}; }
constexpr auto begin_at(index_array i) const
{
MIGRAPHX_ASSERT(get_shape().single(i) < get_shape().elements());
MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space());
return iterator{get_shape().single(i), {this}};
}
    template <class U>
    constexpr tensor_view<U, Shape> with(U* y) const
@@ -36,6 +70,9 @@ struct tensor_view
    T* x;
};
template <class T>
using get_shape_c = typename T::shape_type;
template <class T, class Shape>
constexpr tensor_view<T, Shape> make_tensor_view(T* x, Shape)
{
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <class T, class U = T&&>
U private_declval(int);
template <class T>
T private_declval(long);
template <class T>
auto declval() noexcept -> decltype(private_declval<T>(0));
template <class T>
struct type_identity
{
using type = T;
};
template <bool B, class T = void>
struct enable_if
{
};
template <class T>
struct enable_if<true, T>
{
using type = T;
};
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
template <bool B, class T, class F>
struct conditional
{
using type = T;
};
template <class T, class F>
struct conditional<false, T, F>
{
using type = F;
};
template <bool B, class T, class F>
using conditional_t = typename conditional<B, T, F>::type;
// NOLINTNEXTLINE
#define MIGRAPHX_BUILTIN_TYPE_TRAIT1(name) \
template <class T> \
struct name : bool_constant<__##name(T)> \
{ \
}
// NOLINTNEXTLINE
#define MIGRAPHX_BUILTIN_TYPE_TRAIT2(name) \
template <class T, class U> \
struct name : bool_constant<__##name(T, U)> \
{ \
}
// NOLINTNEXTLINE
#define MIGRAPHX_BUILTIN_TYPE_TRAITN(name) \
template <class... Ts> \
struct name : bool_constant<__##name(Ts...)> \
{ \
}
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_arithmetic);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_destructible);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_nothrow_destructible);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_pointer);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_scalar);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_signed);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_void);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_abstract);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_aggregate);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_array);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_class);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_compound);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_const);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_empty);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_enum);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_final);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_floating_point);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_function);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_fundamental);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_integral);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_literal_type);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_lvalue_reference);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_function_pointer);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_object_pointer);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_pointer);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_object);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_pod);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_polymorphic);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_reference);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_rvalue_reference);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_standard_layout);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivial);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivially_destructible);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_union);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_unsigned);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_volatile);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_assignable);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_base_of);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_convertible);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_nothrow_assignable);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_same);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_trivially_assignable);
MIGRAPHX_BUILTIN_TYPE_TRAITN(is_constructible);
MIGRAPHX_BUILTIN_TYPE_TRAITN(is_nothrow_constructible);
MIGRAPHX_BUILTIN_TYPE_TRAITN(is_trivially_constructible);
template <class T>
struct remove_reference
{
using type = T;
};
template <class T>
struct remove_reference<T&>
{
using type = T;
};
template <class T>
struct remove_reference<T&&>
{
using type = T;
};
template <class T>
using remove_reference_t = typename remove_reference<T>::type;
template <class T>
struct add_pointer : type_identity<typename remove_reference<T>::type*>
{
};
template <class T>
using add_pointer_t = typename add_pointer<T>::type;
template <class... Ts>
struct common_type;
template <class T>
struct common_type<T>
{
using type = T;
};
template <class T, class U>
struct common_type<T, U>
{
using type = decltype(true ? declval<T>() : declval<U>());
};
template <class T, class U, class... Us>
struct common_type<T, U, Us...>
{
using type = typename common_type<typename common_type<T, U>::type, Us...>::type;
};
template <class... Ts>
using common_type_t = typename common_type<Ts...>::type;
// Largest signed value representable in n bytes; shift as unsigned long to
// avoid overflowing a 32-bit literal for n >= 4
constexpr unsigned long int_max(unsigned long n) { return (1ul << (n * 8 - 1)) - 1; }

template <class T>
constexpr T numeric_max()
{
    if constexpr(is_integral<T>{})
    {
        if constexpr(is_unsigned<T>{})
            return int_max(sizeof(T)) * 2 + 1;
        else
            return int_max(sizeof(T));
    }
}
else if constexpr(is_same<T, double>{})
return __DBL_MAX__;
else if constexpr(is_same<T, float>{})
return __FLT_MAX__;
else if constexpr(is_same<T, migraphx::half>{})
return __FLT16_MAX__;
else
return 0;
}
template <class T>
constexpr T numeric_lowest()
{
if constexpr(is_integral<T>{})
{
if constexpr(is_unsigned<T>{})
return 0;
else
return -numeric_max<T>() - 1;
}
else
{
return -numeric_max<T>();
}
}
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP

#include <migraphx/kernels/hip.hpp>

namespace migraphx {

using index_int = std::uint32_t;
using diff_int  = std::int32_t;

#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT

template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));

using half  = _Float16;
using half2 = migraphx::vec<half, 2>;

} // namespace migraphx
#endif
@@ -3,6 +3,7 @@
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/functional.hpp>

namespace migraphx {
@@ -13,7 +14,7 @@ constexpr auto vec_size(vec<T, N>)
}

template <class T>
constexpr auto vec_size(T, ...) // NOLINT
{
    return index_constant<0>{};
}
@@ -24,14 +25,140 @@ constexpr auto vec_size()
    return decltype(vec_size(T{})){};
}
template <class... Ts>
constexpr auto is_any_vec()
{
if constexpr(sizeof...(Ts) == 0)
return false_type{};
else
return bool_constant<((vec_size<Ts>() + ...) > 0)>{};
}
template <class T, class I>
constexpr auto vec_at(T x, I i)
{
if constexpr(vec_size<T>() == 0)
return x;
else
{
MIGRAPHX_ASSERT(i < vec_size<T>());
return x[i];
}
}
template <class T>
using vec_type = decltype(vec_at(T{}, 0));
template <class... Ts>
constexpr auto common_vec_size()
{
return fold([](auto x, auto y) {
if constexpr(x > y)
return x;
else
return y;
})(vec_size<Ts>()...);
}
// Bools cannot be used as a vector type, so convert them to uint8
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
return x;
}
inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
template <index_int N, class T> template <index_int N, class T>
__device__ __host__ auto as_vec(T* x) __device__ __host__ auto as_vec(T* x)
{ {
if constexpr(N == 0) if constexpr(N < 2)
return x; return x;
else else
return reinterpret_cast<vec<T, N>*>(x); return reinterpret_cast<vec<T, N>*>(x);
} }
template <class T, index_int N>
using safe_vec = vec<conditional_t<is_same<T, bool>{}, uint8_t, T>, N>;
template <class... Ts>
constexpr auto vec_transform(Ts... xs)
{
return [=](auto f) {
if constexpr(is_any_vec<Ts...>())
{
using type = decltype(f(vec_at(xs, 0)...));
constexpr auto size = common_vec_size<Ts...>();
safe_vec<type, size> result = {0};
for(int i = 0; i < size; i++)
result[i] = f(vec_at(xs, i)...);
return result;
}
else
{
return f(xs...);
}
};
}
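// Usage sketch (illustrative): vec_transform applies a scalar function across
// the lanes of its vector arguments and broadcasts any scalar arguments, e.g.
// a fused multiply-add of two vec<float, 4> values with a plain float bias:
__device__ inline vec<float, 4> fma_sketch(vec<float, 4> a, vec<float, 4> b, float c)
{
    return vec_transform(a, b, c)([](float x, float y, float z) { return x * y + z; });
}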
// Return a vector of size N taken from index i of a larger vector;
// N will be 2 for half2 packing
template <index_int N, class T, class I>
constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
{
if constexpr(vec_size<T>() == 0)
return vec<T, N>{x};
else
{
        MIGRAPHX_ASSERT((i + N) <= vec_size<T>()); // allow reading the last full pack
vec<vec_type<T>, N> result = {0};
for(int j = 0; j < N; j++)
{
result[j] = x[i + j];
}
return result;
}
}
template <index_int N, class... Ts>
constexpr auto vec_packed_transform(Ts... xs)
{
return [=](auto f) {
if constexpr(is_any_vec<Ts...>())
{
using type = vec_type<decltype(f(vec_packed_at<N>(xs, 0)...))>;
constexpr auto size = common_vec_size<Ts...>();
safe_vec<type, size> result = {0};
for(int i = 0; i < size / N; i++)
{
// Call the function with packed vectors
safe_vec<type, N> r = f(vec_packed_at<N>(xs, i * N)...);
// Copy the packed vectors to the result
for(int j = 0; j < N; j++)
result[i * N + j] = r[j];
}
return result;
}
else
{
return f(xs...);
}
};
}
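// Usage sketch (illustrative): process a vec<half, 8> two lanes at a time so
// the callback sees half2 packs, e.g. to target packed f16 instructions:
__device__ inline vec<half, 8> square_packed_sketch(vec<half, 8> x)
{
    return vec_packed_transform<2>(x)([](half2 p) { return p * p; });
}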
template <class T, class Op>
constexpr auto vec_reduce(T x, Op op)
{
if constexpr(vec_size<T>() < 2)
return x;
else
{
vec_type<T> result = x[0];
for(int i = 1; i < vec_size<T>(); i++)
result = op(result, x[i]);
return result;
}
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
@@ -7,59 +7,71 @@
namespace migraphx {

template <class T>
constexpr auto tensor_vec_size()
{
    return vec_size<typename T::type>();
}

template <class T>
constexpr auto tensor_vec_size(T)
{
    return tensor_vec_size<T>();
}

template <index_int N, class Shape, class Axis>
constexpr auto shape_step(Shape s, Axis)
{
    static_assert(N > 0, "Vector size must be non-zero");
    return sequence(s.lens.size(), [&](auto... is) {
        auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            MIGRAPHX_ASSERT(i != 0);
            MIGRAPHX_ASSERT(j != axis or i % N == 0);
            if(j == axis)
                return i / N;
            else
                return i;
        });
        auto strides = transform(s.strides, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            // If stride of the axis is zero then we dont need to adjust the other strides
            if(Shape{}.strides[axis] == 0)
                return i;
            MIGRAPHX_ASSERT(j == axis or i % N == 0);
            if(j == axis)
                return i;
            else
                return i / N;
        });
        MIGRAPHX_ASSERT(make_shape(lens, strides).elements() * N == s.elements());
        MIGRAPHX_ASSERT(strides[Axis{}] == 0 or
                        make_shape(lens, strides).element_space() * N == s.element_space());
        return make_shape(lens, strides);
    });
}
template <index_int N, class T, class Axis>
__device__ __host__ auto as_vec(T x, Axis axis)
{
    if constexpr(N < 2)
        return x;
    else
        return make_tensor_view(as_vec<N>(remove_bool(x.data())),
                                shape_step<N>(x.get_shape(), axis));
}

template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis axis)
{
    if constexpr(N < 2)
    {
        return x;
    }
    else
    {
        constexpr auto s = decltype(x.get_shape()){};
        MIGRAPHX_ASSERT(s.strides[axis] == 0);
        return make_tensor_view(x.data(), shape_step<N>(s, axis));
    }
}
@@ -69,55 +81,85 @@ __device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
    return as_vec<ic>(x);
}

template <class Shape>
constexpr index_int find_vector_axis_c(Shape s)
{
    // Find the fastest axis that is not broadcasted
    index_int axis = 0;
    for(index_int i = 1; i < s.lens.size(); i++)
    {
        if(s.strides[i] == 0)
            continue;
        if(s.strides[axis] == 0 or
           pack_compare(less{}, pack(s.strides[i], s.lens[i]), pack(s.strides[axis], s.lens[axis])))
            axis = i;
    }
    return axis;
}

template <class... Shapes>
constexpr index_int find_vector_axis_c(Shapes... ss)
{
    const bool all_broadcasted = (ss.broadcasted() and ...);
    index_int axis             = 0;
    bool b                     = false;
    by([&](auto s) {
        if(b)
            return;
        // Skip broadcasted shapes if there are shapes not broadcasted
        if(not all_broadcasted and s.broadcasted())
            return;
        axis = find_vector_axis_c(s);
        if(s.strides[axis] == 1)
            b = true;
    })(ss...);
    if(not b)
        return -1;
    return axis;
}

template <class... Shapes>
constexpr auto find_vector_axis(Shapes...)
{
    return _c<find_vector_axis_c(Shapes{}...)>;
}

template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable_c(Axis axis, Shapes... ss)
{
    return ((axis < ss.lens.size() and ss.lens[axis] % N == 0 and
             // Only vectorize broadcasted types with stride 0, since this causes issues in the
             // preloader
             ((not ss.broadcasted() and ss.strides[axis] == 1) or ss.strides[axis] == 0)) and
            ...);
}

template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis, Shapes...)
{
    return _c<is_vectorizable_c<N>(Axis::to(), Shapes{}...)>;
}

template <class P>
constexpr auto find_vectorize_size(P pred)
{
    if constexpr(decltype(pred(_c<4>)){})
        return _c<4>;
    else if constexpr(decltype(pred(_c<2>)){})
        return _c<2>;
    else
        return _c<1>;
}

template <class T>
__host__ __device__ auto auto_vectorize(T x)
{
    if constexpr(tensor_vec_size<T>() == 0)
    {
        constexpr auto axis = find_vector_axis(x.get_shape());
        constexpr auto n =
            find_vectorize_size([&](auto i) { return is_vectorizable<i>(axis, x.get_shape()); });
        return as_vec<n>(x, axis);
    }
    else
    {
@@ -125,34 +167,73 @@ __host__ __device__ auto vectorize(T x)
    }
}
template <class F, class... Ts>
inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
{
    // TODO: Just check there is a single axis with stride 1
constexpr bool packed_or_broadcasted =
((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
if constexpr(packed_or_broadcasted)
{
constexpr auto axis = decltype(find_vector_axis(xs.get_shape()...)){};
constexpr auto n = find_vectorize_size(
[&](auto i) { return is_vectorizable<i>(axis, xs.get_shape()...); });
by(
[&](auto x) {
constexpr auto s = decltype(x.get_shape()){};
if constexpr(axis < s.strides.size())
{
MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
MIGRAPHX_ASSERT(s.lens[axis] > 0);
MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0);
if constexpr(s.strides[axis] == 0)
return tensor_step<n>(x, axis);
else
return as_vec<n>(x, axis);
}
else
{
return x;
}
},
f)(xs...);
}
else
{
f(xs...);
}
}
inline __device__ __host__ auto auto_vectorize()
{
    return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); });
}

template <index_int N, index_int Axis, class T>
__device__ __host__ auto vectorize_tensor(T x)
{
    constexpr auto shape = get_shape_c<T>{};
    if constexpr(shape.lens[Axis] == 1)
        return x;
    else if constexpr(shape.strides[Axis] == 0)
        return tensor_step<N>(x, _c<Axis>);
    else
        return as_vec<N>(x, _c<Axis>);
}

template <index_int N, index_int Axis>
__device__ __host__ auto vectorize()
{
    return make_transform([](auto f, auto... xs) {
        if constexpr(N < 2)
        {
            f(xs...);
        }
        else
        {
            f(vectorize_tensor<N, Axis>(xs)...);
        }
    });
}
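// Composition sketch (illustrative): the host-side code generator would pick N
// and Axis and compose this with the other transforms for a pointwise kernel,
// e.g. vectorizing by 4 along axis 0 (pointer names hypothetical):
//   pointwise(idx, vectorize<4, 0>())(op, x_ptr, y_ptr, out_ptr);
// vectorize_tensor rewrites each tensor_view argument, stepping broadcast
// (stride-0) axes and packing contiguous axes into vec<T, N> elements.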
} // namespace migraphx
#include <migraphx/run_loop.hpp>
#include <migraphx/gpu/loop.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/fill.hpp>
#include <unordered_map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_loop::compute_shape(std::vector<shape> inputs, std::vector<module_ref> mods) const
{
auto input_num = (inputs.size() - 2) / 2;
inputs.erase(inputs.begin() + input_num, inputs.end());
return op.compute_shape(inputs, std::move(mods));
}
struct gpu_loop
{
int64_t max_iterations = 0;
template <class T>
void copy(context& ctx, const argument& src, T& dst) const
{
argument arg_dst{src.get_shape(), &dst};
copy_from_gpu(ctx, src, arg_dst);
}
template <class T>
void copy(context& ctx, T src, const argument& dst) const
{
argument arg_src{dst.get_shape(), &src};
copy_to_gpu(ctx, arg_src, dst);
}
void append(const std::vector<argument>&, const std::vector<argument>&, int) const {}
void set_zero(context& ctx, const std::vector<argument>& concatenated_outputs, int iter) const
{
if(iter >= max_iterations)
return;
auto elem_num = max_iterations - iter;
for(const auto& out : concatenated_outputs)
{
auto s = out.get_shape();
auto size = s.bytes() / max_iterations;
auto lens = s.lens();
lens[0] = elem_num;
shape ss{s.type(), lens};
assert(ss.bytes() + iter * size <= out.get_shape().bytes());
device::fill(ctx.get_stream().get(), argument(ss, out.data() + iter * size), 0);
}
}
std::unordered_map<std::string, int> get_output_params(const module& m) const
{
auto get_output_index = [](const std::string& name) {
std::string out_prefix = "#output_";
auto loc = name.find(out_prefix);
if(loc != std::string::npos)
{
int index = std::stoi(name.substr(loc + out_prefix.size()));
return index;
}
return -1;
};
const auto& param_names = m.get_parameter_names();
std::unordered_map<std::string, int> result;
for(const auto& name : param_names)
{
auto index = get_output_index(name);
if(index == -1)
continue;
result[name] = index;
}
return result;
}
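    // Example (illustrative): a sub-module parameter named "x#output_2" maps to
    // output index 2; parameters without the "#output_" tag are skipped.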
};
argument
hip_loop::compute(context& ctx,
const shape&,
const std::vector<argument>& args,
const std::vector<module_ref>& mods,
const std::function<std::vector<argument>(
module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
return run_loop(gpu_loop{op.max_iterations}, ctx, args, mods, run);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <iterator>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
@@ -22,6 +23,7 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/elu.hpp>
#include <migraphx/gpu/equal.hpp>
#include <migraphx/gpu/gemm.hpp>
@@ -37,6 +39,8 @@
#include <migraphx/gpu/quant_convolution.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/unary_not.hpp>
#include <migraphx/gpu/where.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <utility>
@@ -57,6 +61,7 @@ struct miopen_apply
    std::unordered_map<instruction_ref, std::string> prog_output_names{};
    bool offload_copy   = false;
    bool int8_x4_format = true;
    bool compute_fp32   = false;

    context& get_context() const
    {
@@ -93,13 +98,22 @@ struct miopen_apply
        }
    }

    const std::unordered_set<std::string>& get_rocblas_fp32_archs()
    {
        static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
        return supported_archs;
    }

    void init()
    {
        assert(mod != nullptr);
        assert(pass != nullptr);

#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
        auto& ctx              = get_context();
        const auto device_name = trim(split_string(get_device_name(), ':').front());
        if(contains(get_rocblas_fp32_archs(), device_name))
            compute_fp32 = true;

        rocblas_gemm_flags flag;
        rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
        int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
@@ -149,6 +163,7 @@ struct miopen_apply
        add_generic_op("sub");
        add_generic_op("tan");
        add_generic_op("tanh");
        add_generic_op("where");

        add_extend_op("abs");
        add_extend_op("argmax");
@@ -161,29 +176,29 @@ struct miopen_apply
        add_extend_op("leaky_relu");
        add_extend_op("logsoftmax");
        add_extend_op("lrn");
        add_extend_op("multinomial");
        add_extend_op("nonzero");
        add_extend_op("pad");
        add_extend_op("pooling");
        add_extend_op("prefix_scan_sum");
        add_extend_op("reduce_max");
        add_extend_op("reduce_mean");
        add_extend_op("reduce_min");
        add_extend_op("reduce_prod");
        add_extend_op("reduce_sum");
        add_extend_op("reverse");
        add_extend_op("rnn_var_sl_last_output");
        add_extend_op("rnn_var_sl_shift_output");
        add_extend_op("rnn_var_sl_shift_sequence");
        add_extend_op("scatter_none");
        add_extend_op("softmax");
        add_extend_op("topk");

        add_batch_norm_inference_op();
        add_convolution_op();
        add_deconvolution_op();
        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");
        add_if_op();
        add_loop_op();
        add_neg_op();
        add_nms_op();
        add_quant_convolution_op();
    }

    void copy_params()
@@ -196,6 +211,10 @@ struct miopen_apply
            if(ins->name() != "@param")
                continue;
            // parameter has no outputs, so there is no need to insert a copy to gpu
if(ins->outputs().empty())
continue;
            auto pos = std::next(ins);
            auto a   = insert_allocation(pos, ins->get_shape());
            auto c   = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
@@ -233,11 +252,28 @@ struct miopen_apply
            {
                check_shape(s, apply_map.at(it->name())(it));
            }
else if(has_compiler_for(it->name()))
{
check_shape(s, insert_precompile_op(it));
}
        }
        copy_params();
    }
instruction_ref insert_precompile_op(instruction_ref ins)
{
auto output = insert_allocation(ins, ins->get_shape());
std::vector<instruction_ref> refs = ins->inputs();
refs.push_back(output);
return mod->replace_instruction(
ins,
make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
refs,
ins->module_inputs());
}
    instruction_ref insert_allocation(instruction_ref ins, const shape& s, std::string tag = "")
    {
        // Instruction's output is an input of the ret instruction
@@ -294,17 +330,14 @@ struct miopen_apply
        });
    }
    template <typename Op>
    void add_gemm_op(const std::string& name)
    {
        apply_map.emplace(name, [=](instruction_ref ins) {
            std::vector<instruction_ref> refs = ins->inputs();
            if(refs.size() == 2)
            {
                auto output = insert_allocation(ins, ins->get_shape());
                refs.push_back(output);
            }
            else
@@ -323,9 +356,8 @@ struct miopen_apply
                    refs.push_back(refs.back());
                }
            }
            return mod->replace_instruction(
                ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format, compute_fp32}, refs);
        });
    }
@@ -333,8 +365,22 @@ struct miopen_apply
    {
        apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
            auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
            shape ws;
            miopen_quant_convolution conv;
auto compile_quant_conv_with_format = [&](bool format) {
conv = miopen_quant_convolution{op, format, make_conv(op)};
ws = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
};
try
{
compile_quant_conv_with_format(int8_x4_format);
}
catch(migraphx::exception&)
{
// In case no solver supports the default format, retry using the other format.
compile_quant_conv_with_format(!int8_x4_format);
}
            auto args      = ins->inputs();
            auto workspace = insert_allocation(ins, ws, "workspace");
@@ -344,6 +390,9 @@ struct miopen_apply
        });
    }
    // add_generic_op constructs the operator default-constructed, since it has no
    // fields, whereas add_extend_op copies the fields over from the original operator
void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); } void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); }
void add_generic_op(const std::string& op_name, const std::string& gpu_name) void add_generic_op(const std::string& op_name, const std::string& gpu_name)
...@@ -405,7 +454,6 @@ struct miopen_apply ...@@ -405,7 +454,6 @@ struct miopen_apply
reshapes[2], reshapes[2],
reshapes[3], reshapes[3],
output); output);
}); });
} }
...@@ -422,7 +470,7 @@ struct miopen_apply ...@@ -422,7 +470,7 @@ struct miopen_apply
}); });
} }
// replace the if operator with gpu_if operator // add input and output arguments for the if operator
void add_if_op() void add_if_op()
{ {
apply_map.emplace("if", [=](instruction_ref ins) { apply_map.emplace("if", [=](instruction_ref ins) {
...@@ -461,9 +509,68 @@ struct miopen_apply ...@@ -461,9 +509,68 @@ struct miopen_apply
return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args); return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args);
}); });
} }
// replace the loop operator with the gpu::loop operator
void add_loop_op()
{
apply_map.emplace("loop", [=](instruction_ref ins) {
std::vector<instruction_ref> inputs = ins->inputs();
// copy max_iter and the condition from gpu to cpu
auto cpu_max_iter =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0));
auto cpu_cond =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1));
auto synced_max_iter =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond);
inputs.at(0) = synced_max_iter;
inputs.at(1) = cpu_cond;
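// append an allocation matching each input's shape to hold the loop's carried values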
auto copy_inputs = inputs;
std::transform(
copy_inputs.begin(), copy_inputs.end(), std::back_inserter(inputs), [&](auto in) {
return mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(in->get_shape())}}));
});
auto mod_args = ins->module_inputs();
auto output = insert_allocation(ins, ins->get_shape());
const auto* sub_mod = mod_args.front();
auto cond_out = mod->insert_instruction(
ins,
make_op("hip::allocate",
{{"shape", to_value(sub_mod->get_output_shapes().front())}}));
// add cond and mod outputs to the argument list
inputs.push_back(cond_out);
inputs.push_back(output);
return mod->replace_instruction(
ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
});
}
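// run nonmaxsuppression on the host: copy the inputs from the gpu, compute with
// the reference op, then copy the result back into the output allocation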
void add_nms_op()
{
apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> cpu_inputs;
auto inputs = ins->inputs();
std::transform(
inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
});
cpu_inputs.front() =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
auto gpu_out =
mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
return mod->replace_instruction(ins, gpu_out);
});
}
}; };
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); } void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
#include <migraphx/gpu/multinomial.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
#include <migraphx/check_shapes.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_multinomial::compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).only_dims(2).standard();
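// the last input is the preallocated output buffer; drop it before computing the shape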
inputs.pop_back();
return op.compute_shape(inputs);
}
argument
hip_multinomial::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::multinomial(ctx.get_stream().get(), args.back(), args.front(), args[1]);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/nonzero.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/nonzero.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_nonzero::compute_shape(std::vector<shape> inputs) const
{
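// only the first input (the data tensor) determines the output shape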
return op.compute_shape({inputs.front()});
}
argument hip_nonzero::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return device::nonzero(ctx.get_stream().get(), args.back(), args.front());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset) ...@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset)
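// round k up to the next multiple of 4, as required by the int8x4 packed layout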
auto pad_k = (k + 3) / 4 * 4; auto pad_k = (k + 3) / 4 * 4;
auto pad_lens = lens; auto pad_lens = lens;
pad_lens[lens.size() + offset] = pad_k; pad_lens[lens.size() + offset] = pad_k;
std::vector<int64_t> pad_dims(lens.size() * 2, 0); auto ret_ins = ins;
auto ret_ins = ins;
if(pad_k != k) if(pad_k != k)
{ {
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
pad_dims[lens.size() + offset] = pad_k - k; pad_dims[lens.size() + offset] = pad_k - k;
shape ps{s.type(), pad_lens}; shape ps{s.type(), pad_lens};
auto ins_out = auto ins_out =
...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in0->inputs().front(); auto t_in = in0->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in1->inputs().front(); auto t_in = in1->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
...@@ -116,7 +118,7 @@ void pack_int8_args::apply(module& m) const ...@@ -116,7 +118,7 @@ void pack_int8_args::apply(module& m) const
assert(val.contains("int8_x4_format")); assert(val.contains("int8_x4_format"));
if(not val.at("int8_x4_format").to<bool>()) if(not val.at("int8_x4_format").to<bool>())
{ {
return; continue;
} }
auto inputs = ins->inputs(); auto inputs = ins->inputs();
auto lens = inputs.at(0)->get_shape().lens(); auto lens = inputs.at(0)->get_shape().lens();
...@@ -154,6 +156,12 @@ void pack_int8_args::apply(module& m) const ...@@ -154,6 +156,12 @@ void pack_int8_args::apply(module& m) const
} }
else if(ins->name() == "gpu::quant_convolution") else if(ins->name() == "gpu::quant_convolution")
{ {
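// skip packing when this convolution does not use the int8x4 format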
auto val = ins->get_operator().to_value();
if(not val.at("int8_x4_format").to<bool>())
{
continue;
}
auto inputs = ins->inputs(); auto inputs = ins->inputs();
auto packed_x = m.insert_instruction( auto packed_x = m.insert_instruction(
ins, ins,
......
#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/match/layernorm.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace {
struct find_layernorm
{
auto matcher() const { return match::layernorm(); }
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
if(not x_ins->get_shape().standard())
x_ins = m.insert_instruction(ins, make_op("contiguous"), x_ins);
auto relements = x_ins->get_shape().lens().back();
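// only fuse small reductions: at most 1024 elements, or 256 when not a multiple of 4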
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
auto a = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
m.replace_instruction(ins, make_op("gpu::layernorm"), x_ins, a);
}
};
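// fuse a chain of two adds feeding a layernorm into a single gpu::triadd_layernorm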
struct find_triaddlayernorm
{
auto matcher() const
{
auto add1 =
match::name("add")(match::none_of(match::is_constant()),
match::args(match::any().bind("z1"), match::any().bind("z2")));
auto add2 = match::name("add")(match::either_arg(0, 1)(add1, match::any().bind("z3")));
return match::layernorm()(match::var("x")(add2));
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["z1"];
auto y_ins = r.instructions["z2"];
auto z_ins = r.instructions["z3"];
for(auto* pins : {&x_ins, &y_ins, &z_ins})
{
if(not(*pins)->get_shape().standard())
*pins = m.insert_instruction(ins, make_op("contiguous"), *pins);
}
auto relements = x_ins->get_shape().lens().back();
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
auto a = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
m.replace_instruction(ins, make_op("gpu::triadd_layernorm"), x_ins, y_ins, z_ins, a);
}
};
} // namespace
void prefuse_ops::apply(module& m) const
{
match::find_matches(m, find_triaddlayernorm{}, find_layernorm{});
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -16,8 +16,8 @@ argument miopen_quant_convolution::compute(context& ctx, ...@@ -16,8 +16,8 @@ argument miopen_quant_convolution::compute(context& ctx,
const shape& output_shape, const shape& output_shape,
const std::vector<argument>& args) const const std::vector<argument>& args) const
{ {
auto x_desc = make_tensor(args[0].get_shape(), true); auto x_desc = make_tensor(args[0].get_shape(), int8_x4_format);
auto w_desc = make_tensor(args[1].get_shape(), true); auto w_desc = make_tensor(args[1].get_shape(), int8_x4_format);
auto y_desc = make_tensor(output_shape); auto y_desc = make_tensor(output_shape);
float alpha = 1; float alpha = 1;
...@@ -49,8 +49,8 @@ shape miopen_quant_convolution::compile(context& ctx, ...@@ -49,8 +49,8 @@ shape miopen_quant_convolution::compile(context& ctx,
std::vector<shape> inputs) std::vector<shape> inputs)
{ {
shape workspace_shape{}; shape workspace_shape{};
auto x_desc = make_tensor(inputs[0], true); auto x_desc = make_tensor(inputs[0], int8_x4_format);
auto w_desc = make_tensor(inputs[1], true); auto w_desc = make_tensor(inputs[1], int8_x4_format);
auto y_desc = make_tensor(output_shape); auto y_desc = make_tensor(output_shape);
std::size_t workspace_size = 0; std::size_t workspace_size = 0;
...@@ -62,8 +62,15 @@ shape miopen_quant_convolution::compile(context& ctx, ...@@ -62,8 +62,15 @@ shape miopen_quant_convolution::compile(context& ctx,
&workspace_size); &workspace_size);
workspace_shape = shape{shape::int8_type, {workspace_size}}; workspace_shape = shape{shape::int8_type, {workspace_size}};
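// pack the input shapes only when the int8x4 format is in use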
auto arg_vec4_x = to_gpu(generate_argument(pack_int8_shape(inputs[0]))); auto x_shape = inputs[0];
auto arg_vec4_w = to_gpu(generate_argument(pack_int8_shape(inputs[1]))); auto w_shape = inputs[1];
if(int8_x4_format)
{
x_shape = pack_int8_shape(x_shape);
w_shape = pack_int8_shape(w_shape);
}
auto arg_vec4_x = to_gpu(generate_argument(x_shape));
auto arg_vec4_w = to_gpu(generate_argument(w_shape));
auto y = allocate_gpu(output_shape); auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape); auto workspace = allocate_gpu(workspace_shape);
......
...@@ -77,28 +77,28 @@ MIGRAPHX_REGISTER_OP(wait_event) ...@@ -77,28 +77,28 @@ MIGRAPHX_REGISTER_OP(wait_event)
MIGRAPHX_REGISTER_OP(set_stream) MIGRAPHX_REGISTER_OP(set_stream)
std::size_t schedule_model::concurrency() const { return streams; } std::size_t schedule_model::concurrency() const { return streams; }
void schedule_model::sched(module& p, instruction_ref ins, std::size_t n) const void schedule_model::sched(module& m, instruction_ref ins, std::size_t n) const
{ {
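// walk backwards from ins to find the most recent set_stream instruction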
auto last_stream = std::find_if(std::make_reverse_iterator(ins), auto last_stream = std::find_if(std::make_reverse_iterator(ins),
std::make_reverse_iterator(p.begin()), std::make_reverse_iterator(m.begin()),
[&](auto&& i) { return i.name() == "gpu::set_stream"; }); [&](auto&& i) { return i.name() == "gpu::set_stream"; });
if(last_stream != std::make_reverse_iterator(p.begin())) if(last_stream != std::make_reverse_iterator(m.begin()))
{ {
auto&& op = any_cast<set_stream>(last_stream->get_operator()); auto&& op = any_cast<set_stream>(last_stream->get_operator());
// If the same stream was set earlier then skip // If the same stream was set earlier then skip
if(op.stream == n) if(op.stream == n)
return; return;
} }
p.insert_instruction(ins, set_stream{n}); m.insert_instruction(ins, set_stream{n});
} }
void schedule_model::wait(module& p, instruction_ref ins, std::size_t wait_id) const void schedule_model::wait(module& m, instruction_ref ins, std::size_t wait_id) const
{ {
p.insert_instruction(ins, wait_event{wait_id}); m.insert_instruction(ins, wait_event{wait_id});
} }
void schedule_model::record(module& p, instruction_ref ins, std::size_t wait_id) const void schedule_model::record(module& m, instruction_ref ins, std::size_t wait_id) const
{ {
p.insert_instruction(std::next(ins), record_event{wait_id}); m.insert_instruction(std::next(ins), record_event{wait_id});
} }
static std::unordered_map<std::string, std::size_t> create_weight_map() static std::unordered_map<std::string, std::size_t> create_weight_map()
......