Commit 7dc6e3ae authored by Khalique Ahmed's avatar Khalique Ahmed
Browse files

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into mi100_opts

parents f94d77fc a275f590
...@@ -2,57 +2,26 @@ ...@@ -2,57 +2,26 @@
#define MIGRAPHX_GUARD_KERNELS_ARGS_HPP #define MIGRAPHX_GUARD_KERNELS_ARGS_HPP
#include <migraphx/kernels/types.hpp> #include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/functional.hpp>
namespace migraphx { namespace migraphx {
template <std::size_t N>
struct arg
{
};
template <std::size_t...>
struct seq
{
using type = seq;
};
template <class, class>
struct merge_seq;
template <std::size_t... Xs, std::size_t... Ys>
struct merge_seq<seq<Xs...>, seq<Ys...>> : seq<Xs..., (sizeof...(Xs) + Ys)...>
{
};
template <std::size_t N>
struct gens : merge_seq<typename gens<N / 2>::type, typename gens<N - N / 2>::type>
{
};
template <>
struct gens<0> : seq<>
{
};
template <>
struct gens<1> : seq<0>
{
};
// Use template specialization since ADL is broken on hcc // Use template specialization since ADL is broken on hcc
template <std::size_t> template <index_int>
struct make_tensor; struct make_tensor;
template <class F, std::size_t... Ns, class... Ts> template <class F, index_int... Ns, class... Ts>
__device__ auto make_tensors_impl(F f, seq<Ns...>, Ts*... xs) __device__ auto make_tensors_impl(F f, detail::seq<Ns...>, Ts*... xs)
{ {
f(make_tensor<Ns>::apply(xs)...); return f(make_tensor<Ns>::apply(xs)...);
} }
template <class... Ts> inline __device__ auto make_tensors()
__device__ auto make_tensors(Ts*... xs)
{ {
return [=](auto f) { make_tensors_impl(f, gens<sizeof...(Ts)>{}, xs...); }; return [](auto*... xs) {
return [=](auto f) { return make_tensors_impl(f, detail::gens<sizeof...(xs)>{}, xs...); };
};
} }
} // namespace migraphx } // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_ARGS_HPP #endif // MIGRAPHX_GUARD_KERNELS_ARGS_HPP
\ No newline at end of file
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP #define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP
#include <migraphx/kernels/types.hpp> #include <migraphx/kernels/types.hpp>
#include <type_traits> #include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx { namespace migraphx {
...@@ -41,8 +42,16 @@ template <class T, index_int N> ...@@ -41,8 +42,16 @@ template <class T, index_int N>
struct array struct array
{ {
T d[N]; T d[N];
constexpr T& operator[](index_int i) { return d[i]; } constexpr T& operator[](index_int i)
constexpr const T& operator[](index_int i) const { return d[i]; } {
MIGRAPHX_ASSERT(i < N);
return d[i];
}
constexpr const T& operator[](index_int i) const
{
MIGRAPHX_ASSERT(i < N);
return d[i];
}
constexpr T& front() { return d[0]; } constexpr T& front() { return d[0]; }
constexpr const T& front() const { return d[0]; } constexpr const T& front() const { return d[0]; }
...@@ -53,7 +62,7 @@ struct array ...@@ -53,7 +62,7 @@ struct array
constexpr T* data() { return d; } constexpr T* data() { return d; }
constexpr const T* data() const { return d; } constexpr const T* data() const { return d; }
constexpr std::integral_constant<index_int, N> size() const { return {}; } constexpr index_constant<N> size() const { return {}; }
constexpr T* begin() { return d; } constexpr T* begin() { return d; }
constexpr const T* begin() const { return d; } constexpr const T* begin() const { return d; }
...@@ -142,6 +151,18 @@ struct array ...@@ -142,6 +151,18 @@ struct array
result[0] += overflow; result[0] += overflow;
return result; return result;
} }
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& ss, const array& a)
{
for(index_int i = 0; i < N; i++)
{
if(i > 0)
ss << ", ";
ss << a[i];
}
return ss;
}
}; };
template <class T, T... xs> template <class T, T... xs>
...@@ -151,6 +172,18 @@ struct integral_const_array : array<T, sizeof...(xs)> ...@@ -151,6 +172,18 @@ struct integral_const_array : array<T, sizeof...(xs)>
MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({xs...}) {} MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({xs...}) {}
}; };
template <class T, T... xs, class F>
constexpr auto transform(integral_const_array<T, xs...>, F f)
{
return integral_const_array<T, f(xs)...>{};
}
template <class T, T... xs, class U, U... ys, class F>
constexpr auto transform(integral_const_array<T, xs...>, integral_const_array<U, ys...>, F f)
{
return integral_const_array<T, f(xs, ys)...>{};
}
template <index_int... Ns> template <index_int... Ns>
using index_ints = integral_const_array<index_int, Ns...>; using index_ints = integral_const_array<index_int, Ns...>;
......
#ifndef MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#define MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#include <hip/hip_runtime.h>
namespace migraphx {
// Prints the failed assertion with its source location and enclosing
// function, then aborts. Callable from both host and device code; printf
// and abort are provided by the HIP runtime on device.
inline __host__ __device__ void
assert_fail(const char* assertion, const char* file, unsigned int line, const char* function)
{
    printf("%s:%u: %s: assertion '%s' failed.\n", file, line, function, assertion);
    abort();
}
#ifdef MIGRAPHX_DEBUG
// Evaluates cond; on failure reports the stringized condition, source
// location and function, then aborts. The immediately-invoked lambda keeps
// the whole macro a single expression.
#define MIGRAPHX_ASSERT(cond) \
    ((cond) ? void(0) : [](auto... xs) { \
        assert_fail(xs...); \
    }(#cond, __FILE__, __LINE__, __PRETTY_FUNCTION__))
#else
// Expand to a no-op *expression* rather than nothing, so the macro stays
// valid in expression contexts (e.g. comma expressions, ternaries) and
// `MIGRAPHX_ASSERT(x);` never leaves a bare `;`.
#define MIGRAPHX_ASSERT(cond) ((void)0)
#endif
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
#define MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
#include <migraphx/kernels/array.hpp>
namespace migraphx {
// A type constructible from any argument list; used to expand a parameter
// pack purely for its side effects (see each_args).
struct swallow
{
    template <class... Ts>
    constexpr swallow(Ts&&...)
    {
    }
};
// ignore<N> accepts and discards an argument; the index parameter exists
// only so a pack expansion can generate one per element (see detail::args_at).
template <index_int>
using ignore = swallow;
namespace detail {
// Wraps the invocation of f(xs...) so that callers can uniformly read a
// `result` member even when f returns void (see by()).
template <class R>
struct eval_helper
{
    R result;
    template <class F, class... Ts>
    constexpr eval_helper(const F& f, Ts&&... xs) : result(f(static_cast<Ts>(xs)...))
    {
    }
};
// Specialization for void-returning f: evaluate for side effects and store
// a dummy int so `result` still exists.
template <>
struct eval_helper<void>
{
    int result;
    template <class F, class... Ts>
    constexpr eval_helper(const F& f, Ts&&... xs) : result((f(static_cast<Ts>(xs)...), 0))
    {
    }
};
// Compile-time integer sequence; a minimal stand-in for std::index_sequence,
// redefined because kernel code avoids host standard-library headers.
template <index_int...>
struct seq
{
    using type = seq;
};
// Concatenates two sequences, offsetting the second by the length of the
// first; this lets gens build 0..N-1 by halving.
template <class, class>
struct merge_seq;
template <index_int... Xs, index_int... Ys>
struct merge_seq<seq<Xs...>, seq<Ys...>> : seq<Xs..., (sizeof...(Xs) + Ys)...>
{
};
// gens<N> yields seq<0, 1, ..., N-1> with O(log N) instantiation depth.
template <index_int N>
struct gens : merge_seq<typename gens<N / 2>::type, typename gens<N - N / 2>::type>
{
};
template <>
struct gens<0> : seq<>
{
};
template <>
struct gens<1> : seq<0>
{
};
// Invokes f with index_constant<0>...index_constant<N-1> as arguments.
template <class F, index_int... Ns>
constexpr auto sequence_c_impl(F&& f, seq<Ns...>)
{
    return f(index_constant<Ns>{}...);
}
// Returns a callable that selects the argument at position sizeof...(N):
// the leading ignore<N>... parameters swallow the preceding arguments.
template <index_int... N>
constexpr auto args_at(seq<N...>)
{
    return [](ignore<N>..., auto x, auto...) { return x; };
}
} // namespace detail
// always(x) produces a function object that ignores any arguments passed to
// it and returns the captured value x unchanged.
template <class T>
constexpr auto always(T x)
{
    auto constant_fn = [x](auto&&...) { return x; };
    return constant_fn;
}
// Calls f with index_constant<0>..index_constant<N-1>; a compile-time
// "loop" over N positions.
template <index_int N, class F>
constexpr auto sequence_c(F&& f)
{
    return detail::sequence_c_impl(f, detail::gens<N>{});
}
// Same as sequence_c, taking the count as an integral-constant value.
template <class IntegerConstant, class F>
constexpr auto sequence(IntegerConstant ic, F&& f)
{
    return sequence_c<ic>(f);
}
// by(f, g) returns a callable that applies f to each argument and then g to
// the transformed pack; eval_helper lets this compile when g returns void.
template <class F, class G>
constexpr auto by(F f, G g)
{
    return [=](auto... xs) {
        return detail::eval_helper<decltype(g(f(xs)...))>{g, f(xs)...}.result;
    };
}
// by(f) applies f to each argument purely for its side effects.
template <class F>
constexpr auto by(F f)
{
    return by([=](auto x) { return (f(x), 0); }, always(0));
}
// Applies f to each argument in turn (left to right), discarding results.
// Forwarding is done with static_cast<Ts&&> instead of std::forward to stay
// consistent with MIGRAPHX_LIFT below and to avoid relying on <utility>,
// which this kernel header never includes.
template <class F, class... Ts>
constexpr void each_args(F f, Ts&&... xs)
{
    swallow{(f(static_cast<Ts&&>(xs)), 0)...};
}
// Base case: no arguments, nothing to do.
template <class F>
constexpr void each_args(F)
{
}
// Captures xs by value and returns a callable that passes them on to a
// continuation: pack(xs...)(f) evaluates f(xs...).
// Marked constexpr for consistency with every other combinator in this file.
template <class... Ts>
constexpr auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}
// arg_c<N>() returns a callable selecting its N-th argument (0-based).
template <index_int N>
constexpr auto arg_c()
{
    return [](auto... xs) { return detail::args_at(detail::gens<N>{})(xs...); };
}
// Same as arg_c, taking the index as an integral-constant value.
template <class IntegralConstant>
constexpr auto arg(IntegralConstant ic)
{
    return arg_c<ic>();
}
// Returns a callable that rotates its argument list right by one — the last
// argument becomes first — before handing the pack to a continuation:
// rotate_last()(x1, ..., xn)(f) -> f(xn, x1, ..., xn-1).
// Used by pointwise, where the output buffer is passed last but consumed first.
inline constexpr auto rotate_last()
{
    return [](auto... xs) {
        return [=](auto&& f) {
            return sequence_c<sizeof...(xs)>([&](auto... is) {
                constexpr auto size = sizeof...(is);
                // Position i takes argument (i + size - 1) % size.
                return f(arg_c<(is + size - 1) % size>()(xs...)...);
            });
        };
    };
}
// Composes argument transformations: transform_args(f)(xs...)(g) applies the
// transformation f to the pack and forwards the transformed pack to g.
template <class F>
constexpr auto transform_args(F f)
{
    return [=](auto... xs) {
        return [=](auto g) { return f(xs...)([&](auto... ys) { return g(ys...); }); };
    };
}
// Chains several transformations, applied left to right.
template <class F, class... Fs>
constexpr auto transform_args(F f, Fs... fs)
{
    return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
}
// Wraps an overload set or function template in a lambda so it can be
// passed around as an ordinary value.
#define MIGRAPHX_LIFT(...) \
    ([](auto&&... xs) { return (__VA_ARGS__)(static_cast<decltype(xs)>(xs)...); })
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
...@@ -12,9 +12,43 @@ struct index ...@@ -12,9 +12,43 @@ struct index
index_int local = 0; index_int local = 0;
index_int group = 0; index_int group = 0;
__device__ index_int nglobal() const { return blockDim.x * gridDim.x; } // NOLINT __device__ index_int nglobal() const
{
#ifdef MIGRAPHX_NGLOBAL
return MIGRAPHX_NGLOBAL;
#else
return blockDim.x * gridDim.x;
#endif
}
__device__ index_int nlocal() const { return blockDim.x; } // NOLINT __device__ index_int nlocal() const
{
#ifdef MIGRAPHX_NLOCAL
return MIGRAPHX_NLOCAL;
#else
return blockDim.x;
#endif
}
template <class F>
__device__ void global_stride(index_int n, F f) const
{
const auto stride = nglobal();
for(index_int i = global; i < n; i += stride)
{
f(i);
}
}
template <class F>
__device__ void local_stride(index_int n, F f) const
{
const auto stride = nlocal();
for(index_int i = local; i < n; i += stride)
{
f(i);
}
}
}; };
inline __device__ index make_index() inline __device__ index make_index()
......
#ifndef MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
#define MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
#include <migraphx/kernels/types.hpp>
namespace migraphx {
// Compile-time constant of type T with value v; mirrors
// std::integral_constant, redefined here because kernel code avoids host
// standard-library headers.
template <class T, T v>
struct integral_constant
{
    static constexpr T value = v;
    using value_type         = T;
    using type               = integral_constant;
    // Implicit conversion lets the constant be used wherever a T is expected.
    constexpr operator value_type() const noexcept { return value; }
    constexpr value_type operator()() const noexcept { return value; }
};
// Defines a binary operator on integral constants: the result is another
// integral constant whose value and type are computed at compile time
// (the operands carry no runtime state, so the body just returns {}).
#define MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(op)                                \
    template <class T, T v, class U, U w>                                       \
    constexpr inline integral_constant<decltype(v op w), (v op w)> operator op( \
        integral_constant<T, v>, integral_constant<U, w>) noexcept              \
    {                                                                           \
        return {};                                                              \
    }
// Same for unary operators.
#define MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(op)                                 \
    template <class T, T v>                                                     \
    constexpr inline integral_constant<decltype(op v), (op v)> operator op(     \
        integral_constant<T, v>) noexcept                                       \
    {                                                                           \
        return {};                                                              \
    }
// Instantiate the operators for the arithmetic, shift, comparison, bitwise
// and logical operator sets.
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(+)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(-)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(*)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(/)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(%)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>>)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<<)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<=)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>=)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(==)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(!=)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(&)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP (^)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(|)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(&&)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(||)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(!)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(~)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(+)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(-)
// Convenience aliases mirroring the standard library's bool_constant family.
template <bool B>
using bool_constant = integral_constant<bool, B>;
using true_type     = bool_constant<true>;
using false_type    = bool_constant<false>;
// Constant holding an index value (index_int is the kernel indexing type).
template <index_int N>
using index_constant = integral_constant<index_int, N>;
// Variable template shorthand: _c<5> is an integral_constant with value 5.
template <auto v>
static constexpr auto _c = integral_constant<decltype(v), v>{};
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#define MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/preload.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <migraphx/kernels/args.hpp>
namespace migraphx {
// Applies f element-wise across the input tensors xs and writes the result
// into out. Inputs may first be staged into shared memory by preload; each
// workitem handles elements global, global + nglobal, ... via global_stride.
template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
    preload<typename T::type>(idx, xs...)([&](auto... ps) {
        idx.global_stride(out.get_shape().elements(), [&](auto i) {
            // Convert the linear element index to a multi-dimensional index
            // shared by the output and all (broadcast-aware) inputs.
            auto multi_idx = out.get_shape().multi(i);
            out[multi_idx] = f(ps[multi_idx]...);
        });
    });
}
// Kernel entry helper for pointwise operations: wraps the raw pointers in
// tensor views (make_tensors), rotates the last argument (the output) to the
// front, applies automatic vectorization, then runs pointwise_tensor.
template <class F, class... Ts>
__device__ void pointwise(F f, Ts*... ps)
{
    auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
    t(ps...)([&](auto... xs) {
        auto idx = make_index();
        pointwise_tensor(idx, f, xs...);
    });
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#define MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
namespace migraphx {
// Visits each tensor/shape and decides whether it is worth copying into the
// shared-memory buffer. For every argument x it calls f(x, offset, copy)
// where offset is the running element offset into the buffer and copy is
// true_type only for broadcasted tensors whose staging saves enough reads.
// T is the buffer element type used by callers for sizing; it is not
// referenced in this function body.
template <class T, class... Shapes>
constexpr auto traverse_preload(Shapes... ss)
{
    return [=](auto f, auto... g) {
        index_int offset = 0;
        auto each        = [&](auto x) {
            constexpr auto s    = decltype(x.get_shape()){};
            constexpr auto size = _c<s.element_space()>;
            // Non-broadcast tensors are read once anyway; never stage them.
            if constexpr(not s.broadcasted())
                return f(x, offset, false_type{});
            // Staging saves (elements - element_space) reads; skip tiny wins.
            else if constexpr((s.elements() - size) < 64)
                return f(x, offset, false_type{});
            else
            {
                auto pre_offset = offset;
                offset += size;
                // NOTE(review): presumably meant to round offset up to a
                // multiple of 4, but `offset += offset % 4` only achieves
                // that when the remainder is 0 or 2 — verify the intent.
                offset += offset % 4;
                return f(x, pre_offset, true_type{});
            }
        };
        return by(each, g...)(ss...);
    };
}
// Computes the shared-buffer size (in elements of T) by replaying the
// traversal: the last visited shape's offset + element_space wins.
// NOTE(review): shapes that are not staged also update `size`, so this can
// overestimate when the final shape is not copied — confirm against callers.
template <class T, class... Shapes>
constexpr index_int compute_preload_size(Shapes...)
{
    index_int size = 0;
    traverse_preload<T>(Shapes{}...)(
        [&](auto s, auto offset, auto) { size = offset + s.element_space(); });
    return size;
}
// Cooperatively copies each stage-worthy tensor into the shared buffer
// (workitems stride with local_stride), then invokes f with the tensor pack
// where staged tensors are redirected (via with()) into shared memory.
// The __syncthreads in invoke publishes the copies before f runs.
template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
    auto invoke = [&](auto... ys) {
        __syncthreads();
        f(ys...);
    };
    traverse_preload<T>(xs...)(
        [&](auto x, auto offset, auto copy) {
            if constexpr(copy)
            {
                // Use vectorized loads/stores for the copy when the shape
                // allows it.
                auto v = vectorize(x);
                auto b = as_vec(tensor_vec_size(v), buffer + offset);
                idx.local_stride(v.get_shape().element_space(),
                                 [&](auto i) { b[i] = v.data()[i]; });
                return x.with(buffer + offset);
            }
            else
            {
                return x;
            }
        },
        invoke);
}
// Maps a vector type vec<T, N> back to its scalar element type T; leaves
// non-vector types unchanged.
template <class T>
struct remove_vec
{
    using type = T;
};
template <class T, index_int N>
struct remove_vec<vec<T, N>>
{
    using type = T;
};
// Stages suitable inputs into shared memory and passes the (possibly
// redirected) tensors to the continuation f; falls back to the original
// tensors when nothing qualifies or the buffer would be too large.
template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
    using type          = typename remove_vec<T>::type;
    constexpr auto size = compute_preload_size<type>(xs.get_shape()...);
    // NOTE(review): `size` counts elements while max_size is
    // 512 * sizeof(type) (a byte-like quantity) — confirm the intended
    // shared-memory budget for this comparison.
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)
        {
            __shared__ type buffer[size];
            preload_copy(idx, f, buffer, xs...);
        }
        else
        {
            f(xs...);
        }
    };
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#define MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#include <hip/hip_runtime.h>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/algorithm.hpp>
namespace migraphx {
// Callable wrapper that forwards invocations to f and, on destruction, runs
// f(g); make_printer uses it to emit a trailing action (e.g. a newline)
// after a whole chain of << calls has completed.
template <class F, class G>
struct on_exit
{
    F f;
    G g;
    template <class T>
    __host__ __device__ auto operator()(T x) const
    {
        return f(x);
    }
    __host__ __device__ ~on_exit() { f(g); }
};
// Extracts the name of the template argument from __PRETTY_FUNCTION__ at
// compile time and returns a callable printing it to a printer stream. The
// deliberately unique template-parameter name is the substring searched for
// inside the function signature text.
template <class PrivateMIGraphXTypeNameProbe>
constexpr auto print_type_name_probe()
{
    constexpr auto name                = __PRETTY_FUNCTION__;
    constexpr auto size                = sizeof(__PRETTY_FUNCTION__);
    constexpr auto parameter_name      = "PrivateMIGraphXTypeNameProbe = ";
    constexpr auto parameter_name_size = sizeof("PrivateMIGraphXTypeNameProbe = ") - 1;
    constexpr auto begin =
        search(name, name + size, parameter_name, parameter_name + parameter_name_size);
    static_assert(begin < name + size, "Type probe not found.");
    constexpr auto start = begin + parameter_name_size;
    // The type name is terminated by ']' or ';' depending on the compiler.
    constexpr auto last = find_if(start, name + size, [](auto c) { return c == ']' or c == ';'; });
    return [=](const auto& s) { s.print_string(start, last - start); };
}
// Streamable tag: inserting a type_printer<T> into a printer prints T's name.
template <class T>
struct type_printer
{
    template <class Stream>
    friend constexpr const Stream& operator<<(const Stream& s, type_printer)
    {
        print_type_name_probe<T>()(s);
        return s;
    }
};
// type_of<T>() / type_of(x): printable name of a type, or of a value's type.
template <class T>
constexpr type_printer<T> type_of()
{
    return {};
}
template <class T>
constexpr type_printer<T> type_of(T)
{
    return {};
}
// sub_type_of: printable name of the nested T::type (traits and wrappers).
template <class T>
constexpr type_printer<typename T::type> sub_type_of()
{
    return {};
}
template <class T>
constexpr type_printer<typename T::type> sub_type_of(T)
{
    return {};
}
// Minimal iostream-like printer for device code: each print_* method queues
// a printf through the policy callable f (which decides when/whether it
// actually runs) and returns *this so calls chain like a stream. The
// operator<< overloads below route every builtin type to the matching
// print_* primitive.
template <class F>
struct basic_printer
{
    F f;
    __host__ __device__ const basic_printer& print_long(long value) const
    {
        f([&] { printf("%li", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_ulong(unsigned long value) const
    {
        f([&] { printf("%lu", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_char(char value) const
    {
        f([&] { printf("%c", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_string(const char* value) const
    {
        f([&] { printf("%s", value); });
        return *this;
    }
    // Prints exactly `size` characters; used for non-null-terminated spans
    // such as the slice produced by print_type_name_probe.
    __host__ __device__ const basic_printer& print_string(const char* value, int size) const
    {
        f([&] { printf("%.*s", size, value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_double(double value) const
    {
        f([&] { printf("%f", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_bool(bool value) const
    {
        f([&] {
            if(value)
                printf("true");
            else
                printf("false");
        });
        return *this;
    }
    // Integral types are widened to long/unsigned long before printing.
    __host__ __device__ const basic_printer& operator<<(short value) const
    {
        return print_long(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned short value) const
    {
        return print_ulong(value);
    }
    __host__ __device__ const basic_printer& operator<<(int value) const
    {
        return print_long(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned int value) const
    {
        return print_ulong(value);
    }
    __host__ __device__ const basic_printer& operator<<(long value) const
    {
        return print_long(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned long value) const
    {
        return print_ulong(value);
    }
    // Floating-point values are printed through the double overload.
    __host__ __device__ const basic_printer& operator<<(float value) const
    {
        return print_double(value);
    }
    __host__ __device__ const basic_printer& operator<<(double value) const
    {
        return print_double(value);
    }
    __host__ __device__ const basic_printer& operator<<(bool value) const
    {
        return print_bool(value);
    }
    __host__ __device__ const basic_printer& operator<<(char value) const
    {
        return print_char(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned char value) const
    {
        return print_char(value);
    }
    __host__ __device__ const basic_printer& operator<<(const char* value) const
    {
        return print_string(value);
    }
};
// Builds a printer whose policy callable f controls execution of each
// queued print action.
template <class F>
constexpr basic_printer<F> make_printer(F f)
{
    return {f};
}
// Variant that additionally runs g when the printer is destroyed (used to
// append a newline after the full statement has printed).
template <class F, class G>
constexpr basic_printer<on_exit<F, G>> make_printer(F f, G g)
{
    return {{f, g}};
}
// Printer that executes every print action immediately.
inline __device__ auto cout()
{
    return make_printer([](auto f) { f(); });
}
// Like cout, but prints a newline when the returned printer is destroyed.
inline __device__ auto coutln()
{
    return make_printer([](auto f) { f(); }, [] { printf("\n"); });
}
// Streams each argument into a fresh printer produced by the factory f.
template <class F, class... Ts>
__device__ void print_each(F f, Ts... xs)
{
    each_args([&](auto x) { f() << x; }, xs...);
}
// Same, but only the first global workitem prints, avoiding duplicates.
template <class F, class... Ts>
__device__ void print_each_once(F f, Ts... xs)
{
    auto idx = make_index();
    if(idx.global == 0)
        print_each(f, xs...);
}
// print/println print all arguments (println appends a newline per printer);
// the *_once variants print from a single workitem only.
template <class... Ts>
__device__ void print(Ts... xs)
{
    print_each(&cout, xs...);
}
template <class... Ts>
__device__ void print_once(Ts... xs)
{
    print_each_once(&cout, xs...);
}
template <class... Ts>
__device__ void println(Ts... xs)
{
    print_each(&coutln, xs...);
}
template <class... Ts>
__device__ void println_once(Ts... xs)
{
    print_each_once(&coutln, xs...);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRINT_HPP
...@@ -19,7 +19,7 @@ struct shape ...@@ -19,7 +19,7 @@ struct shape
constexpr index_int elements() const { return lens.product(); } constexpr index_int elements() const { return lens.product(); }
constexpr index_int element_space() const { return strides.dot(lens - 1); } constexpr index_int element_space() const { return strides.dot(lens - 1) + 1; }
constexpr bool packed() const { return elements() == element_space(); } constexpr bool packed() const { return elements() == element_space(); }
constexpr bool broadcasted() const { return strides.product() == 0; } constexpr bool broadcasted() const { return strides.product() == 0; }
...@@ -92,6 +92,15 @@ struct shape ...@@ -92,6 +92,15 @@ struct shape
result[0] = tidx; result[0] = tidx;
return result; return result;
} }
constexpr shape get_shape() const { return *this; }
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& ss, const shape& s)
{
ss << "{" << s.lens << "}, {" << s.strides << "}";
return ss;
}
}; };
template <class Lens, class Strides> template <class Lens, class Strides>
......
...@@ -2,18 +2,22 @@ ...@@ -2,18 +2,22 @@
#define MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP #define MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP
#include <migraphx/kernels/shape.hpp> #include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx { namespace migraphx {
template <class T, class Shape> template <class T, class Shape>
struct tensor_view struct tensor_view
{ {
using type = T;
constexpr Shape get_shape() const { return Shape{}; } constexpr Shape get_shape() const { return Shape{}; }
constexpr index_int size() const { return get_shape().elements(); } constexpr index_int size() const { return get_shape().elements(); }
template <class U> template <class U>
constexpr T& operator[](U i) const constexpr T& operator[](U i) const
{ {
MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space());
return x[get_shape().index(i)]; return x[get_shape().index(i)];
} }
...@@ -22,6 +26,13 @@ struct tensor_view ...@@ -22,6 +26,13 @@ struct tensor_view
constexpr T* begin() const { return data(); } constexpr T* begin() const { return data(); }
constexpr T* end() const { return data() + size(); } constexpr T* end() const { return data() + size(); }
template <class U>
constexpr tensor_view<U, Shape> with(U* y) const
{
static_assert(sizeof(T) == sizeof(U), "Not the same size");
return {y};
}
T* x; T* x;
}; };
......
...@@ -9,6 +9,9 @@ using index_int = std::uint32_t; ...@@ -9,6 +9,9 @@ using index_int = std::uint32_t;
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT #define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));
} // namespace migraphx } // namespace migraphx
#endif #endif
#ifndef MIGRAPHX_GUARD_KERNELS_VEC_HPP
#define MIGRAPHX_GUARD_KERNELS_VEC_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
// vec_size(x): lane count of a vector value as an integral constant; the
// variadic overload is the deliberately worst-ranked fallback that reports 0
// for anything that is not a vec<T, N>.
template <class T, index_int N>
constexpr auto vec_size(vec<T, N>)
{
    return index_constant<N>{};
}
template <class T>
constexpr auto vec_size(T, ...)
{
    return index_constant<0>{};
}
// Type-level query: vec_size<T>() without needing a value of T.
template <class T>
constexpr auto vec_size()
{
    return decltype(vec_size(T{})){};
}
// Reinterprets a scalar pointer as a pointer to N-lane vectors; N == 0 means
// "not vectorized" and returns the pointer unchanged.
template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
    if constexpr(N == 0)
        return x;
    else
        return reinterpret_cast<vec<T, N>*>(x);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#define MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/vec.hpp>
namespace migraphx {
// Lane count of a tensor's element type (0 when the element is scalar).
template <class T>
constexpr auto tensor_vec_size(T)
{
    return vec_size<typename T::type>();
}
// Rewrites a shape for N-lane vectorized access: the unit-stride dimension's
// length shrinks by N, unit strides are kept (the fast axis stays
// contiguous), and every other stride is divided by N.
template <index_int N, class Shape>
constexpr auto as_vec_shape(Shape s)
{
    auto lens = transform(s.lens, s.strides, [](auto len, auto stride) {
        if(stride == 1)
            return len / N;
        else
            return len;
    });
    auto strides = transform(s.strides, [](auto stride) {
        if(stride == 1)
            return stride;
        return stride / N;
    });
    // The vectorized view must cover exactly the same storage.
    MIGRAPHX_ASSERT(make_shape(lens, strides).element_space() * N == s.element_space());
    return make_shape(lens, strides);
}
// Vectorized view of a tensor: both the data pointer and the shape are
// reinterpreted for N lanes; N == 0 returns the tensor unchanged.
template <index_int N, class T>
__device__ __host__ auto as_vec(T x)
{
    if constexpr(N == 0)
        return x;
    else
        return make_tensor_view(as_vec<N>(x.data()), as_vec_shape<N>(x.get_shape()));
}
// For a tensor broadcast along Axis (stride 0 there), shrink that axis's
// length by N so it iterates in lockstep with tensors vectorized along the
// same axis; the underlying data is not reinterpreted.
template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis)
{
    if constexpr(N == 0)
    {
        return x;
    }
    else
    {
        constexpr auto s = decltype(x.get_shape()){};
        // Only valid on a broadcast axis.
        MIGRAPHX_ASSERT(s.strides[Axis{}] == 0);
        return sequence(x.get_shape().lens.size(), [&](auto... is) {
            // Divide only the Axis dimension's length by N.
            auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
                constexpr auto axis = Axis{};
                if(j == axis)
                    return i / N;
                else
                    return i;
            });
            return make_tensor_view(x.data(), make_shape(lens, s.strides));
        });
    }
}
// Overload taking the lane count as an integral-constant value.
template <class IntegralConstant, class T>
__device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
{
    return as_vec<ic>(x);
}
// Returns the axis with stride 1 of the first non-broadcasted shape, or 0
// when no shape has a unit-stride axis.
template <class... Shapes>
constexpr index_int find_vector_axis(Shapes... ss)
{
    index_int axis = 0;
    bool b         = false; // set once the axis has been found
    by([&](auto s) {
        if(s.broadcasted() or b)
            return;
        auto it = find(s.strides.begin(), s.strides.end(), 1);
        if(it == s.strides.end())
            return;
        axis = it - s.strides.begin();
        b    = true;
    })(ss...);
    return axis;
}
// The shapes can be vectorized by N along axis when, for every shape, N
// divides the length on that axis and the axis is either contiguous
// (stride 1) or broadcast (stride 0).
template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis axis, Shapes... ss)
{
    return (((ss.lens[axis] % N) == 0 and (ss.strides[axis] == 1 or ss.strides[axis] == 0)) and
            ...);
}
// Checks each shape for N-lane vectorization along its own unit-stride axis.
template <index_int N, class... Shapes>
constexpr bool is_vectorizable(Shapes... ss)
{
    // Fix: the axis must be the first argument to match the
    // is_vectorizable(Axis axis, Shapes... ss) overload above; previously the
    // shape and axis were passed in the reverse order.
    return (is_vectorizable<N>(find_vector_axis(ss), ss) and ...);
}
// Picks the widest supported vector width accepted by pred, trying 4 then 2;
// returns 0 (no vectorization) when neither is usable.
template <class P>
constexpr auto find_vectorize_size(P pred)
{
    if constexpr(pred(_c<4>))
        return _c<4>;
    else if constexpr(pred(_c<2>))
        return _c<2>;
    else
        return _c<0>;
}
// Returns a vectorized view of x when its element type is scalar and the
// shape permits it; tensors that are already vectorized pass through.
template <class T>
__host__ __device__ auto vectorize(T x)
{
    if constexpr(vec_size<T>() == 0)
    {
        constexpr auto n =
            find_vectorize_size([&](auto i) { return _c<is_vectorizable<i>(x.get_shape())>; });
        return as_vec<n>(x);
    }
    else
    {
        return x;
    }
}
// Argument transformation (see transform_args) that vectorizes every tensor
// along a common axis when all shapes are packed or broadcasted; broadcast
// tensors are stepped (tensor_step) rather than reinterpreted. Falls back
// to passing the tensors through unchanged otherwise.
inline __device__ __host__ auto auto_vectorize()
{
    return [](auto... xs) {
        return [=](auto f) {
            // TODO: Just check there a single axis of 1
            constexpr bool packed_or_broadcasted =
                ((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
            if constexpr(packed_or_broadcasted)
            {
                constexpr auto axis = find_vector_axis(xs.get_shape()...);
                constexpr auto n    = find_vectorize_size(
                    [&](auto i) { return _c<is_vectorizable<i>(axis, xs.get_shape()...)>; });
                by(
                    [&](auto x) {
                        constexpr auto s = x.get_shape();
                        // Broadcast axis: shrink lengths; otherwise make a
                        // true vectorized view.
                        if constexpr(s.strides[axis] == 0)
                            return tensor_step<n>(x, axis);
                        else
                            return as_vec<n>(x);
                    },
                    f)(xs...);
            }
            else
            {
                f(xs...);
            }
        };
    };
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#include <migraphx/run_loop.hpp>
#include <migraphx/gpu/loop.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/fill.hpp>
#include <unordered_map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// The gpu loop instruction carries extra arguments (one allocated buffer per
// input, plus the condition and output buffers appended by add_loop_op), so
// only the first (n - 2) / 2 inputs correspond to the reference operator.
// Trim the rest before delegating shape computation.
shape hip_loop::compute_shape(std::vector<shape> inputs, std::vector<module_ref> mods) const
{
    const auto n_ref_inputs = (inputs.size() - 2) / 2;
    inputs.resize(n_ref_inputs);
    return op.compute_shape(inputs, std::move(mods));
}
// Host-side callback object passed to run_loop to drive the loop operator
// on the GPU.
struct gpu_loop
{
    int64_t max_iterations = 0;
    // Copies the device argument src into the host object dst (e.g. the
    // per-iteration condition value).
    template <class T>
    void copy(context& ctx, const argument& src, T& dst) const
    {
        argument arg_dst{src.get_shape(), &dst};
        copy_from_gpu(ctx, src, arg_dst);
    }
    // Copies the host value src into the device argument dst.
    template <class T>
    void copy(context& ctx, T src, const argument& dst) const
    {
        argument arg_src{dst.get_shape(), &src};
        copy_to_gpu(ctx, arg_src, dst);
    }
    // Outputs are written in place on the GPU; nothing to append per iteration.
    void append(const std::vector<argument>&, const std::vector<argument>&, int) const {}
    // Zero-fills the tail slices of each concatenated output corresponding to
    // iterations that never ran (the loop exited after `iter` steps).
    void set_zero(context& ctx, const std::vector<argument>& concatenated_outputs, int iter) const
    {
        if(iter >= max_iterations)
            return;
        auto elem_num = max_iterations - iter;
        for(const auto& out : concatenated_outputs)
        {
            auto s    = out.get_shape();
            auto size = s.bytes() / max_iterations; // bytes per iteration slice
            auto lens = s.lens();
            lens[0]   = elem_num;
            shape ss{s.type(), lens};
            assert(ss.bytes() + iter * size <= out.get_shape().bytes());
            device::fill(ctx.get_stream().get(), argument(ss, out.data() + iter * size), 0);
        }
    }
    // Maps parameter names containing "#output_N" to the output index N;
    // parameters without that marker are skipped.
    std::unordered_map<std::string, int> get_output_params(const module& m) const
    {
        auto get_output_index = [](const std::string& name) {
            std::string out_prefix = "#output_";
            auto loc               = name.find(out_prefix);
            if(loc != std::string::npos)
            {
                int index = std::stoi(name.substr(loc + out_prefix.size()));
                return index;
            }
            return -1;
        };
        const auto& param_names = m.get_parameter_names();
        std::unordered_map<std::string, int> result;
        for(const auto& name : param_names)
        {
            auto index = get_output_index(name);
            if(index == -1)
                continue;
            result[name] = index;
        }
        return result;
    }
};
// Executes the loop on the GPU by delegating to the shared run_loop driver
// with gpu_loop supplying the device-specific copy/zero callbacks.
argument
hip_loop::compute(context& ctx,
                  const shape&,
                  const std::vector<argument>& args,
                  const std::vector<module_ref>& mods,
                  const std::function<std::vector<argument>(
                      module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
    return run_loop(gpu_loop{op.max_iterations}, ctx, args, mods, run);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <iterator>
#include <migraphx/gpu/lowering.hpp> #include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp> #include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp> #include <migraphx/instruction.hpp>
...@@ -37,6 +38,7 @@ ...@@ -37,6 +38,7 @@
#include <migraphx/gpu/quant_convolution.hpp> #include <migraphx/gpu/quant_convolution.hpp>
#include <migraphx/gpu/rocblas.hpp> #include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/unary_not.hpp> #include <migraphx/gpu/unary_not.hpp>
#include <migraphx/gpu/where.hpp>
#include <migraphx/iterator_for.hpp> #include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp> #include <migraphx/program.hpp>
#include <utility> #include <utility>
...@@ -149,6 +151,7 @@ struct miopen_apply ...@@ -149,6 +151,7 @@ struct miopen_apply
add_generic_op("sub"); add_generic_op("sub");
add_generic_op("tan"); add_generic_op("tan");
add_generic_op("tanh"); add_generic_op("tanh");
add_generic_op("where");
add_extend_op("abs"); add_extend_op("abs");
add_extend_op("argmax"); add_extend_op("argmax");
...@@ -175,6 +178,7 @@ struct miopen_apply ...@@ -175,6 +178,7 @@ struct miopen_apply
add_extend_op("rnn_var_sl_shift_sequence"); add_extend_op("rnn_var_sl_shift_sequence");
add_extend_op("scatter"); add_extend_op("scatter");
add_extend_op("softmax"); add_extend_op("softmax");
add_extend_op("topk");
add_gemm_op<op::dot>("dot"); add_gemm_op<op::dot>("dot");
add_gemm_op<op::quant_dot>("quant_dot"); add_gemm_op<op::quant_dot>("quant_dot");
...@@ -184,6 +188,7 @@ struct miopen_apply ...@@ -184,6 +188,7 @@ struct miopen_apply
add_batch_norm_inference_op(); add_batch_norm_inference_op();
add_neg_op(); add_neg_op();
add_if_op(); add_if_op();
add_loop_op();
} }
void copy_params() void copy_params()
...@@ -196,6 +201,10 @@ struct miopen_apply ...@@ -196,6 +201,10 @@ struct miopen_apply
if(ins->name() != "@param") if(ins->name() != "@param")
continue; continue;
// parameter no outputs, no need to insert copy to gpu
if(ins->outputs().empty())
continue;
auto pos = std::next(ins); auto pos = std::next(ins);
auto a = insert_allocation(pos, ins->get_shape()); auto a = insert_allocation(pos, ins->get_shape());
auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a); auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
...@@ -422,7 +431,7 @@ struct miopen_apply ...@@ -422,7 +431,7 @@ struct miopen_apply
}); });
} }
// replace the if operator with gpu_if operator // add input and output argument for the if operator
void add_if_op() void add_if_op()
{ {
apply_map.emplace("if", [=](instruction_ref ins) { apply_map.emplace("if", [=](instruction_ref ins) {
...@@ -461,9 +470,48 @@ struct miopen_apply ...@@ -461,9 +470,48 @@ struct miopen_apply
return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args); return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args);
}); });
} }
// replace the loop operator with gpu_loop operator
void add_loop_op()
{
    apply_map.emplace("loop", [=](instruction_ref ins) {
        std::vector<instruction_ref> inputs = ins->inputs();
        // copy max_iter from gpu to cpu
        auto cpu_max_iter =
            mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0));
        auto cpu_cond =
            mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1));
        // sync so the host-visible values are ready before the loop runs
        auto synced_max_iter =
            mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond);
        inputs.at(0) = synced_max_iter;
        inputs.at(1) = cpu_cond;
        // append one device allocation matching each input's shape
        auto copy_inputs = inputs;
        std::transform(
            copy_inputs.begin(), copy_inputs.end(), std::back_inserter(inputs), [&](auto in) {
                return mod->insert_instruction(
                    ins, make_op("hip::allocate", {{"shape", to_value(in->get_shape())}}));
            });
        auto mod_args = ins->module_inputs();
        auto output   = insert_allocation(ins, ins->get_shape());
        // buffer sized for the sub-module's first output (the condition)
        const auto* sub_mod = mod_args.front();
        auto cond_out       = mod->insert_instruction(
            ins,
            make_op("hip::allocate",
                    {{"shape", to_value(sub_mod->get_output_shapes().front())}}));
        // add cond and mod outputs to the argument list
        inputs.push_back(cond_out);
        inputs.push_back(output);
        return mod->replace_instruction(
            ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
    });
}
}; };
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); } void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in0->inputs().front(); auto t_in = in0->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in1->inputs().front(); auto t_in = in1->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
......
File mode changed from 100644 to 100755
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <migraphx/rewrite_rnn.hpp> #include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp> #include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp> #include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp> #include <migraphx/simplify_reshapes.hpp>
#include <migraphx/gpu/allocation_model.hpp> #include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp> #include <migraphx/gpu/concat_gpu_opt.hpp>
...@@ -60,6 +61,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti ...@@ -60,6 +61,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
normalize_ops{}, normalize_ops{},
decompose{}, decompose{},
dead_code_elimination{}, dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{}, rewrite_quantization{},
dead_code_elimination{}, dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type}, eliminate_data_type{unsupported_types, shape::type_t::float_type},
...@@ -91,10 +93,10 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti ...@@ -91,10 +93,10 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{}, dead_code_elimination{},
eliminate_concat{concat_gpu_optimization{}}, eliminate_concat{concat_gpu_optimization{}},
dead_code_elimination{}, dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
pack_int8_args{}, pack_int8_args{},
dead_code_elimination{}, dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math}, fuse_ops{&ctx, options.fast_math},
dead_code_elimination{}, dead_code_elimination{},
write_literals{&ctx}, write_literals{&ctx},
......
#include <migraphx/gpu/topk.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/topk.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Compute the output shape for topk: only the data input (the first
// argument) participates; delegate to the operator's own normalization.
shape hip_topk::compute_shape(std::vector<shape> inputs) const
{
    std::vector<shape> data_only = {inputs.front()};
    return op.normalize_compute_shape(data_only);
}
// Run the topk device kernel. The last element of `args` is the
// pre-allocated output, a tuple whose sub-objects are the selected
// values (front) and their indices (back); `args[0]` is the data input.
argument hip_topk::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    auto outputs = args.back().get_sub_objects();
    auto stream  = ctx.get_stream().get();
    auto& values  = outputs.front();
    auto& indices = outputs.back();
    // Dispatch to the largest- or smallest-k kernel based on the op attribute.
    if(op.largest)
        return device::topk_largest(stream, values, indices, args[0], op.k, op.axis);
    return device::topk_smallest(stream, values, indices, args[0], op.k, op.axis);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment