Commit be5f3539 authored by Shucai Xiao

merge develop branch changes

parents 7e3bdc34 ebfe9735
#include <migraphx/gpu/argmax.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).standard();
return op.compute_shape({inputs.at(0)});
}
argument hip_argmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::argmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/argmin.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).standard();
return op.compute_shape({inputs.at(0)});
}
argument hip_argmin::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::argmin(ctx.get_stream().get(), args.back(), args.front(), op.axis);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
arg_op(argmax_op{}, stream, result, arg, axis);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
arg_op(argmin_op{}, stream, result, arg, axis);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
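Both wrappers write into the preallocated output buffer passed as the last argument (hence has(2) in compute_shape above) and return it; argmin is argmax with the comparison flipped. For the semantics the kernels are expected to match, here is a host-side reference of argmax along one axis of a dense row-major tensor (reference only, not code from this commit):

#include <cstdint>
#include <vector>

// Reference-only sketch: argmax along `axis` of a dense row-major tensor with
// dimensions `lens`; for every position outside the reduced axis the result
// holds the index of the largest element along that axis.
std::vector<int64_t> argmax_reference(const std::vector<float>& data,
                                      const std::vector<int64_t>& lens,
                                      int64_t axis)
{
    int64_t outer = 1, inner = 1, n = lens[axis];
    for(int64_t i = 0; i < axis; i++)
        outer *= lens[i];
    for(int64_t i = axis + 1; i < static_cast<int64_t>(lens.size()); i++)
        inner *= lens[i];
    std::vector<int64_t> result(outer * inner, 0);
    for(int64_t o = 0; o < outer; o++)
        for(int64_t in = 0; in < inner; in++)
        {
            int64_t best = 0;
            for(int64_t j = 1; j < n; j++)
                if(data[(o * n + j) * inner + in] > data[(o * n + best) * inner + in])
                    best = j;
            result[o * inner + in] = best;
        }
    return result;
}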
......@@ -10,22 +10,22 @@ namespace gpu {
namespace device {
argument concat(hipStream_t stream,
const migraphx::shape& output_shape,
const migraphx::shape&,
std::vector<migraphx::argument> args,
std::vector<std::size_t> offsets)
{
for(std::size_t l = 0; l < args.size() - 1; l++)
auto ninputs = args.size() - 1;
for(std::size_t j = 0; j < ninputs; j++)
{
auto argl = args[l];
std::size_t nelements = argl.get_shape().elements();
visit_all(args.back(), argl)([&](auto output, auto input) {
visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
auto* outptr = output.data() + offsets[l];
const auto* inptr = input.data();
hip_tensor_descriptor<ndim> desc_input(input.get_shape());
hip_tensor_descriptor<ndim> desc_output(output.get_shape());
gs_launch(stream, nelements)(
[=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
auto&& arg = args[j];
std::size_t nelements = arg.get_shape().elements();
auto offset = offsets[j];
shape arg_shape{arg.get_shape().type(), arg.get_shape().lens()};
hip_visit_all(args.back(), arg, arg_shape)([&](auto output, auto input, auto input_shape) {
gs_launch(stream, nelements)([=](auto i) {
auto input_idx = input_shape.multi(i);
auto idx = output.get_shape().index(input_idx);
output.data()[idx + offset] = input[input_idx];
});
});
}
......
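The rewritten concat loop visits one input at a time: every element keeps its multi-index, the multi-index is mapped through the output's strides, and the precomputed per-input offsets[j] (the flat position of input j along the concatenation axis) is added before the store. A host-side sketch of the same indexing, assuming row-major contiguous inputs (reference only, names are placeholders):

#include <cstddef>
#include <vector>

// Reference-only sketch of the copy performed for a single concat input.
void concat_copy_reference(std::vector<float>& output,
                           const std::vector<float>& input,
                           std::size_t offset, // flat offset of this input inside the output
                           const std::vector<std::size_t>& input_lens,
                           const std::vector<std::size_t>& output_strides)
{
    const std::size_t ndim = input_lens.size();
    std::vector<std::size_t> input_strides(ndim, 1); // row-major strides of the input
    for(std::size_t k = ndim - 1; k > 0; k--)
        input_strides[k - 1] = input_strides[k] * input_lens[k];
    for(std::size_t i = 0; i < input.size(); i++)
    {
        std::size_t out_pos = offset;
        for(std::size_t k = 0; k < ndim; k++)
            out_pos += ((i / input_strides[k]) % input_lens[k]) * output_strides[k];
        output[out_pos] = input[i];
    }
}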
#include <migraphx/gpu/device/erf.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void erf(hipStream_t stream, const argument& result, const argument& arg)
{
nary(stream, result, arg)([](auto x) { return ::erf(to_hip_type(x)); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
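erf is another instance of the nary pointwise pattern used throughout these device files: nary(stream, result, args...) handles the launch and indexing, and the lambda only maps element values, with to_hip_type promoting __fp16 to float so the libm call resolves. Written the same way, an extra unary operator would look like the following (illustration of the pattern only, not part of this commit):

// Illustrative only: a unary absolute-value kernel in the same nary style.
void abs_val(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::fabs(to_hip_type(x)); });
}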
......@@ -11,35 +11,30 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument gather(hipStream_t stream,
const migraphx::shape& output_shape,
std::vector<migraphx::argument> args,
int axis)
argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int axis)
{
auto axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
visit_all(args.back(), args[0])([&](auto output, auto input) {
std::size_t nelements = output_shape.elements();
args[1].visit([&](auto indices) {
const auto* indices_ptr = device_cast(indices.data());
auto* out_ptr = device_cast(output.data());
const auto* in_ptr = device_cast(input.data());
auto& input_shape = args[0].get_shape();
auto lens = input_shape.lens();
lens[axis_index] = args[1].get_shape().elements();
migraphx::shape out_comp_shape{output_shape.type(), lens};
visit_tensor_size(out_comp_shape.lens().size(), [&](auto n_out_dim) {
hip_tensor_descriptor<n_out_dim> desc_input(input_shape);
hip_tensor_descriptor<n_out_dim> desc_output(out_comp_shape);
gs_launch(stream, nelements)([=](auto ii) {
auto in_idx = desc_output.multi(ii);
in_idx[axis_index] = indices_ptr[in_idx[axis_index]];
out_ptr[ii] = in_ptr[desc_input.linear(in_idx)];
auto axis_index = (axis < 0) ? (axis + arg1.get_shape().lens().size()) : axis;
auto& input_shape = arg1.get_shape();
auto lens = input_shape.lens();
lens[axis_index] = arg2.get_shape().elements();
shape out_comp_shape{result.get_shape().type(), lens};
std::size_t nelements = result.get_shape().elements();
visit_all(result, arg1)([&](auto output, auto input_v) {
hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
arg2.visit([&](auto indices) {
const auto* indices_ptr = device_cast(indices.data());
auto* output_ptr = device_cast(output.data());
gs_launch(stream, nelements)([=](auto i) {
auto idx = out_comp.multi(i);
idx[axis_index] = indices_ptr[idx[axis_index]];
output_ptr[i] = input[idx];
});
});
});
});
return args.back();
return result;
}
} // namespace device
......
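The rewritten gather builds out_comp_shape, the input shape with the axis dimension resized to the number of indices; each output element keeps its multi-index except that the coordinate along axis is looked up in the index tensor before reading the input. For the common axis-0 case this reduces to row selection (reference only):

#include <cstddef>
#include <vector>

// Reference-only sketch: gather along axis 0 of a row-major matrix,
// i.e. output(i, j) = data(index[i], j).
std::vector<float> gather_axis0_reference(const std::vector<float>& data,
                                          std::size_t cols,
                                          const std::vector<std::size_t>& index)
{
    std::vector<float> out(index.size() * cols);
    for(std::size_t i = 0; i < index.size(); i++)
        for(std::size_t j = 0; j < cols; j++)
            out[i * cols + j] = data[index[i] * cols + j];
    return out;
}
// e.g. data = {1,2, 3,4, 5,6} (shape {3,2}) and index = {2, 0} give {5,6, 1,2}.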
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T, std::size_t N>
struct hip_array
{
T d[N];
MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }
MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }
MIGRAPHX_DEVICE_CONSTEXPR std::integral_constant<std::size_t, N> size() const { return {}; }
MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }
MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }
MIGRAPHX_DEVICE_CONSTEXPR T dot(const hip_array& x) const
{
T result = 0;
for(std::size_t i = 0; i < N; i++)
result += x[i] * d[i];
return result;
}
MIGRAPHX_DEVICE_CONSTEXPR T product() const
{
T result = 1;
for(std::size_t i = 0; i < N; i++)
result *= d[i];
return result;
}
friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator*(const hip_array& x, const hip_array& y)
{
hip_array result;
for(std::size_t i = 0; i < N; i++)
result[i] = x[i] * y[i];
return result;
}
friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator+(const hip_array& x, const hip_array& y)
{
hip_array result{};
for(std::size_t i = 0; i < N; i++)
result[i] = x[i] + y[i];
return result;
}
};
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
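hip_array is the fixed-size container the device code uses in place of std::array for lengths and multi-indices; dot against a stride array turns a multi-index into a flat offset, and product over the lengths gives the element count, which is exactly how hip_shape below uses it. A minimal usage sketch (the members are __host__ __device__, so this compiles on the host as well):

// Minimal usage sketch of hip_array.
inline void hip_array_example()
{
    hip_array<std::size_t, 3> lens    = {{2, 3, 4}};
    hip_array<std::size_t, 3> strides = {{12, 4, 1}}; // row-major strides for lens
    hip_array<std::size_t, 3> idx     = {{1, 2, 3}};
    std::size_t flat  = idx.dot(strides); // 1*12 + 2*4 + 3*1 = 23
    std::size_t count = lens.product();   // 2*3*4 = 24
    (void)flat;
    (void)count;
}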
......@@ -11,9 +11,33 @@ namespace device {
struct index
{
std::size_t global;
std::size_t local;
std::size_t group;
std::size_t global = 0;
std::size_t local = 0;
std::size_t group = 0;
__device__ std::size_t nglobal() const { return blockDim.x * gridDim.x; } // NOLINT
__device__ std::size_t nlocal() const { return blockDim.x; } // NOLINT
template <class F>
__device__ void global_stride(std::size_t n, F f) const
{
const auto stride = nglobal();
for(std::size_t i = global; i < n; i += stride)
{
f(i);
}
}
template <class F>
__device__ void local_stride(std::size_t n, F f) const
{
const auto stride = nlocal();
for(std::size_t i = local; i < n; i += stride)
{
f(i);
}
}
};
template <class F>
......@@ -35,18 +59,26 @@ inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
};
}
template <class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index idx) -> decltype(f(i, idx))
{
return f(i, idx);
}
template <class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index) -> decltype(f(i))
{
return f(i);
}
inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
{
std::size_t groups = 1 + n / local;
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return [=](auto f) {
launch(stream, nglobal, local)([=](auto idx) {
for(size_t i = idx.global; i < n; i += nglobal)
{
f(i);
}
});
launch(stream, nglobal, local)(
[=](auto idx) { idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); }); });
};
}
......
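gs_launch now rounds the group count up instead of always adding one, caps the grid at 256 groups, and hands the per-thread loop to index::global_stride; gs_invoke lets the callback be either a plain f(i) or an f(i, idx) that also receives the launch index (the reduction kernels below use the second form). Illustrative sizing with the default local = 1024 (numbers only for illustration):

// gs_launch(stream, n) with n = 1'000'000:
//   groups  = (1'000'000 + 1023) / 1024 = 977
//   nglobal = min(256, 977) * 1024      = 262'144 threads
// idx.global_stride(n, ...) then makes thread g visit g, g + 262'144, ... so
// every i < n is covered even though fewer than n threads are launched.
// Both callback forms are accepted thanks to the two gs_invoke overloads:
//   gs_launch(stream, n)([=](auto i) { /* element-wise work */ });
//   gs_launch(stream, n, block)([=](auto i, auto idx) { /* also sees idx.local */ });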
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
struct sum
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
{
return x + y;
}
};
struct id
{
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
{
return x;
}
};
struct max
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
{
return x > y ? x : y;
}
};
struct min
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
{
return x < y ? x : y;
}
};
struct lowest
{
template <class T>
operator T() const
{
return device_cast(std::numeric_limits<host_type<T>>::lowest());
}
};
struct highest
{
template <class T>
operator T() const
{
return device_cast(std::numeric_limits<host_type<T>>::max());
}
};
#ifdef MIGRAPHX_NO_DPP
template <std::size_t N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
{
using type = decltype(f(idx.local));
MIGRAPHX_DEVICE_SHARED type buffer[N];
type x = init;
idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
buffer[idx.local] = x;
__syncthreads();
for(std::size_t s = 1; s < idx.nlocal(); s *= 2)
{
const std::size_t index = 2 * s * idx.local;
if(index + s < idx.nlocal())
{
buffer[index] = op(buffer[index], buffer[index + s]);
}
__syncthreads();
}
return buffer[0];
}
#else
constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; }
constexpr unsigned int dpp_row_bcast(unsigned int x)
{
unsigned int y = 0;
switch(x)
{
case 15: y = 0x142; break;
case 31: y = 0x143; break;
default: throw std::runtime_error("Unknown bcast");
}
return y;
}
template <unsigned int DppCtrl,
unsigned int RowMask = 0xf,
unsigned int BankMask = 0xf,
bool BoundCtrl = false,
class T>
__device__ T dpp_mov(T& x)
{
static const std::size_t n = sizeof(T) < 4 ? 1 : sizeof(T) / 4;
union type
{
uint32_t reg[n];
T data;
};
type output{};
type input{};
// cppcheck-suppress unreadVariable
input.data = x;
for(std::size_t i = 0; i < n; i++)
{
output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
}
return output.data;
}
template <class T, class Op>
__device__ void dpp_reduce(T& in, Op op)
{
T out{};
out = dpp_mov<dpp_row_shr(1)>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(2)>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out);
}
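The shuffle sequence is the usual GCN wavefront reduction; the trace below is my reading of the DPP controls used above (an assumption, not vendor documentation), and it is why block_reduce later stores from lane 63 of each wave:

// Rough trace of dpp_reduce over one 64-lane wavefront:
//   row_shr:1, :2, :4, :8       - running partial sums within each 16-lane row,
//                                 so lane 15 of every row holds that row's total
//   row_bcast:15 (row_mask 0xa) - folds row 0 into row 1 and row 2 into row 3
//   row_bcast:31 (row_mask 0xc) - folds the lower half into the upper half,
//                                 leaving the full result in lane 63, the lane
//                                 block_reduce writes to LDS.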
__device__ inline void dpp_reduce(float& x, sum)
{
#ifdef MIGRAPHX_USE_CLANG_TIDY
(void)x;
#else
__asm__ volatile("s_nop 4\n"
"v_add_f32 %0 %0 %0 row_shr:1\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:2\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
"s_nop 1\n"
: "=v"(x)
: "0"(x));
#endif
}
template <std::size_t N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
{
using type = decltype(f(idx.local));
MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
type x = init;
idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op);
const auto ldsidx = idx.local / 64;
if((idx.local % 64) == 63)
{
buffer[ldsidx] = x;
}
__syncthreads();
type y = init;
for(std::size_t i = 0; i < idx.nlocal() / 64; i++)
{
y = op(y, buffer[i]);
}
return y;
}
#endif
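Both block_reduce variants implement the same contract: each thread folds its local_stride share of the n inputs into a private accumulator, then the partials are combined across the workgroup (a shared-memory tree in the MIGRAPHX_NO_DPP fallback, DPP shuffles plus one LDS slot per 64-lane wave otherwise). A sequential statement of that contract, for reference:

// Sequential reference for block_reduce<N>(idx, op, init, n, f): the value the
// workgroup produces is init folded with f(0) ... f(n - 1), assuming op is
// associative and commutative (sum, max, min above).
template <class Op, class T, class F>
T block_reduce_reference(Op op, T init, std::size_t n, F f)
{
    T r = init;
    for(std::size_t i = 0; i < n; i++)
        r = op(r, f(i));
    return r;
}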
constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
size_t block_size = 64;
while(block_size < max_block_size and block_size < n)
block_size *= 2;
return block_size;
}
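compute_block_size picks the smallest power-of-two workgroup size that is at least 64, at most max_block_size, and no smaller than needed for the reduction length, for example:

// compute_block_size(100, 256)   -> 64 -> 128, stop (128 >= 100)     => 128
// compute_block_size(10000, 256) -> 64 -> 128 -> 256, stop (max hit) => 256
// compute_block_size(10, 256)    -> loop never runs                  => 64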
template <class Op, class T, class Input, class Output>
void reduce(hipStream_t stream,
const argument& result,
const argument& arg,
Op op,
T init,
Input read_input,
Output read_output)
{
auto&& output_shape = result.get_shape();
auto&& input_shape = arg.get_shape();
std::vector<std::size_t> reduce_lens;
std::transform(output_shape.lens().begin(),
output_shape.lens().end(),
input_shape.lens().begin(),
std::back_inserter(reduce_lens),
[](auto x, auto y) -> std::size_t {
if(x == y)
return 1;
else
return y;
});
shape reduce_slice{output_shape.type(), reduce_lens};
hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) {
auto nelements = result.get_shape().elements();
auto relements = reduce_slice.elements();
const std::size_t max_block_size = 256;
const std::size_t block_size = compute_block_size(relements, max_block_size);
gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
const auto out_idx = i / block_size;
auto base_idx = output.get_shape().multi(out_idx);
auto r = block_reduce<max_block_size>(idx, op, init, relements, [&](auto j) __device__ {
auto reduce_idx = reduce_shape.multi(j);
return read_input(input[reduce_idx + base_idx]);
});
if(idx.local == 0)
output.data()[out_idx] = read_output(r);
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
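reduce assigns one block of block_size threads to each output element; the block cooperatively folds the relements values of the matching reduction slice with block_reduce, and only the thread with idx.local == 0 writes the result, transformed by read_output. New reductions are built by choosing op, init, and the two element transforms, as reduce_sum at the bottom of this diff does. A hedged sketch of how a mean reduction could reuse the same helper (hypothetical, not part of this commit):

// Hypothetical sketch only: a reduce_mean on top of reduce(), dividing the
// accumulated sum by the number of reduced elements.
struct mean_post
{
    std::size_t n;
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
    {
        return x / n;
    }
};

void reduce_mean(hipStream_t stream, const argument& result, const argument& arg)
{
    std::size_t n = arg.get_shape().elements() / result.get_shape().elements();
    reduce(stream, result, arg, sum{}, 0, id{}, mean_post{n});
}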
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#include <migraphx/gpu/device/array.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <std::size_t N>
struct hip_shape
{
using hip_index = hip_array<std::size_t, N>;
hip_array<std::size_t, N> lens = {};
hip_array<std::size_t, N> strides = {};
bool standard = false;
__device__ __host__ hip_shape() = default;
hip_shape(const shape& s) : standard(s.standard())
{
assert(s.lens().size() == N);
assert(s.strides().size() == N);
std::copy(s.lens().begin(), s.lens().end(), lens.begin());
std::copy(s.strides().begin(), s.strides().end(), strides.begin());
}
MIGRAPHX_DEVICE_CONSTEXPR std::size_t elements() const { return lens.product(); }
MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(hip_index x) const { return x.dot(strides); }
MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::initializer_list<std::size_t> x) const
{
std::size_t idx = 0;
for(std::size_t i = 0; i < x.size(); i++)
idx += *(x.begin() + i) * strides[i];
return idx;
}
MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::size_t i) const
{
if(this->standard)
return i;
else
{
const std::size_t rank = this->lens.size();
std::size_t s = 1;
std::size_t result = 0;
for(std::size_t j = 0; j < this->lens.size(); j++)
{
const std::size_t k = rank - j - 1;
const std::size_t stride = this->strides[k];
const std::size_t len = this->lens[k];
const std::size_t slen = s * len;
const std::size_t idx = (i % slen) / s;
result += stride * idx;
s = slen;
}
return result;
}
}
MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(std::size_t idx) const
{
hip_index result;
std::size_t tidx = idx;
for(std::size_t is = 0; is < result.size(); is++)
{
result[is] = tidx / strides[is];
tidx = tidx % strides[is];
}
return result;
}
MIGRAPHX_DEVICE_CONSTEXPR hip_index carry(hip_index result) const
{
std::ptrdiff_t rem = 0;
for(std::ptrdiff_t i = result.size() - 1; i >= 0; i--)
{
auto z = result[i] + rem;
rem = z - std::ptrdiff_t(lens[i]) + 1;
if(rem > 0)
z -= rem;
else
rem = 0;
result[i] = z;
}
return result;
}
};
template <std::size_t N>
hip_shape<N> make_hip_shape(const shape& x)
{
return x;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
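hip_shape mirrors migraphx::shape on the device: index maps a multi-index (or, for non-standard layouts, a flat element number) to a memory offset through the strides, and multi recovers a multi-index from a flat element number by dividing by the strides, which inverts index for the packed row-major shapes it is used with. A small worked example:

// Standard shape with lens {2, 3, 4} and strides {12, 4, 1}:
//   multi(23)        = {23 / 12, (23 % 12) / 4, ((23 % 12) % 4) / 1} = {1, 2, 3}
//   index({1, 2, 3}) = 1*12 + 2*4 + 3*1                              = 23
//   elements()       = 2 * 3 * 4                                     = 24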
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_HPP
#include <hip/hip_runtime.h>
#include <migraphx/functional.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/visit.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class F>
void visit_tensor_size(std::size_t n, F f)
{
switch(n)
{
case 1:
{
f(std::integral_constant<std::size_t, 1>{});
break;
}
case 2:
{
f(std::integral_constant<std::size_t, 2>{});
break;
}
case 3:
{
f(std::integral_constant<std::size_t, 3>{});
break;
}
case 4:
{
f(std::integral_constant<std::size_t, 4>{});
break;
}
case 5:
{
f(std::integral_constant<std::size_t, 5>{});
break;
}
default: throw std::runtime_error("Unknown tensor size");
}
}
template <std::size_t NDim>
using hip_tensor_index = hip_array<std::size_t, NDim>;
template <size_t NDim>
struct hip_index
{
size_t d[NDim];
__device__ __host__ size_t& operator[](size_t i) { return d[i]; }
__device__ __host__ size_t operator[](size_t i) const { return d[i]; }
};
template <size_t NDim>
template <std::size_t NDim>
struct hip_tensor_descriptor
{
__device__ __host__ hip_tensor_descriptor() = default;
......@@ -63,12 +22,11 @@ struct hip_tensor_descriptor
std::copy(s.strides().begin(), s.strides().end(), strides);
}
__device__ __host__ hip_index<NDim> multi(size_t idx) const
__device__ __host__ hip_tensor_index<NDim> multi(std::size_t idx) const
{
hip_index<NDim> result{};
size_t tidx = idx;
for(size_t is = 0; is < NDim; is++)
hip_tensor_index<NDim> result{};
std::size_t tidx = idx;
for(std::size_t is = 0; is < NDim; is++)
{
result[is] = tidx / strides[is];
tidx = tidx % strides[is];
......@@ -76,16 +34,15 @@ struct hip_tensor_descriptor
return result;
}
__device__ __host__ size_t linear(hip_index<NDim> s) const
__device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
{
size_t idx = 0;
for(size_t i = 0; i < NDim; i++)
std::size_t idx = 0;
for(std::size_t i = 0; i < NDim; i++)
idx += s[i] * strides[i];
return idx;
}
size_t lens[NDim] = {};
size_t strides[NDim] = {};
std::size_t lens[NDim] = {};
std::size_t strides[NDim] = {};
};
} // namespace device
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#include <migraphx/gpu/device/shape.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T, std::size_t N>
struct hip_tensor_view
{
using value_type = T;
using hip_index = typename hip_shape<N>::hip_index;
__device__ __host__ hip_tensor_view() = default;
__host__ hip_tensor_view(tensor_view<T> x) : d(x.data()), s(x.get_shape()) {}
__host__ hip_tensor_view(T* x, const shape& ss) : d(x), s(ss) {}
MIGRAPHX_DEVICE_CONSTEXPR const hip_shape<N>& get_shape() const { return s; }
MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return s.elements(); }
MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return d; }
template <class U>
MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const
{
return d[s.index(i)];
}
MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return d; }
MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return d + size(); }
private:
value_type* d = nullptr;
hip_shape<N> s{};
};
template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(const shape& s, T* x)
{
return {x, s};
}
template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(tensor_view<T> x)
{
return {x};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -8,14 +8,45 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
#include <hip/hip_runtime.h>
#include <migraphx/half.hpp>
#include <migraphx/config.hpp>
#include <migraphx/tensor_view.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
template <class T, std::size_t N>
using vec = T __attribute__((ext_vector_type(N)));
template <std::size_t N, class T>
__device__ __host__ T* as_pointer(vec<T, N>* x)
{
return reinterpret_cast<T*>(x);
}
template <std::size_t N, class T>
__device__ __host__ vec<T, N>* as_vec(T* x)
{
return reinterpret_cast<vec<T, N>*>(x);
}
template <std::size_t N, class T>
tensor_view<vec<T, N>> as_vec(tensor_view<T> x)
{
return {x.get_shape(), as_vec<N>(x.data())};
}
template <std::size_t N, class... Ts>
auto pack_vec(Ts... xs)
{
return [=](auto f, std::size_t n) { return f(as_vec<N>(xs)[n]...); };
}
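vec<T, N> is Clang's ext_vector_type, and as_vec simply reinterprets a pointer to scalars as a pointer to N-wide vectors so kernels can issue wider loads and stores; this is only valid for contiguous data whose element count is a multiple of N and whose alignment allows it. A minimal sketch:

// Sketch: processing a contiguous float buffer four lanes at a time
// (assumes n is a multiple of 4 and the buffer is suitably aligned).
__device__ inline void scale_by_two(float* p, std::size_t n)
{
    vec<float, 4>* v = as_vec<4>(p);
    for(std::size_t i = 0; i < n / 4; i++)
        v[i] = v[i] * 2.0f;
}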
using gpu_half = __fp16;
namespace detail {
......@@ -25,6 +56,12 @@ struct device_type
using type = T;
};
template <class T, std::size_t N>
struct device_type<vec<T, N>>
{
using type = vec<typename device_type<T>::type, N>;
};
template <>
struct device_type<half>
{
......@@ -38,7 +75,7 @@ struct host_type
};
template <>
struct device_type<gpu_half>
struct host_type<gpu_half>
{
using type = half;
};
......@@ -54,7 +91,7 @@ using device_type = typename detail::device_type<T>::type;
template <class T>
host_type<T> host_cast(T x)
{
return reinterpret_cast<host_type<T>>(x);
return reinterpret_cast<const host_type<T>&>(x);
}
template <class T>
......@@ -64,9 +101,9 @@ host_type<T>* host_cast(T* x)
}
template <class T>
device_type<T> device_cast(T x)
device_type<T> device_cast(const T& x)
{
return reinterpret_cast<device_type<T>>(x);
return reinterpret_cast<const device_type<T>&>(x);
}
template <class T>
......@@ -76,13 +113,19 @@ device_type<T>* device_cast(T* x)
}
template <class T>
T to_hip_type(T x)
tensor_view<device_type<T>> device_cast(tensor_view<T> x)
{
return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
}
template <class T>
__device__ __host__ T to_hip_type(T x)
{
return x;
}
// HIP doesn't support __fp16
inline float to_hip_type(gpu_half x) { return x; }
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
} // namespace device
} // namespace gpu
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#include <migraphx/gpu/device/types.hpp>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T, std::size_t N>
struct hip_vector
{
MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default;
MIGRAPHX_DEVICE_CONSTEXPR hip_vector(std::size_t s) : len(s) {}
template <class Iterator>
__device__ __host__ hip_vector(Iterator start, Iterator last)
{
auto it = std::copy(start, last, d);
len = std::distance(d, it);
}
__device__ __host__ hip_vector(std::initializer_list<T> x)
{
std::copy(x.begin(), x.end(), d);
len = x.size();
}
MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }
MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }
MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[size() - 1]; }
MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[size() - 1]; }
MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }
MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return len; }
MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }
MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }
template <class U>
MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x)
{
d[len] = static_cast<U&&>(x);
len++;
}
private:
T d[N] = {};
std::size_t len = 0;
};
template <std::size_t N, class T>
hip_vector<T, N> to_hip_vector(const std::vector<T>& x)
{
hip_vector<T, N> result(x.size());
std::copy(x.begin(), x.end(), result.begin());
return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#include <migraphx/gpu/device/tensor_view.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class F>
void visit_tensor_size(std::size_t n, F f)
{
switch(n)
{
case 1:
{
f(std::integral_constant<std::size_t, 1>{});
break;
}
case 2:
{
f(std::integral_constant<std::size_t, 2>{});
break;
}
case 3:
{
f(std::integral_constant<std::size_t, 3>{});
break;
}
case 4:
{
f(std::integral_constant<std::size_t, 4>{});
break;
}
case 5:
{
f(std::integral_constant<std::size_t, 5>{});
break;
}
default: throw std::runtime_error("Unknown tensor size");
}
}
inline shape get_shape(const shape& x) { return x; }
template <class T>
auto get_shape(const T& x) -> decltype(x.get_shape())
{
return x.get_shape();
}
template <class V, class F, class... Ts>
void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
if(!std::all_of(
types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); }))
MIGRAPHX_THROW("Types must be the same");
std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
if(!std::all_of(
ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
MIGRAPHX_THROW("Ranks must be the same");
visit_tensor_size(s.lens().size(),
[&](auto ndim) { s.visit_type([&](auto as) { v(f(xs, ndim, as)...); }); });
}
template <class V, class F, class... Ts>
void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
if(!std::all_of(
ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
MIGRAPHX_THROW("Ranks must be the same");
visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
}
template <class F>
struct hip_convert
{
F f;
template <class RawData, class N, class As>
auto operator()(RawData x, N ndim, As as) const
-> decltype(make_hip_view<ndim>(x.get_shape(), f(as.from(x.data()))))
{
return make_hip_view<ndim>(x.get_shape(), f(as.from(x.data())));
}
template <class N, class As>
auto operator()(const shape& s, N ndim, As) const
{
return make_hip_shape<ndim>(s);
}
};
template <class F>
hip_convert<F> make_hip_convert(F f)
{
return {f};
}
template <class F>
struct hip_convert_view
{
F f;
template <class T, class N>
auto operator()(tensor_view<T> x, N ndim) const
{
return make_hip_view<ndim>(f(x));
}
template <class N>
auto operator()(const shape& s, N ndim) const
{
return make_hip_shape<ndim>(s);
}
};
template <class F>
hip_convert_view<F> make_hip_convert_view(F f)
{
return {f};
}
template <class T, class... Ts>
auto hip_visit_all(T&& x, Ts&&... xs)
{
return [&](auto f) {
hip_visit_all_impl(
get_shape(x), make_hip_convert([](auto* p) { return device_cast(p); }), f, x, xs...);
};
}
template <std::size_t N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
return [&](auto f) {
hip_visit_all_impl(get_shape(x),
make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
f,
x,
xs...);
};
}
template <class T, class... Ts>
auto hip_pointer_visit_all(T&& x, Ts&&... xs)
{
return [&](auto f) { visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); }); };
}
template <class T, class... Ts>
auto hip_visit_views(T&& x, Ts&&... xs)
{
return [&](auto f) {
hip_visit_views_impl(get_shape(x),
make_hip_convert_view([](auto v) { return device_cast(v); }),
f,
x,
xs...);
};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
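hip_visit_all (and the rank-only hip_visit_views) is what the kernels in this diff call to turn arguments and shapes into device-usable hip_tensor_views and hip_shapes with the element type and rank fixed at compile time. A usage sketch mirroring those call sites (result, arg, and stream are placeholders):

// Sketch of the common pattern: views can be indexed by flat index or by
// multi-index, and both go through hip_shape::index, so non-standard strides
// are handled by the view itself.
hip_visit_all(result, arg)([&](auto output, auto input) {
    gs_launch(stream, output.get_shape().elements())(
        [=](auto i) { output[i] = input[i]; });
});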
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/logsoftmax.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
......@@ -11,66 +12,45 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument logsoftmax(hipStream_t stream,
const migraphx::shape& output_shape,
std::vector<migraphx::argument> args,
int axis)
void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
{
auto lens = output_shape.lens();
auto num_in_batch = lens[axis];
auto batch_lens = lens;
batch_lens[axis] = 1;
migraphx::shape batch_shape{output_shape.type(), batch_lens};
visit_all(args.back(), args.front())([&](auto output, auto input) {
const auto* input_ptr = device_cast(input.data());
auto* output_ptr = device_cast(output.data());
visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
hip_tensor_descriptor<n_dim> desc_data(output_shape);
// each thread is for one item in the batch
gs_launch(stream, batch_shape.elements())([=](auto i) {
auto batch_idx = desc_batch.multi(i);
auto data_idx = batch_idx;
// get max
auto batch_max = input_ptr[desc_data.linear(batch_idx)];
for(std::size_t j = 1; j < num_in_batch; ++j)
{
auto lens = result.get_shape().lens();
auto batch_lens = lens;
std::size_t batch_item_num = lens[axis];
batch_lens[axis] = 1;
migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
const std::size_t max_block_size = 256;
const std::size_t block_size = compute_block_size(batch_item_num, max_block_size);
gs_launch(stream,
batch_shape.elements() * block_size,
block_size)([=](auto i, auto idx) __device__ {
auto data_idx = batch.multi(i / block_size);
using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
type init = lowest();
auto batch_max = block_reduce<max_block_size>(
idx, max{}, init, batch_item_num, [&](auto j) __device__ {
data_idx[axis] = j;
size_t idx = desc_data.linear(data_idx);
batch_max = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[idx]));
}
return input[data_idx];
});
for(std::size_t j = 0; j < num_in_batch; ++j)
{
data_idx[axis] = j;
size_t idx = desc_data.linear(data_idx);
output_ptr[idx] = input_ptr[idx] - batch_max;
}
auto batch_sum = ::exp(to_hip_type(output_ptr[desc_data.linear(batch_idx)]));
for(std::size_t j = 1; j < num_in_batch; ++j)
{
auto batch_sum =
block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
data_idx[axis] = j;
size_t idx = desc_data.linear(data_idx);
batch_sum += ::exp(to_hip_type(output_ptr[idx]));
}
batch_sum = ::log(to_hip_type(batch_sum));
auto val = input[data_idx] - batch_max;
return ::exp(to_hip_type(val));
});
for(std::size_t j = 0; j < num_in_batch; ++j)
{
data_idx[axis] = j;
size_t idx = desc_data.linear(data_idx);
output_ptr[idx] -= batch_sum;
}
auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max;
idx.local_stride(batch_item_num, [&](auto j) {
data_idx[axis] = j;
output[data_idx] = input[data_idx] - log_batch_sum;
});
});
});
return args.back();
}
} // namespace device
......
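The rewritten logsoftmax assigns one workgroup per batch slice and reuses the reduction machinery above: a block_reduce max, then a block_reduce of exp(x - max), then every element becomes x - (log(sum) + max), which is the numerically stable formulation. A scalar host reference for one slice:

#include <algorithm>
#include <cmath>
#include <vector>

// Reference only: logsoftmax(x)_j = x_j - (log(sum_k exp(x_k - m)) + m), m = max_k x_k.
std::vector<float> logsoftmax_reference(const std::vector<float>& x)
{
    float m = *std::max_element(x.begin(), x.end());
    float s = 0.0f;
    for(float v : x)
        s += std::exp(v - m);
    float shift = std::log(s) + m;
    std::vector<float> y(x.size());
    for(std::size_t i = 0; i < x.size(); i++)
        y[i] = x[i] - shift;
    return y;
}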
......@@ -4,6 +4,7 @@
#include <migraphx/gpu/device/pad.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/float_equal.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -14,24 +15,26 @@ argument
pad(hipStream_t stream, argument result, argument arg1, float value, std::vector<std::int64_t> pads)
{
std::size_t nelements = arg1.get_shape().elements();
hip_visit_all(result, arg1)([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
using hip_index = typename decltype(output)::hip_index;
type device_val = value;
if(float_equal(value, std::numeric_limits<float>::lowest()))
{
device_val = device_cast(std::numeric_limits<type>::lowest());
}
gs_launch(stream,
result.get_shape().elements())([=](auto i) { output.data()[i] = device_val; });
nary(stream, result)([=] { return value; });
visit_all(result, arg1)([&](auto output, auto input) {
visit_tensor_size(result.get_shape().lens().size(), [&](auto ndim) {
std::size_t offsets[ndim];
std::copy(pads.begin(), pads.begin() + ndim, offsets);
auto* outptr = output.data();
const auto* inptr = input.data();
hip_tensor_descriptor<ndim> desc_input(input.get_shape());
hip_tensor_descriptor<ndim> desc_output(output.get_shape());
gs_launch(stream, nelements)([=](auto i) {
auto idx = desc_input.multi(i);
for(std::size_t j = 0; j < ndim; j++)
{
idx[j] += offsets[j];
}
outptr[desc_output.linear(idx)] = inptr[i];
});
hip_index offsets;
std::copy(pads.begin(), pads.begin() + offsets.size(), offsets.begin());
gs_launch(stream, nelements)([=](auto i) {
auto idx = input.get_shape().multi(i);
for(std::size_t j = 0; j < offsets.size(); j++)
{
idx[j] += offsets[j];
}
output[idx] = input.data()[i];
});
});
return result;
......
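The rewritten pad works in two phases: a nary fill of the whole output with value, then a copy of the input in which every coordinate is shifted by the leading (per-dimension "before") pad amounts. A 1-D worked example, assuming the usual before/after pads layout:

// pads = {1, 2}, value = 0, input = {1, 2, 3}  (output length 3 + 1 + 2 = 6)
//   fill phase: output = {0, 0, 0, 0, 0, 0}
//   copy phase: input element i lands at position i + pads[0]
//               output = {0, 1, 2, 3, 0, 0}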
#include <migraphx/gpu/device/reduce_sum.hpp>
#include <migraphx/gpu/device/reduce.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void reduce_sum(hipStream_t stream, const argument& result, const argument& arg)
{
reduce(stream, result, arg, sum{}, 0, id{}, id{});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx