Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
int main() {}
)__migraphx__";
// Builds "<param>0,<param>1,...,<param>(count-1)" for the kernel source template
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
std::size_t compute_global(std::size_t n, std::size_t local = 1024)
{
// round up to enough groups to cover n elements, capped at 256 groups;
// each thread then strides over any remaining elements
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return nglobal;
}
operation compile_pointwise(context&, const std::vector<shape>& inputs, const std::string& lambda)
{
hip_compile_options options;
options.global = compute_global(inputs.front().elements());
options.local = 1024;
options.inputs = inputs;
options.output = inputs.back();
options.reduced_inputs = reduce_dims(inputs);
auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"lambda", lambda}});
return compile_hip_code_object(src, options);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
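For illustration, interpolate_string substitutes the ${params}, ${args}, and ${lambda} placeholders in the template above; with two inputs and a hypothetical add lambda the generated source would look roughly like this (a sketch, not captured compiler output):

// Hypothetical interpolation result for inputs.size() == 2 and
// lambda "[](auto x, auto y) { return x + y; }":
extern "C" {
__global__ void kernel(void * private_p0,void * private_p1)
{
    pointwise([](auto x, auto y) { return x + y; }, private_p0,private_p1);
}
}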
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
auto& compiler_map()
{
static std::unordered_map<std::string, compiler_compile> m; // NOLINT
return m;
}
auto& compiler_op_map()
{
static std::unordered_map<std::string, compiler_compile_op> m; // NOLINT
return m;
}
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop)
{
compiler_map()[name] = std::move(c);
compiler_op_map()[name] = std::move(cop);
}
bool has_compiler_for(const std::string& name) { return compiler_map().count(name) > 0; }
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op)
{
return compiler_map().at(op.name())(ctx, ins, op);
}
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
{
return compiler_op_map().at(name)(ctx, inputs, v);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/fill.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void fill(hipStream_t stream, const argument& result, unsigned long val)
{
nary(stream, result)([=]() __device__ { return val; });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#include <migraphx/requires.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class... Ts>
using common_type = typename std::common_type<Ts...>::type;
template <class T, MIGRAPHX_REQUIRES(is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
return std::isfinite(x) and std::isfinite(y) and
std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}
template <class T, MIGRAPHX_REQUIRES(not is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
return x == y;
}
template <class T, class U>
__device__ bool float_equal(T x, U y)
{
return float_equal_device<common_type<T, U>>(x, y);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
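The bracketing test above treats two floats as equal when each lies within one ULP of the other. A host-side analogue for illustration (not part of this commit):

#include <cassert>
#include <cmath>
#include <limits>

// Host analogue of float_equal_device: y must fall inside the one-ULP
// bracket around x, and both values must be finite.
template <class T>
bool float_equal_host(T x, T y)
{
    return std::isfinite(x) and std::isfinite(y) and
           std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
           std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}

int main()
{
    assert(0.1 + 0.2 != 0.3);                 // plain == misses by one ULP
    assert(float_equal_host(0.1 + 0.2, 0.3)); // bracketed compare succeeds
}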
@@ -75,8 +75,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype(
inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024)
{
-index_int groups = (n + local - 1) / local;
-index_int nglobal = std::min<index_int>(256, groups) * local;
+index_int groups = (n + local - 1) / local;
+// max possible number of blocks is set to 1B (1,073,741,824)
+index_int nglobal = std::min<index_int>(1073741824, groups) * local;
return [=](auto f) {
launch(stream, nglobal, local)([=](auto idx) __device__ {
......
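The change above raises the cap on launched groups from 256 to 2^30. A worked example of the difference (illustrative numbers):

#include <algorithm>
#include <cstdio>

int main()
{
    using index_int = unsigned long;
    index_int n = 10'000'000, local = 1024;
    index_int groups = (n + local - 1) / local;                          // 9766
    index_int old_ng = std::min<index_int>(256, groups) * local;        // 262144
    index_int new_ng = std::min<index_int>(1073741824, groups) * local; // 10000384
    // with the old cap each thread strided over ~38 elements; now the
    // grid covers n in a single pass
    std::printf("%lu %lu\n", old_ng, new_ng);
}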
@@ -57,9 +57,10 @@ inline auto mi_nglobal(const hip_shape<N>& s, index_int nlocal)
{
assert(s.standard);
assert(s.elements() > 0);
-index_int n = s.elements();
-index_int groups = (n + nlocal - 1) / nlocal;
-index_int nglobal = std::min<index_int>(128, groups) * nlocal;
+index_int n = s.elements();
+index_int groups = (n + nlocal - 1) / nlocal;
+// max possible number of blocks is set to 1B (1,073,741,824)
+index_int nglobal = std::min<index_int>(1073741824, groups) * nlocal;
assert(groups > 0);
assert(nglobal > 0);
......
@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
-#if __AMDGCN_WAVEFRONT_SIZE == 32
-#define MIGRAPHX_NO_DPP
-#endif
#ifdef MIGRAPHX_NO_DPP
template <index_int N,
class Op,
@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out);
+#if __AMDGCN_WAVEFRONT_SIZE == 64
out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out);
+#endif
}
__device__ inline void dpp_reduce(float& x, sum)
@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
"s_nop 1\n"
+#if __AMDGCN_WAVEFRONT_SIZE == 64
"v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
+#endif
"s_nop 1\n"
: "=v"(x)
: "0"(x));
@@ -135,21 +135,27 @@ template <index_int N,
MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{
-using type = decltype(f(deduce_for_stride(fs)));
-MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+constexpr index_int nthreads = 16;
+#else
+constexpr index_int nthreads = 64;
+#endif
+using type = decltype(f(deduce_for_stride(fs)));
+MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
type x = init;
fs([&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op);
-const auto ldsidx = idx.local / 64;
-if((idx.local % 64) == 63)
+const auto ldsidx = idx.local / nthreads;
+if((idx.local % nthreads) == nthreads - 1)
{
buffer[ldsidx] = x;
}
__syncthreads();
type y = init;
-for(index_int i = 0; i < idx.nlocal() / 64; i++)
+for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
{
y = op(y, buffer[i]);
}
......
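After this change the reduction stashes one partial per group of nthreads lanes (16 on wave32, 64 on wave64) instead of hard-coding 64. A host analogue of the two-level scheme (illustrative):

#include <cassert>
#include <numeric>
#include <vector>

int main()
{
    const std::size_t nthreads = 64, nlocal = 256;
    std::vector<int> x(nlocal, 1);
    std::vector<int> buffer(nlocal / nthreads); // one slot per lane group
    for(std::size_t w = 0; w < buffer.size(); w++)
        buffer[w] = std::accumulate(x.begin() + w * nthreads,
                                    x.begin() + (w + 1) * nthreads, 0);
    // every lane then combines the per-group partials
    int y = std::accumulate(buffer.begin(), buffer.end(), 0);
    assert(y == 256);
}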
@@ -44,12 +44,19 @@ __device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input,
template <index_int N, class Op, class T, class Input, class Output>
__device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output)
{
-block_scan<N>(idx,
-op,
-init,
-[&](auto f) -> decltype(f(index_int{})) { return idx.local_stride(n, f); },
-input,
-output);
+block_scan<N>(
+idx,
+op,
+init,
+[&](auto f) -> decltype(f(index_int{})) { return idx.local_stride(n, f); },
+input,
+output);
}
+template <class F>
+constexpr auto reverse_scan(index_int n, F f)
+{
+return [=](auto i, auto&&... xs) { return f(n - i - 1, xs...); };
+}
} // namespace device
......
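The new reverse_scan adapter flips the index space so a forward scan becomes a suffix scan. A host sketch of the same idea (illustrative, not the device code):

#include <cstdio>
#include <vector>

// Flip the index space: element i of the wrapped function reads element
// n - i - 1 of the underlying sequence.
template <class F>
auto reverse_scan(int n, F f)
{
    return [=](auto i, auto&&... xs) { return f(n - i - 1, xs...); };
}

int main()
{
    std::vector<int> in{1, 2, 3, 4};
    std::vector<int> out(4);
    auto read  = reverse_scan(4, [&](int j) { return in[j]; });
    auto write = reverse_scan(4, [&](int j, int x) { out[j] = x; });
    int running = 0;
    for(int i = 0; i < 4; i++) // forward inclusive scan over the flipped view
    {
        running += read(i);
        write(i, running);
    }
    // out now holds the suffix sums of in: {10, 9, 7, 4}
    for(int v : out)
        std::printf("%d ", v);
}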
@@ -129,6 +129,21 @@ __device__ __host__ T to_hip_type(T x)
// Hip doesn't support __fp16
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
+#define MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
+template <class X> \
+struct trait : std::trait<X> \
+{ \
+}; \
+\
+template <> \
+struct trait<T> : std::true_type \
+{ \
+};
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, __fp16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, __fp16)
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
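For reference, expanding the macro for is_floating_point yields a device-namespace trait that forwards to the std trait for every type except __fp16, which is forced to true:

// Expansion of MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16):
template <class X>
struct is_floating_point : std::is_floating_point<X>
{
};

template <>
struct is_floating_point<__fp16> : std::true_type
{
};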
@@ -14,28 +14,23 @@ constexpr void visit_tensor_size(index_int n, F f)
{
switch(n)
{
-case 1:
-{
+case 1: {
f(std::integral_constant<index_int, 1>{});
break;
}
-case 2:
-{
+case 2: {
f(std::integral_constant<index_int, 2>{});
break;
}
-case 3:
-{
+case 3: {
f(std::integral_constant<index_int, 3>{});
break;
}
-case 4:
-{
+case 4: {
f(std::integral_constant<index_int, 4>{});
break;
}
-case 5:
-{
+case 5: {
f(std::integral_constant<index_int, 5>{});
break;
}
@@ -181,7 +176,13 @@ template <index_int N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
return [&](auto f) {
-hip_visit_all_impl(get_shape(x),
+auto sx = get_shape(x);
+auto lens = sx.lens();
+assert(lens.back() % N == 0);
+assert(sx.strides().back() == 1);
+lens.back() /= N;
+shape vec_sx{sx.type(), lens};
+hip_visit_all_impl(vec_sx,
make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
f,
x,
......
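The new checks encode the vectorization contract: the innermost axis must be contiguous and divisible by N, and its length shrinks by N. A small worked example (hypothetical dims):

#include <cassert>
#include <vector>

int main()
{
    const std::size_t n = 4;             // vector width N
    std::vector<std::size_t> lens{2, 8}; // hypothetical dims, innermost stride 1
    assert(lens.back() % n == 0);        // innermost axis must divide evenly
    lens.back() /= n;                    // {2, 2}: the kernel sees 4 vector elements
    assert(lens.back() == 2);            // ...instead of 16 scalars
}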
@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
+#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
+#else
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
+#endif
+#endif
template <class T>
struct vector_type
{
@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
const bool in_range = idx.local < relements_v;
auto mean = [&](auto z) {
-return auto_block_reduce<MaxBlockSize>(
-idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
-value_type(relements);
+auto m = auto_block_reduce<MaxBlockSize>(
+idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
+value_type(relements);
+#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+__builtin_amdgcn_s_barrier();
+#endif
+return m;
};
// m = x - mean(x)
......
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class Iterator, class T>
constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
{
Iterator it;
typename std::iterator_traits<Iterator>::difference_type count;
typename std::iterator_traits<Iterator>::difference_type step;
count = std::distance(first, last);
while(count > 0)
{
it = first;
step = count / 2;
std::advance(it, step);
if(!(value < *it))
{
first = ++it;
count -= step + 1;
}
else
count = step;
}
return first;
}
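// Note: this upper_bound mirrors std::upper_bound so the multinomial
// kernel below can binary-search a CDF row in device code. Host
// illustration with hypothetical weights {1, 2, 3, 4} (CDF {1, 3, 6, 10}):
// a uniform draw u = 0.55 scaled by the row total (10) gives 5.5, and
// upper_bound returns the first entry greater than 5.5:
//
//     float cdf[] = {1, 3, 6, 10};
//     auto* it = upper_bound(std::begin(cdf), std::end(cdf), 0.55f * 10);
//     // std::distance(std::begin(cdf), it) == 2 -> sampled class index 2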
void multinomial(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1)
{
size_t batch_size = arg0.get_shape().lens().front();
size_t class_size = arg0.get_shape().lens().back();
size_t sample_size = result.get_shape().lens().back();
hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
result.visit([&](auto out) {
hip_visit_views(out)([&](auto output) {
gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ {
auto idx = output.get_shape().multi(i);
auto cdf_begin = cdf.begin() + (idx.front() * class_size);
auto cdf_end = cdf_begin + class_size;
auto sample_iter =
upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
output[i] = std::distance(cdf_begin, sample_iter);
});
});
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/nonzero.hpp>
#include <migraphx/gpu/device/float_equal.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data)
{
auto s = arg_data.get_shape();
auto elem_num = s.elements();
auto out_elem_num = result.get_shape().elements();
// run a block-wise prefix sum over the nonzero indicator to compute each
// element's index in the output; only one block can be used since there
// is a single prefix sum across the whole input
const index_int block_size = 256;
hip_visit_all(arg_data, s)([&](auto input, auto si) {
const auto* in_ptr = device_cast(input.data());
auto* ptr = result.cast<int64_t>();
gs_launch(stream, block_size, block_size)([=](auto, auto idx) __device__ {
// fill all output to 0 first
idx.local_stride(out_elem_num, [&](auto j) { ptr[j] = 0; });
block_scan<block_size>(
idx,
sum{},
0,
elem_num,
[&](auto j) { return (float_equal(in_ptr[j], 0)) ? 0 : 1; },
[&](auto j, auto x) {
auto out_loc = x - 1;
if(float_equal(in_ptr[j], 0))
return;
auto index = si.multi(j);
for(size_t k = 0; k < index.size(); ++k)
{
ptr[k * elem_num + out_loc] = index[k];
}
});
});
});
return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
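The scan-based layout is easiest to see on a small host example (illustrative; the device version scatters one row of multi-index coordinates per input dimension):

#include <cassert>
#include <vector>

int main()
{
    std::vector<int> in{0, 3, 0, 5};
    std::vector<long> out(in.size(), 0);
    int running = 0; // inclusive scan of the nonzero indicator {0,1,0,1}
    for(std::size_t j = 0; j < in.size(); j++)
    {
        running += (in[j] != 0) ? 1 : 0;
        if(in[j] != 0)
            out[running - 1] = j; // out_loc = scan - 1
    }
    assert(out[0] == 1 and out[1] == 3); // nonzeros at input positions 1 and 3
}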
#include <migraphx/gpu/device/prefix_scan_sum.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
@@ -8,29 +9,108 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
-void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis)
+void prefix_scan_sum(hipStream_t stream,
+const argument& result,
+const argument& arg,
+int32_t axis,
+bool exclusive,
+bool reverse)
{
-const index_int block_size = 256;
-const index_int n = arg.get_shape().lens()[axis];
-auto rlens = result.get_shape().lens();
-rlens[axis] = 1;
+const index_int max_block_size = 256;
+const index_int n = arg.get_shape().lens()[axis];
+auto rlens = result.get_shape().lens();
+rlens[axis] = 1;
hip_visit_all(result, arg, result.get_shape().with_lens(rlens))(
[=](auto output, auto input, auto rshape) {
-gs_launch(stream, rshape.elements() * block_size, block_size)(
-[=](auto i, auto idx) __device__ {
-const auto ridx = rshape.multi(i / block_size);
-auto compute_idx = [&](auto j) {
-auto k = ridx;
-k[axis] = j;
-return k;
-};
-block_scan<block_size>(idx,
-sum{},
-0,
-n,
-[&](auto j) { return input[compute_idx(j)]; },
-[&](auto j, auto x) { output[compute_idx(j)] = x; });
-});
+const index_int block_size = compute_block_size(rshape.elements(), max_block_size);
+if(reverse and exclusive)
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
+reverse_scan(n, [&](auto j, auto x) {
+if(j == n - 1)
+output[compute_idx(j)] = 0;
+if(j > 0)
+output[compute_idx(j - 1)] = x;
+}));
+});
+}
+else if(reverse)
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
+reverse_scan(n, [&](auto j, auto x) { output[compute_idx(j)] = x; }));
+});
+}
+else if(exclusive)
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+[&](auto j) { return input[compute_idx(j)]; },
+[&](auto j, auto x) {
+auto k = j + 1;
+if(j == 0)
+output[compute_idx(0)] = 0;
+if(k < n)
+output[compute_idx(k)] = x;
+});
+});
+}
+else
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+[&](auto j) { return input[compute_idx(j)]; },
+[&](auto j, auto x) { output[compute_idx(j)] = x; });
+});
+}
});
}
......
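The four branches above cover the cumsum variants; a host reference of what each computes (hypothetical input {1, 2, 3, 4}):

#include <algorithm>
#include <cassert>
#include <vector>

// Host reference for the four scan variants (illustrative).
std::vector<int> scan_ref(std::vector<int> v, bool exclusive, bool reverse)
{
    if(reverse)
        std::reverse(v.begin(), v.end());
    std::vector<int> out(v.size());
    int running = 0;
    for(std::size_t i = 0; i < v.size(); i++)
    {
        if(exclusive)
        {
            out[i] = running; // shift by one: element i excluded
            running += v[i];
        }
        else
        {
            running += v[i]; // inclusive: element i included
            out[i] = running;
        }
    }
    if(reverse)
        std::reverse(out.begin(), out.end());
    return out;
}

int main()
{
    std::vector<int> in{1, 2, 3, 4};
    assert((scan_ref(in, false, false) == std::vector<int>{1, 3, 6, 10}));
    assert((scan_ref(in, true, false) == std::vector<int>{0, 1, 3, 6}));
    assert((scan_ref(in, false, true) == std::vector<int>{10, 9, 7, 4}));
    assert((scan_ref(in, true, true) == std::vector<int>{9, 7, 4, 0}));
}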
@@ -20,34 +20,58 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-const index_int max_block_size = 256;
+const index_int max_block_size = 128;
const index_int block_size = compute_block_size(batch_item_num, max_block_size);
-gs_launch(stream,
-batch_shape.elements() * block_size,
-block_size)([=](auto i, auto idx) __device__ {
-auto data_idx = batch.multi(i / block_size);
-using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
-type init = lowest();
-auto batch_max = block_reduce<max_block_size>(
-idx, max{}, init, batch_item_num, [&](auto j) __device__ {
-data_idx[axis] = j;
-return input[data_idx];
-});
+using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+type init = lowest();
+if(axis == batch_lens.size() - 1)
+{
+gs_launch(stream, batch_shape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+auto start_loc = i / block_size * batch_item_num;
+auto batch_max = block_reduce<max_block_size>(
+idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+return input[start_loc + j];
+});
+auto batch_sum = block_reduce<max_block_size>(
+idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+auto val = input[start_loc + j] - batch_max;
+return ::exp(to_hip_type(val));
+});
-auto batch_sum =
-block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
-data_idx[axis] = j;
-auto val = input[data_idx] - batch_max;
-return ::exp(to_hip_type(val));
+idx.local_stride(batch_item_num, [&](auto j) __device__ {
+auto val = input[start_loc + j] - batch_max;
+output[start_loc + j] = ::exp(to_hip_type(val)) / batch_sum;
+});
+});
+}
+else
+{
+gs_launch(stream, batch_shape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+auto data_idx = batch.multi(i / block_size);
+auto batch_max = block_reduce<max_block_size>(
+idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+data_idx[axis] = j;
+return input[data_idx];
+});
-idx.local_stride(batch_item_num, [&](auto j) __device__ {
-data_idx[axis] = j;
-auto val = input[data_idx] - batch_max;
-output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
-});
-});
+auto batch_sum = block_reduce<max_block_size>(
+idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+data_idx[axis] = j;
+auto val = input[data_idx] - batch_max;
+return ::exp(to_hip_type(val));
+});
+idx.local_stride(batch_item_num, [&](auto j) __device__ {
+data_idx[axis] = j;
+auto val = input[data_idx] - batch_max;
+output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
+});
+});
+}
});
}
......
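Both branches compute the usual max-subtracted softmax; the fast path only changes the addressing for the contiguous innermost axis. A scalar host reference (illustrative):

#include <algorithm>
#include <cmath>
#include <vector>

// Host reference: numerically stable softmax over one batch row.
std::vector<float> softmax_row(const std::vector<float>& x)
{
    float m = *std::max_element(x.begin(), x.end()); // batch_max
    float s = 0;
    for(float v : x)
        s += std::exp(v - m); // batch_sum
    std::vector<float> out;
    out.reserve(x.size());
    for(float v : x)
        out.push_back(std::exp(v - m) / s);
    return out;
}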
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/topk.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/ranges.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T, class Index, class Compare>
struct hip_heap_vector
{
MIGRAPHX_DEVICE_CONSTEXPR hip_heap_vector(T* val, index_int n, Index v_idx, Compare comp)
: data(val), size(n), data_index(v_idx), compare(comp)
{
make_heap(size);
}
MIGRAPHX_DEVICE_CONSTEXPR void try_push(const T val)
{
// the heap root holds the weakest of the current k candidates; skip
// values that do not beat it, otherwise evict the root and re-heapify
if(compare(val, data[data_index(0)]))
return;
pop_heap(size - 1);
data[data_index(size - 1)] = val;
push_heap(size - 1);
}
MIGRAPHX_DEVICE_CONSTEXPR void sort() { sort_heap(size); }
private:
MIGRAPHX_DEVICE_CONSTEXPR inline static void swap(T& v1, T& v2)
{
T v = v1;
v1 = v2;
v2 = v;
}
MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_down(index_int n, index_int index)
{
while(index < n)
{
auto pre_index = index;
index_int l = 2 * index + 1;
index_int r = 2 * index + 2;
if(l < n && compare(data[data_index(l)], data[data_index(index)]))
{
index = l;
}
if(r < n && compare(data[data_index(r)], data[data_index(index)]))
{
index = r;
if(compare(data[data_index(l)], data[data_index(r)]))
{
index = l;
}
}
if(index == pre_index)
{
break;
}
swap(data[data_index(index)], data[data_index(pre_index)]);
}
}
MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_up(index_int index)
{
while(index > 0)
{
auto parent_idx = (index - 1) / 2;
if(not compare(data[data_index(index)], data[data_index(parent_idx)]))
{
break;
}
swap(data[data_index(index)], data[data_index(parent_idx)]);
index = parent_idx;
}
}
MIGRAPHX_DEVICE_CONSTEXPR inline void make_heap(index_int n)
{
for(int j = n / 2 - 1; j >= 0; --j)
{
heapify_down(n, j);
}
}
MIGRAPHX_DEVICE_CONSTEXPR inline void push_heap(index_int loc) { heapify_up(loc); }
MIGRAPHX_DEVICE_CONSTEXPR inline void pop_heap(index_int loc)
{
swap(data[data_index(0)], data[data_index(loc)]);
heapify_down(loc, 0);
}
MIGRAPHX_DEVICE_CONSTEXPR inline void sort_heap(index_int n)
{
for(int j = n - 1; j > 0; --j)
{
swap(data[data_index(0)], data[data_index(j)]);
heapify_down(j, 0);
}
}
T* data = nullptr;
index_int size;
Index data_index;
Compare compare;
};
template <class T, class Index, class Compare>
__device__ hip_heap_vector<T, Index, Compare>
make_heap(T* data, index_int n, Index idx, Compare compare)
{
return {data, n, idx, compare};
}
template <class Compare>
std::vector<argument> topk(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis,
Compare compare)
{
auto in_s = arg.get_shape();
auto in_lens = in_s.lens();
auto out_s = val_res.get_shape();
auto axis_dim = in_s.lens()[axis];
auto comp_lens = in_lens;
comp_lens[axis] = 1;
shape comp_s{in_s.type(), comp_lens};
std::size_t elem_num = comp_s.elements();
hip_visit_all(val_res, arg, out_s, in_s, comp_s)(
[&](auto out_val, auto input, auto oss, auto iss, auto css) {
auto* data = device_cast(input.data());
auto* out = device_cast(out_val.data());
auto* const ind = ind_res.cast<int64_t>();
gs_launch(stream, elem_num)([=](auto i) __device__ {
auto idx = css.multi(i);
auto in_idx = [&](int ii) {
auto iidx = idx;
iidx[axis] = ii;
return iss.index(iidx);
};
auto out_idx = [&](int ii) {
auto iidx = idx;
iidx[axis] = ii;
return oss.index(iidx);
};
auto data_compare = [=](auto ii, auto jj) {
return compare(data[in_idx(ii)], data[in_idx(jj)]);
};
for(int j = 0; j < k; ++j)
{
ind[out_idx(j)] = j;
}
auto hp = make_heap(ind, k, out_idx, data_compare);
for(int j = k; j < axis_dim; ++j)
{
hp.try_push(j);
}
hp.sort();
for(int j = 0; j < k; ++j)
{
out[out_idx(j)] = data[in_idx(ind[out_idx(j)])];
}
});
});
return {val_res, ind_res};
}
argument topk_largest(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis)
{
return {topk(stream, val_res, ind_res, arg, k, axis, std::less<>{})};
}
argument topk_smallest(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis)
{
return {topk(stream, val_res, ind_res, arg, k, axis, std::greater<>{})};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
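The selection strategy is the standard bounded heap: seed with the first k candidates, then let each later element evict the weakest root if it compares better. A host sketch with std::priority_queue (illustrative; the device code keeps indices in its own in-place heap):

#include <cassert>
#include <functional>
#include <queue>
#include <vector>

// Top-k largest via a min-heap of size k: the root is the smallest of
// the current candidates, so any larger value evicts it.
std::vector<int> topk_largest_host(const std::vector<int>& v, std::size_t k)
{
    std::priority_queue<int, std::vector<int>, std::greater<>> heap;
    for(int x : v)
    {
        if(heap.size() < k)
            heap.push(x);
        else if(x > heap.top())
        {
            heap.pop();
            heap.push(x);
        }
    }
    std::vector<int> out;
    while(!heap.empty()) // pops in ascending order
    {
        out.push_back(heap.top());
        heap.pop();
    }
    return {out.rbegin(), out.rend()}; // descending, like topk_largest
}

int main()
{
    assert((topk_largest_host({5, 1, 9, 3, 7}, 3) == std::vector<int>{9, 7, 5}));
}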
#include <migraphx/gpu/device/where.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/launch.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class Shape>
constexpr auto get_rank(const Shape&)
{
return decltype(typename Shape::hip_index{}.size()){};
}
void where(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1,
const argument& arg2)
{
hip_visit_all(result, arg1, arg2)([&](auto output, auto x, auto y) {
hip_visit_all(arg0)([&](auto cond) {
if constexpr(get_rank(cond.get_shape()) == get_rank(output.get_shape()))
{
gs_launch(stream, arg1.get_shape().elements())([=](auto idx) __device__ {
auto i = output.get_shape().multi(idx);
output[i] = cond[i] ? x[i] : y[i];
});
}
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
+file(GLOB GPU_DRIVER_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
add_executable(gpu-driver
-action.cpp
-compile_pointwise.cpp
-main.cpp
-parser.cpp
-perf.cpp
-run_op.cpp
+${GPU_DRIVER_SRCS}
)
target_include_directories(gpu-driver PRIVATE include)
target_link_libraries(gpu-driver PRIVATE migraphx_gpu)
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
@@ -8,13 +8,13 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
-struct compile_pointwise : action<compile_pointwise>
+struct compile_op : action<compile_op>
{
static void apply(const parser& p, const value& v)
{
context ctx;
auto inputs = p.parse_shapes(v.at("inputs"));
-auto op = gpu::compile_pointwise(ctx, inputs, v.at("lambda").to<std::string>());
+auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
}
......
@@ -2,6 +2,7 @@
#include <migraphx/json.hpp>
#include <migraphx/convert_to_json.hpp>
#include <migraphx/file_buffer.hpp>
+#include <iostream>
using namespace migraphx; // NOLINT
using namespace migraphx::gpu; // NOLINT
......