Commit d5bdfed0 authored by turneram

Merge remote-tracking branch 'origin/develop' into transformer-opts

parents 5ded4ac1 4ec8209f
@@ -319,7 +319,7 @@ struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(2);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
     argument
@@ -357,7 +357,7 @@ struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(3);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
@@ -223,7 +223,7 @@ struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
    {
         check_shapes{inputs, *this}.has(1);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
@@ -93,7 +93,7 @@ add_library(migraphx_device
 )
 add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
-target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
+target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
 check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
 if(HAS_HIP_LAMBDA_HOST_DEVICE)
     message(STATUS "Enable -fhip-lambda-host-device")
@@ -22,6 +22,7 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);
 #if MIGRAPHX_USE_HIPRTC
@@ -133,6 +134,7 @@ struct hiprtc_program
         std::vector<char> buffer(n);
         MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
         assert(buffer.back() == 0);
+        // cppcheck-suppress returnDanglingLifetime
         return {buffer.begin(), buffer.end() - 1};
     }
@@ -246,6 +248,16 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
         MIGRAPHX_THROW("Missing hsaco");
     };
+    if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
+    {
+        for(const auto& src : srcs)
+        {
+            if(src.path.extension() != ".cpp")
+                continue;
+            std::cout << std::string(src.content.first, src.len()) << std::endl;
+        }
+    }
     if(enabled(MIGRAPHX_GPU_DUMP_ASM{}))
     {
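
A usage note on the new switch: with the block above in place, setting the MIGRAPHX_GPU_DUMP_SRC environment variable makes compile_hip_src print every .cpp kernel source it is handed before compilation, complementing MIGRAPHX_GPU_DUMP_ASM, which dumps the compiled assembly.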
@@ -6,6 +6,7 @@
 #include <migraphx/cpp_generator.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
+#include <migraphx/permutation.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/eliminate_common_subexpression.hpp>
@@ -28,7 +29,8 @@ ${preamble}
 extern "C" {
 __global__ void kernel(${params})
 {
-    pointwise(${lambda}, ${args});
+    auto idx = make_index();
+    pointwise(idx, auto_preload<${preloads}>(idx), vectorize<${vec_size}, ${axis}>())(${lambda}, ${args});
 }
 }
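
For orientation, here is a sketch of what the interpolated template could produce for a two-input add with vec_size 4 on axis 1 where only the second input is preloaded into LDS. The private_p0/private_p1/private_p2 naming and the inline lambda are illustrative assumptions; the real values come from enum_params and the op's stored lambda:

extern "C" {
__global__ void kernel(void * private_p0, void * private_p1, void * private_p2)
{
    auto idx = make_index();
    pointwise(idx, auto_preload<false, false, true>(idx), vectorize<4, 1>())(
        [](auto x, auto y) { return x + y; }, private_p0, private_p1, private_p2);
}
}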
@@ -41,40 +43,105 @@ struct pointwise_compiler : compiler<pointwise_compiler>
 {
     std::vector<std::string> names() const { return {"pointwise"}; }
-    static std::size_t oversubscribe(const std::vector<shape>& inputs)
-    {
-        if(std::any_of(inputs.begin(), inputs.end(), [](const auto& s) { return s.broadcasted(); }))
-            return 1;
-        else
-            return 256;
-    }
-    static std::size_t vectorize_elements(const std::vector<shape>& inputs)
-    {
-        std::size_t n = inputs.front().elements();
-        if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
-               return s.packed() or s.broadcasted();
-           }))
-        {
-            if((n % 4) == 0)
-                return n / 4;
-            else if((n % 2) == 0)
-                return n / 2;
-        }
-        return n;
-    }
+    static std::size_t oversubscribe_if(bool b)
+    {
+        if(b)
+            return 256;
+        else
+            return 1;
+    }
+    static std::size_t find_fast_axis(const std::vector<shape>& inputs)
+    {
+        auto permutation = find_permutation(inputs);
+        auto it = std::max_element(permutation.begin(), permutation.end());
+        return it - permutation.begin();
+    }
+    static std::vector<bool> preload(std::size_t axis, const std::vector<shape>& inputs)
+    {
+        const std::size_t max_lds_bytes = 4096;
+        std::vector<bool> result;
+        std::transform(inputs.begin(),
+                       inputs.end(),
+                       std::back_inserter(result),
+                       [&](const shape& input) { return input.strides()[axis] == 0; });
+        auto bytes = std::inner_product(inputs.begin(),
+                                        inputs.end(),
+                                        result.begin(),
+                                        std::size_t{0},
+                                        std::plus<>{},
+                                        [](const shape& s, bool b) -> std::size_t {
+                                            if(b)
+                                                return s.bytes();
+                                            return 0;
+                                        });
+        if(bytes < max_lds_bytes)
+            return result;
+        // TODO: Try to partially preload items
+        std::fill(result.begin(), result.end(), false);
+        return result;
+    }
+    static std::string preload_str(const std::vector<bool>& bs)
+    {
+        std::vector<std::string> bool_strs;
+        std::transform(bs.begin(), std::prev(bs.end()), std::back_inserter(bool_strs), [](bool b) {
+            if(b)
+                return "true";
+            return "false";
+        });
+        return "false, " + join_strings(bool_strs, ", ");
+    }
+    static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
+    {
+        // If all inputs are half then only use half2
+        if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
+               return s.type() == shape::half_type;
+           }))
+            return {2};
+        return {4, 2};
+    }
+    static auto vectorize_elements(std::size_t axis, const std::vector<shape>& inputs)
+    {
+        auto sizes = vector_sizes(inputs);
+        std::vector<std::size_t> max_vec_size;
+        std::transform(inputs.begin(),
+                       inputs.end(),
+                       std::back_inserter(max_vec_size),
+                       [&](const auto& input) -> std::size_t {
+                           auto stride = input.strides()[axis];
+                           auto len = input.lens()[axis];
+                           if(stride != 0 and stride != 1)
+                               return 1;
+                           auto it = std::find_if(
+                               sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
+                           if(it != sizes.end())
+                               return *it;
+                           return 1;
+                       });
+        return *std::min_element(max_vec_size.begin(), max_vec_size.end());
+    }
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        options.set_launch_params(
-            v, compute_global_for(ctx, vectorize_elements(inputs), oversubscribe(inputs)));
         options.inputs = inputs;
         options.output = inputs.back();
         options.virtual_inputs = reduce_dims(inputs);
         options.params = "-Wno-float-equal";
-        auto src = interpolate_string(pointwise_kernel,
+        auto axis = find_fast_axis(options.virtual_inputs);
+        auto vec_size = vectorize_elements(axis, options.virtual_inputs);
+        auto preloads = preload(axis, options.virtual_inputs);
+        auto is_preloading =
+            std::accumulate(preloads.begin(), preloads.end(), false, std::logical_or<>{});
+        options.set_launch_params(v,
+                                  compute_global_for(ctx,
+                                                     options.output.elements() / vec_size,
+                                                     oversubscribe_if(not is_preloading)));
+        auto src = interpolate_string(pointwise_kernel,
                                       {{"params", enum_params(inputs.size(), "void * private_p")},
                                        {"args", enum_params(inputs.size(), "private_p")},
                                        {"lambda", v.at("lambda").to<std::string>()},
+                                       {"vec_size", std::to_string(vec_size)},
+                                       {"axis", std::to_string(axis)},
+                                       {"preloads", preload_str(preloads)},
                                        {"preamble", v.get("preamble", std::string{})}});
         return compile_hip_code_object(src, options);
     }
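
To make the selection logic concrete, here is a small host-only sketch of the vectorization decision; toy_shape and pick_vec_size are illustrative stand-ins for migraphx::shape and vectorize_elements, not library APIs:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

struct toy_shape
{
    std::vector<std::size_t> lens;
    std::vector<std::size_t> strides;
};

// Every input must either ignore the fast axis (stride 0, a broadcast) or be
// contiguous on it (stride 1), and the axis length must be divisible by the
// candidate width; otherwise the whole kernel falls back to a smaller width.
std::size_t pick_vec_size(std::size_t axis, const std::vector<toy_shape>& inputs)
{
    std::size_t result = 4;
    for(const auto& s : inputs)
    {
        std::size_t best = 1;
        std::size_t stride = s.strides[axis];
        if(stride == 0 or stride == 1)
        {
            for(std::size_t n : {4, 2})
            {
                if(s.lens[axis] % n == 0)
                {
                    best = n;
                    break;
                }
            }
        }
        result = std::min(result, best);
    }
    return result;
}

int main()
{
    toy_shape a{{8, 6}, {6, 1}}; // packed: fast-axis length 6, so only 2 divides it
    toy_shape b{{8, 6}, {1, 0}}; // broadcast on the fast axis: stride 0 still qualifies
    assert(pick_vec_size(1, {a, b}) == 2);
}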
@@ -52,9 +52,8 @@ struct scatternd_compiler : compiler<scatternd_compiler>
     {
         hip_compile_options options;
         options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
-        auto out_s = inputs.back();
         options.inputs = inputs;
-        options.output = out_s;
+        options.output = inputs.back();
         options.kernel_name = "scatternd_kernel";
         options.virtual_inputs = inputs;
         auto reduction = "assign_" + v.get("reduction", std::string{"none"});
@@ -3,6 +3,14 @@
 #include <migraphx/kernels/integral_constant.hpp>
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_RETURNS(...) \
+    ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_LIFT(...) \
+    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
+
 namespace migraphx {

 struct swallow

@@ -161,6 +169,18 @@ constexpr auto pack(Ts... xs)
     return [=](auto f) { return f(xs...); };
 }
+template <class G, class F>
+constexpr auto join(G g, F f)
+{
+    return f([=](auto... xs) { return g(xs...); });
+}
+
+template <class G, class F, class... Fs>
+constexpr auto join(G g, F f, Fs... fs)
+{
+    return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
+}
+
 template <class Compare, class P1, class P2>
 constexpr auto pack_compare(Compare compare, P1 p1, P2 p2)
 {
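
Since join is easiest to understand by example, here is a minimal host-side sketch (pack and join re-declared verbatim so the snippet compiles standalone) showing how it flattens several packs into one call:

#include <cassert>

template <class... Ts>
constexpr auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}

template <class G, class F>
constexpr auto join(G g, F f)
{
    return f([=](auto... xs) { return g(xs...); });
}

template <class G, class F, class... Fs>
constexpr auto join(G g, F f, Fs... fs)
{
    return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
}

int main()
{
    // join(g, pack(1), pack(2, 3)) unpacks both packs and calls g(1, 2, 3)
    auto r = join([](int a, int b, int c) { return a + b + c; }, pack(1), pack(2, 3));
    assert(r == 6);
}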
@@ -191,39 +211,45 @@ constexpr auto arg(IntegralConstant ic)
     return arg_c<ic>();
 }
-inline constexpr auto rotate_last()
-{
-    return [](auto... xs) {
-        return [=](auto&& f) {
-            return sequence_c<sizeof...(xs)>([&](auto... is) {
-                constexpr auto size = sizeof...(is);
-                return f(arg_c<(is + size - 1) % size>()(xs...)...);
-            });
-        };
-    };
-}
-template <class F>
-constexpr auto transform_args(F f)
-{
-    return [=](auto... xs) {
-        return [=](auto g) { return f(xs...)([&](auto... ys) { return g(ys...); }); };
-    };
-}
-template <class F, class... Fs>
-constexpr auto transform_args(F f, Fs... fs)
-{
-    return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
-}
-// NOLINTNEXTLINE
-#define MIGRAPHX_RETURNS(...) \
-    ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
-// NOLINTNEXTLINE
-#define MIGRAPHX_LIFT(...) \
-    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
+template <class F>
+constexpr auto make_transform(F f)
+{
+    return [=](auto... xs) { return [=](auto g) { return f(g, xs...); }; };
+}
+// An arg transformation takes the arguments and then a function to take the new arguments:
+// transform(xs...)([](auto... ys) { ... })
+// The transform_args function takes a list of transformations and continually applies them
+template <class F>
+constexpr auto transform_args(F f)
+{
+    return f;
+}
+template <class F, class... Fs>
+constexpr auto transform_args(F f, Fs... fs)
+{
+    return make_transform([=](auto g, auto... xs) {
+        return f(xs...)([=](auto... ys) { return transform_args(fs...)(ys...)(g); });
+    });
+}
+// identity transform
+inline constexpr auto transform_args()
+{
+    return make_transform([](auto f, auto... xs) { return f(xs...); });
+}
+// Rotate the first argument to the last argument
+inline constexpr auto rotate_last()
+{
+    return make_transform([](auto f, auto... xs) {
+        return sequence_c<sizeof...(xs)>([&](auto... is) {
+            constexpr auto size = sizeof...(is);
+            return f(arg_c<(is + size - 1) % size>()(xs...)...);
+        });
+    });
+}
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
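
The comment block above states the protocol; a standalone host-side sketch (same definitions, minus the device qualifiers) shows the composition order, with each transform rewriting the argument list before the next one runs:

#include <cassert>

template <class F>
auto make_transform(F f)
{
    return [=](auto... xs) { return [=](auto g) { return f(g, xs...); }; };
}

template <class F>
auto transform_args(F f)
{
    return f;
}

template <class F, class... Fs>
auto transform_args(F f, Fs... fs)
{
    return make_transform([=](auto g, auto... xs) {
        return f(xs...)([=](auto... ys) { return transform_args(fs...)(ys...)(g); });
    });
}

int main()
{
    auto plus_one = make_transform([](auto f, auto... xs) { return f((xs + 1)...); });
    auto swap_two = make_transform([](auto f, auto x, auto y) { return f(y, x); });
    // (1, 2) -> plus_one -> (2, 3) -> swap_two -> (3, 2) -> final function
    auto r = transform_args(plus_one, swap_two)(1, 2)([](int a, int b) { return a * 10 + b; });
    assert(r == 32);
}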
@@ -38,20 +38,17 @@ constexpr implicit_conversion_op<T> implicit_conversion(T x)
 template <class F, class T, class... Ts>
 __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
 {
-    preload<typename T::type>(idx, xs...)([&](auto... ps) {
-        idx.global_stride(out.get_shape().elements(),
-                          [&](auto i) { out[i] = implicit_conversion(f(ps[i]...)); });
-    });
+    idx.global_stride(out.get_shape().elements(),
+                      [&](auto i) { out[i] = implicit_conversion(f(xs[i]...)); });
 }
-template <class F, class... Ts>
-__device__ void pointwise(F f, Ts*... ps)
+template <class... Transforms>
+__device__ auto pointwise(index idx, Transforms... transforms)
 {
-    auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
-    t(ps...)([&](auto... xs) {
-        auto idx = make_index();
-        pointwise_tensor(idx, f, xs...);
-    });
+    return [=](auto f, auto*... ps) {
+        auto t = transform_args(make_tensors(), rotate_last(), transforms...);
+        t(ps...)([&](auto... xs) { pointwise_tensor(idx, f, xs...); });
+    };
 }
 } // namespace migraphx
@@ -3,6 +3,8 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/functional.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/vec.hpp>
 namespace migraphx {

@@ -73,7 +75,7 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
     {
         if constexpr(decltype(tensor_vec_size(x)){} == 0)
         {
-            auto v = vectorize(x);
+            auto v = auto_vectorize(x);
             auto b = as_vec(tensor_vec_size(v), buffer + offset);
             idx.local_stride(v.get_shape().element_space(),
                              [&](auto i) { b[i] = v.data()[i]; });

@@ -126,5 +128,47 @@ __device__ auto preload(index idx, Ts... xs)
     };
 }
+inline __device__ auto auto_preload(index idx)
+{
+    return make_transform([=](auto f, auto out, auto... xs) {
+        preload<typename decltype(out)::type>(idx, xs...)([&](auto... ys) { f(out, ys...); });
+    });
+}
+
+template <bool B, class T>
+__device__ auto preload_copy(index idx, T x)
+{
+    return [=](auto f) {
+        if constexpr(B)
+        {
+            using type = typename T::type;
+            constexpr auto size = get_shape_c<T>{}.element_space();
+            __shared__ type buffer[size];
+            // TODO: Always vectorize when size > 4, and then use a second loop for the remainder
+            constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; });
+            auto input = as_vec<n>(remove_bool(x.data()));
+            auto b = as_vec<n>(remove_bool(buffer));
+            idx.local_stride(size / n, [&](auto i) { b[i] = input[i]; });
+            return f(x.with(buffer));
+        }
+        else
+        {
+            return f(x);
+        }
+    };
+}
+
+template <bool... Bs>
+__device__ auto auto_preload(index idx)
+{
+    return make_transform([=](auto f, auto... xs) {
+        auto invoke = [=](auto... ys) {
+            __syncthreads();
+            f(ys...);
+        };
+        join(invoke, preload_copy<Bs>(idx, xs)...);
+    });
+}
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
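
The variadic auto_preload above leans on join from functional.hpp: each preload_copy<B> is a continuation that may swap its tensor for a shared-memory copy before the final invoke runs. A host-side analogue of that control flow (copy_stage is an illustrative stand-in, with multiplication standing in for the staging copy):

#include <cassert>

template <class G, class F>
auto join(G g, F f)
{
    return f([=](auto... xs) { return g(xs...); });
}

template <class G, class F, class... Fs>
auto join(G g, F f, Fs... fs)
{
    return f([=](auto... xs) { return join([=](auto... ys) { return g(xs..., ys...); }, fs...); });
}

// Mimics preload_copy<B>: conditionally replace the argument before handing it
// to the continuation.
template <bool B>
auto copy_stage(int x)
{
    return [=](auto f) {
        if constexpr(B)
            return f(x * 100); // pretend x was staged into a shared buffer
        else
            return f(x);
    };
}

int main()
{
    int sum = 0;
    auto invoke = [&](auto... ys) { sum = (ys + ...); };
    join(invoke, copy_stage<true>(1), copy_stage<false>(2));
    assert(sum == 102); // stage 1 replaced its value, stage 2 passed it through
}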
@@ -118,15 +118,13 @@ constexpr roalign_settings<Ts...> make_roalign_settings(Ts... xs)
 }
 template <class T, class U, class V, class W, class Settings>
-__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t, Settings s)
+__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t, Settings s)
 {
     auto index = make_index();
     const auto x = x_t.begin();
     const auto rois = rois_t.begin();
     const auto ind = ind_t.begin();
-    auto out_ptr = y_t.begin();
     // input shape
     auto x_lens = x_t.get_shape().lens;
     auto channel_num = x_lens[1];

@@ -176,25 +174,25 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
             const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
             if constexpr(s.is_avg_pooling)
             {
-                out_ptr[i] = calc_pooling(offset_x,
-                                          roi_starts,
-                                          bin_size,
-                                          {ph, pw},
-                                          bin_grid_size,
-                                          in_dims,
-                                          s.roi_offset,
-                                          avg_pool{});
+                y_t[i] = calc_pooling(offset_x,
+                                      roi_starts,
+                                      bin_size,
+                                      {ph, pw},
+                                      bin_grid_size,
+                                      in_dims,
+                                      s.roi_offset,
+                                      avg_pool{});
             }
             else
             {
-                out_ptr[i] = calc_pooling(offset_x,
-                                          roi_starts,
-                                          bin_size,
-                                          {ph, pw},
-                                          bin_grid_size,
-                                          in_dims,
-                                          s.roi_offset,
-                                          max_pool{});
+                y_t[i] = calc_pooling(offset_x,
+                                      roi_starts,
+                                      bin_size,
+                                      {ph, pw},
+                                      bin_grid_size,
+                                      in_dims,
+                                      s.roi_offset,
+                                      max_pool{});
             }
         }
 }
@@ -61,10 +61,19 @@ constexpr auto common_vec_size()
     })(vec_size<Ts>()...);
 }
+// Bools can not be used as a vector type so convert it to uint8
+template <class T>
+__device__ __host__ T* remove_bool(T* x)
+{
+    return x;
+}
+
+inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
+
 template <index_int N, class T>
 __device__ __host__ auto as_vec(T* x)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
         return x;
     else
         return reinterpret_cast<vec<T, N>*>(x);
@@ -50,19 +50,10 @@ constexpr auto shape_step(Shape s, Axis)
     });
 }
-// Bools can not be used as a vector type so convert it to uint8
-template <class T>
-__device__ __host__ T* remove_bool(T* x)
-{
-    return x;
-}
-
-inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
-
 template <index_int N, class T, class Axis>
 __device__ __host__ auto as_vec(T x, Axis axis)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
         return x;
     else
         return make_tensor_view(as_vec<N>(remove_bool(x.data())),

@@ -72,7 +63,7 @@ __device__ __host__ auto as_vec(T x, Axis axis)
 template <index_int N, class T, class Axis>
 constexpr auto tensor_step(T x, Axis axis)
 {
-    if constexpr(N == 0)
+    if constexpr(N < 2)
     {
         return x;
     }

@@ -157,11 +148,11 @@ constexpr auto find_vectorize_size(P pred)
     else if constexpr(decltype(pred(_c<2>)){})
         return _c<2>;
     else
-        return _c<0>;
+        return _c<1>;
 }
 template <class T>
-__host__ __device__ auto vectorize(T x)
+__host__ __device__ auto auto_vectorize(T x)
 {
     if constexpr(tensor_vec_size<T>() == 0)
     {

@@ -194,7 +185,7 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
         {
             MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
             MIGRAPHX_ASSERT(s.lens[axis] > 0);
-            MIGRAPHX_ASSERT(n == 0 or s.lens[axis] % n == 0);
+            MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0);
             if constexpr(s.strides[axis] == 0)
                 return tensor_step<n>(x, axis);
             else

@@ -215,7 +206,32 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
 inline __device__ __host__ auto auto_vectorize()
 {
-    return [](auto... xs) { return [=](auto f) { auto_vectorize_impl(f, xs...); }; };
+    return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); });
+}
+
+template <index_int N, index_int Axis, class T>
+__device__ __host__ auto vectorize_tensor(T x)
+{
+    constexpr auto shape = get_shape_c<T>{};
+    if constexpr(shape.strides[Axis] == 0)
+        return tensor_step<N>(x, _c<Axis>);
+    else
+        return as_vec<N>(x, _c<Axis>);
+}
+
+template <index_int N, index_int Axis>
+__device__ __host__ auto vectorize()
+{
+    return make_transform([](auto f, auto... xs) {
+        if constexpr(N < 2)
+        {
+            f(xs...);
+        }
+        else
+        {
+            f(vectorize_tensor<N, Axis>(xs)...);
+        }
+    });
 }
 } // namespace migraphx
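
What as_vec<N> ultimately does to memory is a pointer reinterpretation, so indexing moves in strides of N scalars. A host-side sketch of the consequence (vec_t is an illustrative stand-in for the GPU vec alias):

#include <cassert>

template <class T, int N>
struct vec_t
{
    T data[N];
};

int main()
{
    alignas(16) float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    // The vector view at index i covers scalar elements [N*i, N*i + N)
    auto* v = reinterpret_cast<vec_t<float, 4>*>(buf);
    assert(v[1].data[2] == 6);
}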
@@ -505,7 +505,7 @@ struct ref_unary : auto_register_op<ref_unary<Op>>
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(1);
-        auto s = inputs.at(0);
+        const auto& s = inputs.at(0);
         return {s.type(), s.lens()};
     }
@@ -3,23 +3,21 @@
 #include <migraphx/migraphx.hpp>
 #include "test.hpp"
-TEST_CASE(add_op)
+TEST_CASE(add_literals)
 {
     migraphx::program p;
     migraphx::module m = p.get_main_module();
     migraphx::shape param_shape{migraphx_shape_float_type, {3, 3}};
-    auto x = m.add_parameter("x", param_shape);
-    auto y = m.add_parameter("y", param_shape);
+    std::vector<float> x_values(9, 1);
+    auto x = m.add_literal(param_shape, x_values.data());
+    std::vector<float> y_values(9, -1);
+    auto y = m.add_literal(param_shape, y_values.data());
     auto add_op = migraphx::operation("add");
     auto r = m.add_instruction(add_op, {x, y});
     m.add_return({r});
     // run on ref target
     p.compile(migraphx::target("ref"));
     migraphx::program_parameters pp;
-    std::vector<float> x_data(9, 1);
-    std::vector<float> y_data(9, -1);
-    pp.add("x", migraphx::argument(param_shape, x_data.data()));
-    pp.add("y", migraphx::argument(param_shape, y_data.data()));
     auto outputs = p.eval(pp);
     auto output = outputs[0];
     std::vector<float> expected(9, 0);
@@ -3,6 +3,7 @@
 #include <migraphx/make_op.hpp>
 #include <migraphx/generate.hpp>
 #include <migraphx/program.hpp>
+#include <migraphx/par_for.hpp>
 #include <migraphx/gpu/kernel.hpp>
 #include <migraphx/gpu/target.hpp>
 #include <migraphx/gpu/hip.hpp>

@@ -109,6 +110,24 @@ int main() {}
 )__migraphx__";
+
+// NOLINTNEXTLINE
+const std::string math_template = R"__migraphx__(
+#include <migraphx/kernels/pointwise.hpp>
+#include <migraphx/kernels/math.hpp>
+extern "C" {
+__global__ void kernel(${type}* p)
+{
+    auto x = *p;
+    *p = migraphx::implicit_conversion(migraphx::${invoke});
+}
+}
+int main() {}
+)__migraphx__";
+
 migraphx::src_file make_src_file(const std::string& name, const std::string& content)
 {
     return {name, std::make_pair(content.data(), content.data() + content.size())};

@@ -248,4 +267,66 @@ TEST_CASE(compile_pointwise)
     EXPECT(result == output_literal.get_argument());
 }
+TEST_CASE(compile_math)
+{
+    std::vector<std::string> math_invoke = {
+        // clang-format off
+        "abs(x)",
+        "acos(x)",
+        "acosh(x)",
+        "asin(x)",
+        "asinh(x)",
+        "atan(x)",
+        "atanh(x)",
+        "ceil(x)",
+        "cos(x)",
+        "cosh(x)",
+        "erf(x)",
+        "exp(x)",
+        "floor(x)",
+        "isnan(x)",
+        "log(x)",
+        "max(x, x)",
+        "min(x, x)",
+        "pow(x, 0)",
+        "pow(x, x)",
+        "round(x)",
+        "rsqrt(x)",
+        "sin(x)",
+        "sinh(x)",
+        "sqrt(x)",
+        "tan(x)",
+        "tanh(x)",
+        "where(true, x, x)",
+        // clang-format on
+    };
+    std::vector<std::string> data_types;
+    auto vec_sizes = {2, 4, 6};
+    for(auto&& t : migraphx::shape::types())
+    {
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
+            continue;
+        auto name = migraphx::shape::cpp_type(t);
+        if(t == migraphx::shape::half_type)
+            name.insert(0, "migraphx::");
+        data_types.push_back(name);
+        migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
+            return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
+        });
+    }
+    migraphx::shape input{migraphx::shape::float_type, {5, 2}};
+    migraphx::gpu::hip_compile_options options;
+    options.global = 1024;
+    options.local = 1024;
+    options.inputs = {input};
+    options.output = input;
+    migraphx::par_for(math_invoke.size() * data_types.size(), 1, [&](auto i) {
+        const auto& t = data_types[i % data_types.size()];
+        const auto& invoke = math_invoke[i / data_types.size()];
+        auto src = migraphx::interpolate_string(math_template, {{"type", t}, {"invoke", invoke}});
+        auto co = migraphx::gpu::compile_hip_code_object(src, options);
+        (void)co;
+    });
+}
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
-import migraphx
+import migraphx, array, sys
+
+
+def create_buffer(t, data, shape):
+    a = array.array(t, data)
+    m = memoryview(a.tobytes())
+    return m.cast(t, shape)


 def test_add_op():
     p = migraphx.program()
     mm = p.get_main_module()
-    param_shape = migraphx.shape(lens=[3, 3], type="float")
-    x = mm.add_parameter("x", param_shape)
-    y = mm.add_parameter("y", param_shape)
+    x = mm.add_literal(create_buffer('f', [1.0] * 9, (3, 3)))
+    y = mm.add_literal(create_buffer('f', [2.0] * 9, (3, 3)))
     add_op = mm.add_instruction(migraphx.op("add"), [x, y])
     mm.add_return([add_op])
     p.compile(migraphx.get_target("ref"))
     params = {}
-    params["x"] = migraphx.generate_argument(param_shape)
-    params["y"] = migraphx.generate_argument(param_shape)
     output = p.run(params)[-1].tolist()
-    assert output == [
-        a + b for a, b in zip(params["x"].tolist(), params["y"].tolist())
-    ]
+    assert output == list([3.0] * 9)


 def test_if_then_else():

@@ -60,5 +61,6 @@ def test_if_then_else():
 if __name__ == "__main__":
-    test_add_op()
+    if sys.version_info >= (3, 0):
+        test_add_op()
     test_if_then_else()
import migraphx, sys
try:
import numpy as np
except:
sys.exit()
def test_add_op():
p = migraphx.program()
mm = p.get_main_module()
x = mm.add_literal(np.ones((3, 3), dtype='float32'))
y = mm.add_literal(2 * np.ones((3, 3), dtype='float32'))
add_op = mm.add_instruction(migraphx.op("add"), [x, y])
mm.add_return([add_op])
p.compile(migraphx.get_target("ref"))
params = {}
output = p.run(params)[-1].tolist()
assert output == list(3 * np.ones((9), dtype='float32'))
if __name__ == "__main__":
test_add_op()