Commit 4a39a0f7 authored by Shucai Xiao

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into add-conv_bn_add-test

parents 5564172e bb827865
#ifndef MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#define MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
namespace migraphx {
template <class T, class... Shapes>
constexpr auto traverse_preload(Shapes... ss)
{
return [=](auto f, auto... g) {
index_int offset = 0;
auto each = [&](auto x) {
constexpr auto s = decltype(x.get_shape()){};
constexpr auto size = _c<s.element_space()>;
if constexpr(not s.broadcasted())
return f(x, offset, false_type{});
else if constexpr((s.elements() - size) < 64)
return f(x, offset, false_type{});
else
{
auto pre_offset = offset;
offset += size;
offset += offset % 4;
return f(x, pre_offset, true_type{});
}
};
return by(each, g...)(ss...);
};
}
template <class T, class... Shapes>
constexpr index_int compute_preload_size(Shapes...)
{
index_int size = 0;
traverse_preload<T>(Shapes{}...)(
[&](auto s, auto offset, auto) { size = offset + s.element_space(); });
return size;
}
template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
auto invoke = [&](auto... ys) {
__syncthreads();
f(ys...);
};
traverse_preload<T>(xs...)(
[&](auto x, auto offset, auto copy) {
if constexpr(copy)
{
auto v = vectorize(x);
auto b = as_vec(tensor_vec_size(v), buffer + offset);
idx.local_stride(v.get_shape().element_space(),
[&](auto i) { b[i] = v.data()[i]; });
return x.with(buffer + offset);
}
else
{
return x;
}
},
invoke);
}
template <class T>
struct remove_vec
{
using type = T;
};
template <class T, index_int N>
struct remove_vec<vec<T, N>>
{
using type = T;
};
template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
using type = typename remove_vec<T>::type;
constexpr auto size = compute_preload_size<type>(xs.get_shape()...);
const index_int max_size = 512 * sizeof(type);
return [=](auto f) {
if constexpr(size > 0 and size < max_size)
{
__shared__ type buffer[size];
preload_copy(idx, f, buffer, xs...);
}
else
{
f(xs...);
}
};
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
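A minimal usage sketch of preload, assuming a pointwise kernel that receives tensor_view arguments; pointwise_add_example is hypothetical, and make_index/global_stride are assumed to come from the index.hpp header included above, which this diff does not show:
template <class T, class U>
__device__ void pointwise_add_example(T x, U y)
{
    auto idx = make_index(); // assumed helper from index.hpp
    // preload<type> stages broadcasted inputs in __shared__ memory when the
    // estimated reuse is worth it, then calls the body with rewritten views.
    preload<typename T::type>(idx, x, y)([&](auto xp, auto yp) {
        idx.global_stride(yp.get_shape().elements(), [&](auto i) { yp[i] += xp[i]; });
    });
}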
#ifndef MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#define MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#include <hip/hip_runtime.h>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/algorithm.hpp>
namespace migraphx {
template <class F, class G>
struct on_exit
{
F f;
G g;
template <class T>
__host__ __device__ auto operator()(T x) const
{
return f(x);
}
__host__ __device__ ~on_exit() { f(g); }
};
template <class PrivateMIGraphXTypeNameProbe>
constexpr auto print_type_name_probe()
{
constexpr auto name = __PRETTY_FUNCTION__;
constexpr auto size = sizeof(__PRETTY_FUNCTION__);
constexpr auto parameter_name = "PrivateMIGraphXTypeNameProbe = ";
constexpr auto parameter_name_size = sizeof("PrivateMIGraphXTypeNameProbe = ") - 1;
constexpr auto begin =
search(name, name + size, parameter_name, parameter_name + parameter_name_size);
static_assert(begin < name + size, "Type probe not found.");
constexpr auto start = begin + parameter_name_size;
constexpr auto last = find_if(start, name + size, [](auto c) { return c == ']' or c == ';'; });
return [=](const auto& s) { s.print_string(start, last - start); };
}
template <class T>
struct type_printer
{
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& s, type_printer)
{
print_type_name_probe<T>()(s);
return s;
}
};
template <class T>
constexpr type_printer<T> type_of()
{
return {};
}
template <class T>
constexpr type_printer<T> type_of(T)
{
return {};
}
template <class T>
constexpr type_printer<typename T::type> sub_type_of()
{
return {};
}
template <class T>
constexpr type_printer<typename T::type> sub_type_of(T)
{
return {};
}
template <class F>
struct basic_printer
{
F f;
__host__ __device__ const basic_printer& print_long(long value) const
{
f([&] { printf("%li", value); });
return *this;
}
__host__ __device__ const basic_printer& print_ulong(unsigned long value) const
{
f([&] { printf("%lu", value); });
return *this;
}
__host__ __device__ const basic_printer& print_char(char value) const
{
f([&] { printf("%c", value); });
return *this;
}
__host__ __device__ const basic_printer& print_string(const char* value) const
{
f([&] { printf("%s", value); });
return *this;
}
__host__ __device__ const basic_printer& print_string(const char* value, int size) const
{
f([&] { printf("%.*s", size, value); });
return *this;
}
__host__ __device__ const basic_printer& print_double(double value) const
{
f([&] { printf("%f", value); });
return *this;
}
__host__ __device__ const basic_printer& print_bool(bool value) const
{
f([&] {
if(value)
printf("true");
else
printf("false");
});
return *this;
}
__host__ __device__ const basic_printer& operator<<(short value) const
{
return print_long(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned short value) const
{
return print_ulong(value);
}
__host__ __device__ const basic_printer& operator<<(int value) const
{
return print_long(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned int value) const
{
return print_ulong(value);
}
__host__ __device__ const basic_printer& operator<<(long value) const
{
return print_long(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned long value) const
{
return print_ulong(value);
}
__host__ __device__ const basic_printer& operator<<(float value) const
{
return print_double(value);
}
__host__ __device__ const basic_printer& operator<<(double value) const
{
return print_double(value);
}
__host__ __device__ const basic_printer& operator<<(bool value) const
{
return print_bool(value);
}
__host__ __device__ const basic_printer& operator<<(char value) const
{
return print_char(value);
}
__host__ __device__ const basic_printer& operator<<(unsigned char value) const
{
return print_char(value);
}
__host__ __device__ const basic_printer& operator<<(const char* value) const
{
return print_string(value);
}
};
template <class F>
constexpr basic_printer<F> make_printer(F f)
{
return {f};
}
template <class F, class G>
constexpr basic_printer<on_exit<F, G>> make_printer(F f, G g)
{
return {{f, g}};
}
inline __device__ auto cout()
{
return make_printer([](auto f) { f(); });
}
inline __device__ auto coutln()
{
return make_printer([](auto f) { f(); }, [] { printf("\n"); });
}
template <class F, class... Ts>
__device__ void print_each(F f, Ts... xs)
{
each_args([&](auto x) { f() << x; }, xs...);
}
template <class F, class... Ts>
__device__ void print_each_once(F f, Ts... xs)
{
auto idx = make_index();
if(idx.global == 0)
print_each(f, xs...);
}
template <class... Ts>
__device__ void print(Ts... xs)
{
print_each(&cout, xs...);
}
template <class... Ts>
__device__ void print_once(Ts... xs)
{
print_each_once(&cout, xs...);
}
template <class... Ts>
__device__ void println(Ts... xs)
{
print_each(&coutln, xs...);
}
template <class... Ts>
__device__ void println_once(Ts... xs)
{
print_each_once(&coutln, xs...);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRINT_HPP
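A minimal sketch of these printers in device code; debug_print_example is hypothetical and x stands for any tensor_view in scope:
template <class T>
__device__ void debug_print_example(T x)
{
    println_once("shape: ", x.get_shape()); // printed by global thread 0 only
    println_once("type: ", type_of(x));     // type name recovered by the probe above
}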
@@ -19,7 +19,7 @@ struct shape
constexpr index_int elements() const { return lens.product(); }
constexpr index_int element_space() const { return strides.dot(lens - 1); }
constexpr index_int element_space() const { return strides.dot(lens - 1) + 1; }
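// e.g. lens {2, 3}, strides {3, 1}: the last addressable offset is
// (2 - 1) * 3 + (3 - 1) * 1 = 5, so element_space() is 5 + 1 = 6, hence the + 1 above.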
constexpr bool packed() const { return elements() == element_space(); }
constexpr bool broadcasted() const { return strides.product() == 0; }
@@ -92,6 +92,15 @@ struct shape
result[0] = tidx;
return result;
}
constexpr shape get_shape() const { return *this; }
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& ss, const shape& s)
{
ss << "{" << s.lens << "}, {" << s.strides << "}";
return ss;
}
};
template <class Lens, class Strides>
......
@@ -2,18 +2,22 @@
#define MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP
#include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx {
template <class T, class Shape>
struct tensor_view
{
using type = T;
constexpr Shape get_shape() const { return Shape{}; }
constexpr index_int size() const { return get_shape().elements(); }
template <class U>
constexpr T& operator[](U i) const
{
MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space());
return x[get_shape().index(i)];
}
@@ -22,6 +26,13 @@ struct tensor_view
constexpr T* begin() const { return data(); }
constexpr T* end() const { return data() + size(); }
template <class U>
constexpr tensor_view<U, Shape> with(U* y) const
{
static_assert(sizeof(T) == sizeof(U), "Not the same size");
return {y};
}
T* x;
};
......
@@ -9,6 +9,9 @@ using index_int = std::uint32_t;
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_KERNELS_VEC_HPP
#define MIGRAPHX_GUARD_KERNELS_VEC_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <class T, index_int N>
constexpr auto vec_size(vec<T, N>)
{
return index_constant<N>{};
}
template <class T>
constexpr auto vec_size(T, ...)
{
return index_constant<0>{};
}
template <class T>
constexpr auto vec_size()
{
return decltype(vec_size(T{})){};
}
template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
if constexpr(N == 0)
return x;
else
return reinterpret_cast<vec<T, N>*>(x);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
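A small sketch of the pointer overload; as_vec_example is hypothetical and buf is assumed suitably aligned:
__device__ void as_vec_example(float* buf)
{
    auto v4 = as_vec<4>(buf); // vec<float, 4>*: one access moves four floats
    auto v0 = as_vec<0>(buf); // N == 0 means "not vectorized": buf is returned unchanged
    v4[0] = v4[1];
    v0[0] = 1.0f;
}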
#ifndef MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#define MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/vec.hpp>
namespace migraphx {
template <class T>
constexpr auto tensor_vec_size(T)
{
return vec_size<typename T::type>();
}
template <index_int N, class Shape>
constexpr auto as_vec_shape(Shape s)
{
auto lens = transform(s.lens, s.strides, [](auto len, auto stride) {
if(stride == 1)
return len / N;
else
return len;
});
auto strides = transform(s.strides, [](auto stride) {
if(stride == 1)
return stride;
return stride / N;
});
MIGRAPHX_ASSERT(make_shape(lens, strides).element_space() * N == s.element_space());
return make_shape(lens, strides);
}
template <index_int N, class T>
__device__ __host__ auto as_vec(T x)
{
if constexpr(N == 0)
return x;
else
return make_tensor_view(as_vec<N>(x.data()), as_vec_shape<N>(x.get_shape()));
}
template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis)
{
if constexpr(N == 0)
{
return x;
}
else
{
constexpr auto s = decltype(x.get_shape()){};
MIGRAPHX_ASSERT(s.strides[Axis{}] == 0);
return sequence(x.get_shape().lens.size(), [&](auto... is) {
auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
constexpr auto axis = Axis{};
if(j == axis)
return i / N;
else
return i;
});
return make_tensor_view(x.data(), make_shape(lens, s.strides));
});
}
}
template <class IntegralConstant, class T>
__device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
{
return as_vec<ic>(x);
}
template <class... Shapes>
constexpr index_int find_vector_axis(Shapes... ss)
{
index_int axis = 0;
bool b = false;
by([&](auto s) {
if(s.broadcasted() or b)
return;
auto it = find(s.strides.begin(), s.strides.end(), 1);
if(it == s.strides.end())
return;
axis = it - s.strides.begin();
b = true;
})(ss...);
return axis;
}
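// e.g. with strides {12, 4, 1} and a broadcasted shape: the broadcasted shape
// is skipped and the unit stride sits at position 2, so the axis is 2.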
template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis axis, Shapes... ss)
{
return (((ss.lens[axis] % N) == 0 and (ss.strides[axis] == 1 or ss.strides[axis] == 0)) and
...);
}
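// e.g. for N = 4: an axis of length 8 with stride 1 (contiguous) or stride 0
// (broadcast) qualifies; length 6 fails since 6 % 4 != 0.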
template <index_int N, class... Shapes>
constexpr bool is_vectorizable(Shapes... ss)
{
return (is_vectorizable<N>(find_vector_axis(ss), ss) and ...);
}
template <class P>
constexpr auto find_vectorize_size(P pred)
{
if constexpr(pred(_c<4>))
return _c<4>;
else if constexpr(pred(_c<2>))
return _c<2>;
else
return _c<0>;
}
template <class T>
__host__ __device__ auto vectorize(T x)
{
if constexpr(vec_size<T>() == 0)
{
constexpr auto n =
find_vectorize_size([&](auto i) { return _c<is_vectorizable<i>(x.get_shape())>; });
return as_vec<n>(x);
}
else
{
return x;
}
}
inline __device__ __host__ auto auto_vectorize()
{
return [](auto... xs) {
return [=](auto f) {
// TODO: Just check that there is a single axis with stride 1
constexpr bool packed_or_broadcasted =
((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
if constexpr(packed_or_broadcasted)
{
constexpr auto axis = find_vector_axis(xs.get_shape()...);
constexpr auto n = find_vectorize_size(
[&](auto i) { return _c<is_vectorizable<i>(axis, xs.get_shape()...)>; });
by(
[&](auto x) {
constexpr auto s = x.get_shape();
if constexpr(s.strides[axis] == 0)
return tensor_step<n>(x, axis);
else
return as_vec<n>(x);
},
f)(xs...);
}
else
{
f(xs...);
}
};
};
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
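A usage sketch for auto_vectorize: the body receives rewritten views when every input is packed or broadcasted, and the original views otherwise. vectorized_copy_example is hypothetical, and index/global_stride are assumed from index.hpp:
template <class T, class U>
__device__ void vectorized_copy_example(index idx, T y, U x)
{
    auto_vectorize()(y, x)([&](auto yv, auto xv) {
        // yv and xv are vec-typed views on stride-1 axes, stepped views on
        // broadcast (stride-0) axes, or the unchanged inputs as a fallback.
        idx.global_stride(yv.get_shape().elements(), [&](auto i) { yv[i] = xv[i]; });
    });
}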
#include <migraphx/run_loop.hpp>
#include <migraphx/gpu/loop.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/fill.hpp>
#include <unordered_map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_loop::compute_shape(std::vector<shape> inputs, std::vector<module_ref> mods) const
{
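// The lowered loop's inputs are {original inputs..., allocated copies...,
// cond_out, output} (built in add_loop_op in lowering.cpp), so only the
// first (size - 2) / 2 entries feed the wrapped op's shape computation.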
auto input_num = (inputs.size() - 2) / 2;
inputs.erase(inputs.begin() + input_num, inputs.end());
return op.compute_shape(inputs, std::move(mods));
}
struct gpu_loop
{
int64_t max_iterations = 0;
template <class T>
void copy(context& ctx, const argument& src, T& dst) const
{
argument arg_dst{src.get_shape(), &dst};
copy_from_gpu(ctx, src, arg_dst);
}
template <class T>
void copy(context& ctx, T src, const argument& dst) const
{
argument arg_src{dst.get_shape(), &src};
copy_to_gpu(ctx, arg_src, dst);
}
void append(const std::vector<argument>&, const std::vector<argument>&, int) const {}
void set_zero(context& ctx, const std::vector<argument>& concatenated_outputs, int iter) const
{
if(iter >= max_iterations)
return;
auto elem_num = max_iterations - iter;
for(const auto& out : concatenated_outputs)
{
auto s = out.get_shape();
auto size = s.bytes() / max_iterations;
auto lens = s.lens();
lens[0] = elem_num;
shape ss{s.type(), lens};
assert(ss.bytes() + iter * size <= out.get_shape().bytes());
device::fill(ctx.get_stream().get(), argument(ss, out.data() + iter * size), 0);
}
}
std::unordered_map<std::string, int> get_output_params(const module& m) const
{
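// Parameters whose names embed "#output_N" map to loop output index N; a
// hypothetical parameter named "x#output_2" yields entry {"x#output_2", 2},
// and names without the marker are skipped.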
auto get_output_index = [](const std::string& name) {
std::string out_prefix = "#output_";
auto loc = name.find(out_prefix);
if(loc != std::string::npos)
{
int index = std::stoi(name.substr(loc + out_prefix.size()));
return index;
}
return -1;
};
const auto& param_names = m.get_parameter_names();
std::unordered_map<std::string, int> result;
for(const auto& name : param_names)
{
auto index = get_output_index(name);
if(index == -1)
continue;
result[name] = index;
}
return result;
}
};
argument
hip_loop::compute(context& ctx,
const shape&,
const std::vector<argument>& args,
const std::vector<module_ref>& mods,
const std::function<std::vector<argument>(
module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
return run_loop(gpu_loop{op.max_iterations}, ctx, args, mods, run);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <iterator>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
@@ -37,6 +38,7 @@
#include <migraphx/gpu/quant_convolution.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/unary_not.hpp>
#include <migraphx/gpu/where.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <utility>
@@ -55,7 +57,8 @@ struct miopen_apply
std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
instruction_ref last{};
std::unordered_map<instruction_ref, std::string> prog_output_names{};
bool offload_copy = false;
bool offload_copy = false;
bool int8_x4_format = true;
context& get_context() const
{
@@ -97,6 +100,13 @@ struct miopen_apply
assert(mod != nullptr);
assert(pass != nullptr);
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
auto& ctx = get_context();
rocblas_gemm_flags flag;
rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
#endif
offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
create_output_names();
@@ -141,6 +151,7 @@ struct miopen_apply
add_generic_op("sub");
add_generic_op("tan");
add_generic_op("tanh");
add_generic_op("where");
add_extend_op("abs");
add_extend_op("argmax");
@@ -153,26 +164,33 @@ struct miopen_apply
add_extend_op("leaky_relu");
add_extend_op("logsoftmax");
add_extend_op("lrn");
add_extend_op("multinomial");
add_extend_op("nonzero");
add_extend_op("pad");
add_extend_op("pooling");
add_extend_op("prefix_scan_sum");
add_extend_op("reduce_max");
add_extend_op("reduce_mean");
add_extend_op("reduce_min");
add_extend_op("reduce_prod");
add_extend_op("reduce_sum");
add_extend_op("reverse");
add_extend_op("rnn_var_sl_last_output");
add_extend_op("rnn_var_sl_shift_output");
add_extend_op("rnn_var_sl_shift_sequence");
add_extend_op("scatter");
add_extend_op("softmax");
add_extend_op("topk");
add_gemm_op<op::dot>("dot");
add_gemm_op<op::quant_dot>("quant_dot");
add_batch_norm_inference_op();
add_convolution_op();
add_deconvolution_op();
add_quant_convolution_op();
add_batch_norm_inference_op();
add_neg_op();
add_gemm_op<op::dot>("dot");
add_gemm_op<op::quant_dot>("quant_dot");
add_if_op();
add_loop_op();
add_neg_op();
add_quant_convolution_op();
}
void copy_params()
@@ -185,6 +203,10 @@ struct miopen_apply
if(ins->name() != "@param")
continue;
// parameter has no outputs, so no need to insert a copy to gpu
if(ins->outputs().empty())
continue;
auto pos = std::next(ins);
auto a = insert_allocation(pos, ins->get_shape());
auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
@@ -283,17 +305,14 @@ struct miopen_apply
});
}
template <class Op>
void add_gemm_op(std::string name)
template <typename Op>
void add_gemm_op(const std::string& name)
{
apply_map.emplace(name, [=](instruction_ref ins) {
auto&& op = any_cast<Op>(ins->get_operator());
auto beta = op.beta;
std::vector<instruction_ref> refs = ins->inputs();
if(refs.size() == 2)
{
auto output = insert_allocation(ins, ins->get_shape());
beta = 0;
refs.push_back(output);
}
else
@@ -312,8 +331,8 @@ struct miopen_apply
refs.push_back(refs.back());
}
}
return mod->replace_instruction(ins, rocblas_gemm<Op>{Op{op.alpha, beta}}, refs);
return mod->replace_instruction(
ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format}, refs);
});
}
@@ -410,7 +429,7 @@ struct miopen_apply
});
}
// replace the if operator with gpu_if operator
// add input and output arguments for the if operator
void add_if_op()
{
apply_map.emplace("if", [=](instruction_ref ins) {
@@ -449,9 +468,48 @@ struct miopen_apply
return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args);
});
}
// replace the loop operator with gpu_loop operator
void add_loop_op()
{
apply_map.emplace("loop", [=](instruction_ref ins) {
std::vector<instruction_ref> inputs = ins->inputs();
// copy max_iter from gpu to cpu
auto cpu_max_iter =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0));
auto cpu_cond =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1));
auto synced_max_iter =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond);
inputs.at(0) = synced_max_iter;
inputs.at(1) = cpu_cond;
auto copy_inputs = inputs;
std::transform(
copy_inputs.begin(), copy_inputs.end(), std::back_inserter(inputs), [&](auto in) {
return mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(in->get_shape())}}));
});
auto mod_args = ins->module_inputs();
auto output = insert_allocation(ins, ins->get_shape());
const auto* sub_mod = mod_args.front();
auto cond_out = mod->insert_instruction(
ins,
make_op("hip::allocate",
{{"shape", to_value(sub_mod->get_output_shapes().front())}}));
// add cond and mod outputs to the argument list
inputs.push_back(cond_out);
inputs.push_back(output);
return mod->replace_instruction(
ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
});
}
};
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/multinomial.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
#include <migraphx/check_shapes.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_multinomial::compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).only_dims(2).standard();
inputs.pop_back();
return op.compute_shape(inputs);
}
argument
hip_multinomial::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::multinomial(ctx.get_stream().get(), args.back(), args.front(), args[1]);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/nonzero.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/nonzero.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_nonzero::compute_shape(std::vector<shape> inputs) const
{
return op.compute_shape({inputs.front()});
}
argument hip_nonzero::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return device::nonzero(ctx.get_stream().get(), args.back(), args.front());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <iterator>
#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/int8_gemm_pack.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/program.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/permutation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void pack_int8_args::apply(module& p) const
static instruction_ref pad_ins(module& m, instruction_ref ins, int offset)
{
for(auto ins : iterator_for(p))
auto s = ins->get_shape();
auto lens = s.lens();
auto k = lens[lens.size() + offset];
auto pad_k = (k + 3) / 4 * 4;
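// e.g. k = 10 gives pad_k = (10 + 3) / 4 * 4 = 12; a k already divisible by 4 is unchanged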
auto pad_lens = lens;
pad_lens[lens.size() + offset] = pad_k;
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
auto ret_ins = ins;
if(pad_k != k)
{
pad_dims[lens.size() + offset] = pad_k - k;
shape ps{s.type(), pad_lens};
auto ins_out =
m.insert_instruction(ins, make_op("hip::allocate", {{"shape", to_value(ps)}}));
auto pad = make_op("pad", {{"pads", pad_dims}});
ret_ins =
m.insert_instruction(std::next(ins), make_op("gpu::pad", pad.to_value()), ins, ins_out);
}
return ret_ins;
}
static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
{
std::vector<instruction_ref> ret_inputs;
auto inputs = ins->inputs();
auto in0 = inputs.at(0);
auto sa = in0->get_shape();
bool transa = sa.transposed();
if(transa)
{
auto perm = find_permutation(sa);
auto val = in0->get_operator().to_value();
if(val.contains("dims"))
{
int offset = static_cast<int>(perm.back()) - static_cast<int>(perm.size());
auto t_in = in0->inputs().front();
auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>();
auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in);
}
else
{
shape cs{in0->get_shape().type(), in0->get_shape().lens()};
auto con_out =
m.insert_instruction(ins, make_op("hip::allocate", {{"shape", to_value(cs)}}));
auto cin0 = m.insert_instruction(ins, make_op("gpu::contiguous"), in0, con_out);
ret_inputs.push_back(pad_ins(m, cin0, -1));
}
}
else
{
ret_inputs.push_back(pad_ins(m, in0, -1));
}
auto in1 = inputs.at(1);
auto sb = in1->get_shape();
bool transb = sb.transposed();
if(transb)
{
auto perm = find_permutation(sb);
auto val = in1->get_operator().to_value();
if(val.contains("dims"))
{
int offset = static_cast<int>(perm[perm.size() - 2]) - static_cast<int>(perm.size());
auto t_in = in1->inputs().front();
auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>();
auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in);
}
else
{
shape cs{in1->get_shape().type(), in1->get_shape().lens()};
auto con_out =
m.insert_instruction(ins, make_op("hip::allocate", {{"shape", to_value(cs)}}));
auto cin1 = m.insert_instruction(ins, make_op("gpu::contiguous"), in1, con_out);
ret_inputs.push_back(pad_ins(m, cin1, -2));
}
}
else
{
ret_inputs.push_back(pad_ins(m, in1, -2));
}
std::copy(inputs.begin() + 2, inputs.end(), std::back_inserter(ret_inputs));
return ret_inputs;
}
void pack_int8_args::apply(module& m) const
{
for(auto ins : iterator_for(m))
{
if(ins->name() == "gpu::quant_gemm")
{
auto val = ins->get_operator().to_value();
assert(val.contains("int8_x4_format"));
if(not val.at("int8_x4_format").to<bool>())
{
return;
}
auto inputs = ins->inputs();
auto lens = inputs.at(0)->get_shape().lens();
// gemm needs k to be a multiple of 4, so pad that dimension
auto old_inputs = inputs;
if((lens.back() % 4) != 0)
{
inputs = pad_inputs(m, ins);
}
bool transa = inputs[0]->get_shape().transposed();
bool transb = inputs[1]->get_shape().transposed();
if(!transb)
{
auto packed_b = p.insert_instruction(ins, hip_allocate{inputs[1]->get_shape()});
auto output_b =
p.insert_instruction(ins, hip_int8_gemm_pack_a{}, {inputs[1], packed_b});
instruction::replace_argument(ins, inputs[1], output_b);
auto packed_b = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(inputs[1]->get_shape())}}));
auto output_b = m.insert_instruction(
ins, make_op("gpu::int8_gemm_pack_a"), {inputs[1], packed_b});
inputs[1] = output_b;
}
if(transa)
{
auto packed_a = p.insert_instruction(ins, hip_allocate{inputs[0]->get_shape()});
auto output_a =
p.insert_instruction(ins, hip_int8_gemm_pack_b{}, {inputs[0], packed_a});
instruction::replace_argument(ins, inputs[0], output_a);
auto packed_a = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(inputs[0]->get_shape())}}));
auto output_a = m.insert_instruction(
ins, make_op("gpu::int8_gemm_pack_b"), {inputs[0], packed_a});
inputs[0] = output_a;
}
if(inputs != old_inputs)
{
m.replace_instruction(ins, ins->get_operator(), inputs);
}
}
else if(ins->name() == "gpu::quant_convolution")
{
auto inputs = ins->inputs();
auto packed_x =
p.insert_instruction(ins, hip_allocate{pack_int8_shape(inputs[0]->get_shape())});
auto inputs = ins->inputs();
auto packed_x = m.insert_instruction(
ins,
make_op("hip::allocate",
{{"shape", to_value(pack_int8_shape(inputs[0]->get_shape()))}}));
auto output_x =
p.insert_instruction(ins, miopen_int8_conv_pack{}, {inputs[0], packed_x});
m.insert_instruction(ins, make_op("gpu::int8_conv_pack"), {inputs[0], packed_x});
instruction::replace_argument(ins, inputs[0], output_x);
auto packed_w =
p.insert_instruction(ins, hip_allocate{pack_int8_shape(inputs[1]->get_shape())});
auto packed_w = m.insert_instruction(
ins,
make_op("hip::allocate",
{{"shape", to_value(pack_int8_shape(inputs[1]->get_shape()))}}));
auto output_w =
p.insert_instruction(ins, miopen_int8_conv_pack{}, {inputs[1], packed_w});
m.insert_instruction(ins, make_op("gpu::int8_conv_pack"), {inputs[1], packed_w});
instruction::replace_argument(ins, inputs[1], output_w);
}
}
......
@@ -10,7 +10,7 @@ shape miopen_pooling::compute_shape(const std::vector<shape>& inputs) const
check_shapes{inputs, *this}.has(2).standard();
std::vector<shape> pooling_input = {inputs.at(0)};
check_shapes{pooling_input, *this}.max_ndims(5);
return op.compute_shape(pooling_input);
return op.normalize_compute_shape(pooling_input);
}
inline void reshape_if_1d(shape& input)
......
@@ -10,7 +10,7 @@ namespace gpu {
shape miopen_quant_convolution::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(4).standard();
return op.compute_shape({inputs.at(0), inputs.at(1)});
return op.normalize_compute_shape({inputs.at(0), inputs.at(1)});
}
argument miopen_quant_convolution::compute(context& ctx,
const shape& output_shape,
......
#include <migraphx/gpu/reverse.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/reverse.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_reverse::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.normalize_compute_shape(inputs);
}
argument hip_reverse::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return device::reverse(ctx.get_stream().get(), args.back(), args[0], op.axes);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/scatter.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/scatter.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_scatter::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.normalize_compute_shape(inputs);
}
argument hip_scatter::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return device::scatter(ctx.get_stream().get(), args.back(), args[0], args[1], args[2], op.axis);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -2,7 +2,6 @@
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/check_context.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/decompose.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
@@ -10,16 +9,20 @@
#include <migraphx/eliminate_data_type.hpp>
#include <migraphx/eliminate_identity.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/inline_module.hpp>
#include <migraphx/insert_pad.hpp>
#include <migraphx/memory_coloring.hpp>
#include <migraphx/normalize_ops.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/remap.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp>
@@ -29,7 +32,6 @@
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/mlir_conv.hpp>
#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/preallocate_param.hpp>
#include <migraphx/gpu/schedule_model.hpp>
#include <migraphx/gpu/sync_device.hpp>
#include <migraphx/gpu/target.hpp>
@@ -50,21 +52,27 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
unsupported_types.erase(shape::type_t::bool_type);
unsupported_types.erase(shape::type_t::int8_type);
unsupported_types.erase(shape::type_t::uint8_type);
unsupported_types.erase(shape::type_t::tuple_type);
// clang-format off
return
{
normalize_ops{},
decompose{},
dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{},
dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type},
simplify_reshapes{},
eliminate_identity{},
eliminate_pad{},
dead_code_elimination{},
insert_pad{},
dead_code_elimination{},
rewrite_batchnorm{},
dead_code_elimination{},
rewrite_rnn{},
dead_code_elimination{},
inline_module{},
rewrite_pooling{},
dead_code_elimination{},
eliminate_common_subexpression{},
@@ -82,17 +90,17 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
eliminate_concat{concat_gpu_optimization{}},
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
pack_int8_args{},
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math},
dead_code_elimination{},
write_literals{&ctx},
schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})},
memory_coloring{"hip::allocate"},
sync_device{},
preallocate_param{"scratch", &ctx},
preallocate_param{"scratch", gpu_allocation_model{}},
dead_code_elimination{},
eliminate_workspace{},
eliminate_allocation{"hip::allocate"},
......
#include <migraphx/gpu/topk.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/topk.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_topk::compute_shape(std::vector<shape> inputs) const
{
return op.normalize_compute_shape({inputs.front()});
}
argument hip_topk::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
auto outputs = args.back().get_sub_objects();
return op.largest ? device::topk_largest(ctx.get_stream().get(),
outputs.front(),
outputs.back(),
args[0],
op.k,
op.axis)
: device::topk_smallest(ctx.get_stream().get(),
outputs.front(),
outputs.back(),
args[0],
op.k,
op.axis);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -10,10 +10,10 @@
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/if_op.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
#include <migraphx/op/logsoftmax.hpp>
#include <migraphx/op/loop.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
@@ -205,7 +205,10 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
}
std::string name() const { return "ref::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
shape compute_shape(const std::vector<shape>& inputs) const
{
return op.normalize_compute_shape(inputs);
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
@@ -266,99 +269,6 @@ }
}
};
template <class Op>
struct ref_deconvolution : auto_register_op<ref_deconvolution<Op>>
{
ref_deconvolution() = default;
ref_deconvolution(Op pop) : op(std::move(pop)) {}
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "ref::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
using type = typename decltype(output)::value_type;
std::fill(output.begin(), output.end(), type{0});
auto in_lens = input.get_shape().lens();
auto in_n = in_lens[0];
auto in_c = in_lens[1];
auto wei = weights.get_shape().lens();
auto wei_n = wei[0];
auto wei_c = wei[1];
auto out_lens = output_shape.lens();
auto kdims = op.kdims();
std::vector<std::size_t> win_size{in_c};
std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
shape win_shape{output_shape.type(), win_size};
par_dfor(in_n, wei_c)([&](int o, int k) {
shape_for_each(win_shape, [&](auto idx_win) {
const int w = idx_win[0];
auto input_dims_start = idx_win.begin() + 1;
auto wei_dims_start = idx_win.begin() + kdims + 1;
std::vector<std::ptrdiff_t> win_start;
for(std::size_t n = 0; n < kdims; ++n)
{
win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * op.stride[n]) -
std::ptrdiff_t(op.padding[n]));
}
const int group_id = w / (wei_n / op.group);
const int in_ch = group_id * wei_c + k;
std::vector<std::ptrdiff_t> idx_out{o, in_ch};
for(size_t n = 0; n < kdims; n++)
{
idx_out.push_back(win_start[n] + *(wei_dims_start + n) * op.dilation[n]);
}
std::vector<std::ptrdiff_t> idx_wei{w, k};
std::copy(wei_dims_start, idx_win.end(), std::back_inserter(idx_wei));
std::vector<std::ptrdiff_t> idx_in{o, w};
std::copy(input_dims_start, wei_dims_start, std::back_inserter(idx_in));
if(std::all_of(
idx_out.begin() + 2, idx_out.end(), [&](auto ii) { return ii >= 0; }) and
std::equal(idx_out.begin() + 2,
idx_out.end(),
out_lens.begin() + 2,
out_lens.end(),
std::less<std::ptrdiff_t>{}))
{
output(idx_out.begin(), idx_out.end()) +=
input(idx_in.begin(), idx_in.end()) *
weights(idx_wei.begin(), idx_wei.end());
}
});
});
});
return result;
}
};
struct ref_im2col
{
op::im2col op;
@@ -370,7 +280,10 @@ struct ref_im2col
}
static std::string name() { return "ref::im2col"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
shape compute_shape(const std::vector<shape>& inputs) const
{
return op.normalize_compute_shape(inputs);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
@@ -471,7 +384,10 @@ struct ref_pooling : auto_register_op<ref_pooling<Op>>
}
std::string name() const { return "ref::pooling_" + Op::name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
shape compute_shape(const std::vector<shape>& inputs) const
{
return op.normalize_compute_shape(inputs);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
@@ -602,42 +518,12 @@ struct ref_gemm
return migraphx::reflect(self.op, f);
}
std::string name() const { return "ref::dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.size() == 3)
{
auto c_shape = inputs.at(2);
check_shapes{{c_shape}, *this}.not_broadcasted();
}
return op.compute_shape(inputs);
}
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// 3 inputs: computes alpha * A * B + beta * C, where
// A and B are matrices, and C is of the same shape as A * B
if(args.size() == 3)
{
// no need to consider the value of args[2]
if(op.beta == 0.0f)
{
result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
}
else
{
visit_all(result, args[2])([&](auto output, auto input) {
std::copy(input.begin(), input.end(), output.begin());
});
}
migemm(result, args[0], args[1], op.alpha, op.beta);
return result;
}
// 2 input arguments
migemm(result, args[0], args[1], op.alpha, 0.0f);
migemm(result, args[0], args[1], 1.0f, 0.0f);
return result;
}
@@ -655,22 +541,11 @@ struct ref_quant_gemm
}
std::string name() const { return "ref::quant_dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.size() == 3)
{
auto c_shape = inputs.at(2);
check_shapes{{c_shape}, *this}.not_broadcasted();
}
return op.compute_shape(inputs);
}
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// 3 inputs: computes alpha * A * B + beta * C, where
// A and B are matrices, and C is of the same shape as A * B
// first, convert the args[0] and args[1] from int8_t to int32_t
argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
@@ -684,27 +559,7 @@ struct ref_quant_gemm
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
if(args.size() == 3)
{
// no need to consider the value of args[2]
if(op.beta == 0)
{
result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
}
else
{
visit_all(result, args[2])([&](auto output, auto input) {
std::copy(input.begin(), input.end(), output.begin());
});
}
migemm(result, arg_0, arg_1, op.alpha, op.beta);
return result;
}
// 2 input arguments
migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
migemm(result, arg_0, arg_1, int32_t{1}, int32_t{0});
return result;
}
@@ -908,10 +763,8 @@ struct ref_apply
apply_map["batch_norm_inference"] =
extend_op<ref_batch_norm_inference, op::batch_norm_inference>();
apply_map["convolution"] = extend_op<ref_convolution<op::convolution>, op::convolution>();
apply_map["deconvolution"] =
extend_op<ref_deconvolution<op::deconvolution>, op::deconvolution>();
apply_map["dot"] = extend_op<ref_gemm, op::dot>();
apply_map["quant_dot"] = extend_op<ref_quant_gemm, op::quant_dot>();
apply_map["dot"] = extend_op<ref_gemm, op::dot>();
apply_map["quant_dot"] = extend_op<ref_quant_gemm, op::quant_dot>();
apply_map["quant_convolution"] =
extend_op<ref_convolution<op::quant_convolution>, op::quant_convolution>();
apply_map["elu"] = extend_op<ref_unary<elu_op>, op::elu>();
......
@@ -5,6 +5,8 @@
#include <migraphx/pass.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/insert_pad.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/normalize_ops.hpp>
@@ -18,6 +20,10 @@ std::string target::name() const { return "ref"; }
std::vector<pass> target::get_passes(migraphx::context&, const compile_options&) const
{
return {normalize_ops{},
eliminate_pad{},
dead_code_elimination{},
insert_pad{},
dead_code_elimination{},
rewrite_rnn{},
dead_code_elimination{},
auto_contiguous{},
......