Commit cd4ab535 authored by Khalique Ahmed's avatar Khalique Ahmed
Browse files

manual merge

parents 3891ee58 a0fa3742
......@@ -32,8 +32,17 @@
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
[](auto&&... private_lisft_xs) MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lisft_xs)>(private_lisft_xs)...))
[](auto&&... private_lifts_xs) MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lifts_xs)>(private_lifts_xs)...))
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT_CLASS(name, ...) \
struct name \
{ \
template <class... PrivateLiftTs> \
constexpr auto operator()(PrivateLiftTs&&... private_lifts_xs) const MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lifts_xs)>(private_lifts_xs)...)) \
}
namespace migraphx {
......@@ -195,6 +204,14 @@ constexpr auto compose(Fs... fs)
})(fs...);
}
// Returns a partially-applicable version of f: partial(f)(xs...) yields a
// callable that invokes f with the bound arguments followed by any further
// arguments, i.e. partial(f)(xs...)(ys...) == f(xs..., ys...).
template <class F>
constexpr auto partial(F f)
{
    return [=](auto... captured) {
        return [=](auto&&... rest) {
            return f(captured..., static_cast<decltype(rest)>(rest)...);
        };
    };
}
template <class... Ts>
constexpr auto pack(Ts... xs)
{
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
#define MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/index.hpp>
namespace migraphx {
template <class Tensor>
constexpr auto gemm_get_batches()
{
constexpr auto lens = get_shape_c<Tensor>{}.lens;
constexpr auto strides = get_shape_c<Tensor>{}.strides;
constexpr auto new_lens = sequence(
lens.size() - _c<2>, [&](auto... is) { return make_const_array(_c<lens[is]>...); });
constexpr auto new_strides = sequence(
strides.size() - _c<2>, [&](auto... is) { return make_const_array(_c<strides[is]>...); });
return make_shape(new_lens, new_strides);
}
// Extracts the trailing 2-D matrix shape (the last two dimensions, with
// their strides) of a gemm operand.
template <class Tensor>
constexpr auto gemm_get_matrix()
{
    constexpr auto s       = get_shape_c<Tensor>{};
    constexpr auto row_dim = s.lens.size() - _c<2>;
    constexpr auto col_dim = s.lens.size() - _c<1>;
    return make_shape(make_const_array(_c<s.lens[row_dim]>, _c<s.lens[col_dim]>),
                      make_const_array(_c<s.strides[row_dim]>, _c<s.strides[col_dim]>));
}
// Returns a tensor_view of the i-th 2-D matrix within a batched tensor.
template <class Tensor, class T>
constexpr auto gemm_batch_slice(Tensor t, T i)
{
    constexpr auto batch_shape  = gemm_get_batches<Tensor>();
    constexpr auto matrix_shape = gemm_get_matrix<Tensor>();
    // The selected matrix must lie entirely within the tensor's storage
    MIGRAPHX_ASSERT((batch_shape.index(i) + matrix_shape.element_space()) <=
                    t.get_shape().element_space());
    return make_tensor_view(t.data() + batch_shape.index(i), matrix_shape);
}
// Returns a callable that invokes f once per batch with each input tensor
// sliced down to its 2-D matrix for that batch. bpb is the number of
// workgroups assigned to each batch; for rank-2 inputs (no batch dims) f is
// called once with the tensors unchanged.
template <class BlocksPerBatch, class T, class... Ts>
constexpr auto gemm_batch_args(index idx, BlocksPerBatch bpb, T x, Ts... xs)
{
    return [=](auto f) {
        // All tensors should have the same rank
        static_assert(
            (true and ... and (get_shape_c<T>{}.lens.size() == get_shape_c<Ts>{}.lens.size())));
        if constexpr(get_shape_c<T>{}.lens.size() > 2)
        {
            // Get the first batch since all batches should have the same number of elements
            constexpr auto batch = gemm_get_batches<T>();
            static_assert(
                (true and ... and (batch.elements() == gemm_get_batches<Ts>().elements())));
            // Stride over all (batches x bpb) groups; each group handles one batch slice
            idx.group_stride(bpb * batch.elements(), [&](auto gidx) {
                const auto batch_idx = gidx / bpb;
                f(gemm_batch_slice(x, batch_idx), gemm_batch_slice(xs, batch_idx)...);
            });
        }
        else
        {
            f(x, xs...);
        }
    };
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
......@@ -28,10 +28,6 @@
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/math_functions.h>
#include <hip/hip_math_constants.h>
#elif defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS)
#include <hip/hip_common.h>
#include <hip/hip_math_constants.h>
#endif
#endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
......@@ -130,6 +130,8 @@ struct index
return blockDim.x;
}
#endif
// Number of workgroups in the launch (total global threads / threads per group)
constexpr auto ngroup() const { return nglobal() / max_nlocal(); }
template <class N, class Stride>
static constexpr auto max_stride_iterations(N n, Stride stride)
{
......@@ -231,8 +233,20 @@ struct index
{
for_stride<true>(local, n, nlocal(), f);
}
// Iterates f over n items using the workgroup id as the starting offset and
// the total number of workgroups as the stride (one item set per group).
template <class F, class N>
__device__ void group_stride(N n, F f) const
{
    for_stride<false>(group, n, ngroup(), f);
}
};
#ifdef MIGRAPHX_NLOCAL
#define MIGRAPHX_GLOBAL \
__global__ __attribute__((amdgpu_flat_work_group_size(MIGRAPHX_NLOCAL, MIGRAPHX_NLOCAL)))
#else
#define MIGRAPHX_GLOBAL __global__
#endif
inline __device__ __attribute__((const)) index make_index()
{
return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
......
......@@ -138,7 +138,7 @@ MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, floor, ::hfloor)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, isnan, ::__hisnan)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, log, ::hlog)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, rsqrt, ::hrsqrt)
// MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sin, ::hsin)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sin, ::hsin)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sqrt, ::hsqrt)
// Use float to compute half overload
......@@ -161,8 +161,7 @@ MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
// Map math functions to hip half2 functions
// The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
// packed into a 32-bit number. See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
// Most but not all of these math ops have operators of the same names. Ones not yet implemented
// at this time are: exp2, exp10, log2, log10, isinf
// Most but not all of these math ops have operators of the same names.
MIGRAPHX_DEVICE_MATH_HALF2(abs, ::__habs2)
MIGRAPHX_DEVICE_MATH_HALF2(ceil, ::h2ceil)
MIGRAPHX_DEVICE_MATH_HALF2(cos, ::h2cos)
......@@ -176,7 +175,7 @@ MIGRAPHX_DEVICE_MATH_HALF2(log, ::h2log)
MIGRAPHX_DEVICE_MATH_HALF2(log10, ::h2log10)
MIGRAPHX_DEVICE_MATH_HALF2(log2, ::h2log2)
MIGRAPHX_DEVICE_MATH_HALF2(rsqrt, ::h2rsqrt)
// MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
MIGRAPHX_DEVICE_MATH_HALF2(sqrt, ::h2sqrt)
template <class T, class U>
......@@ -189,9 +188,8 @@ MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, max, ::max)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, min, ::min)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, max, ::max)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, min, ::min)
// Add overloads for half that calls the float version
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::fmaxf)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::fminf)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::__hmax)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::__hmin)
template <class T, MIGRAPHX_REQUIRES(not is_any_vec<T>())>
constexpr auto max(const T& a, const T& b)
......@@ -217,14 +215,6 @@ constexpr auto min(const T& a, const U& b)
return min<common_type_t<T, U>>(a, b);
}
// Sin for half is broken on hip, so use cos instead
template <class T, MIGRAPHX_REQUIRES(is_same<vec_type<T>, half>{})>
constexpr T sin(T x)
{
constexpr const T shift = HIP_PIO2_F;
return migraphx::cos(shift - x);
}
MIGRAPHX_DEVICE_MATH_VEC(abs)
MIGRAPHX_DEVICE_MATH_VEC(acos)
MIGRAPHX_DEVICE_MATH_VEC(acosh)
......
......@@ -244,13 +244,13 @@ __device__ void print_once(Ts... xs)
template <class... Ts>
__device__ void println(Ts... xs)
{
print_each(&coutln, xs...);
print_each(&cout, xs..., '\n');
}
template <class... Ts>
__device__ void println_once(Ts... xs)
{
print_each_once(&coutln, xs...);
print_each_once(&cout, xs..., '\n');
}
} // namespace migraphx
......
......@@ -79,20 +79,21 @@ __device__ void dpp_reduce(T& in, Op op)
#endif
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE(op, prefix) \
#define MIGRAPHX_DPP_REDUCE(op, prefix, sign) \
__device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); } \
__device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); } \
__device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); } \
__device__ inline void dpp_reduce(int32_t& x, op) \
{ \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##sign##32); \
} \
__device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }
MIGRAPHX_DPP_REDUCE(op::sum, v_add)
MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul)
// Note: when max and min operate on int32_t values, the signed variant of the instruction must be used.
MIGRAPHX_DPP_REDUCE(op::sum, v_add, _u)
MIGRAPHX_DPP_REDUCE(op::product, v_mul, _u)
MIGRAPHX_DPP_REDUCE(op::max, v_max, _i)
MIGRAPHX_DPP_REDUCE(op::min, v_min, _i)
template <class Op, class T, class Index, class F>
__device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
......@@ -174,6 +175,25 @@ struct inner_storage_tag
template <class T>
using is_inner_storage = is_base_of<inner_storage_tag, remove_cv_t<remove_reference_t<T>>>;
// Inner storage whose elements are computed on demand by invoking f(j, d)
// rather than being materialized up front. Size is a compile-time size type
// returned by rsize().
template <class Size, class F>
struct lazy_inner_storage : inner_storage_tag
{
    // Element type produced by the generator function
    using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
    F f;
    // Compile-time element count of the storage
    constexpr Size rsize() const { return {}; }
    // Lazily compute the element at position (j, d)
    template <class U, class V>
    constexpr auto operator()(U j, V d) const
    {
        return f(j, d);
    }
};
// Factory that deduces the template parameters when constructing a
// lazy_inner_storage from a size tag and a generator function.
template <class Size, class F>
constexpr lazy_inner_storage<Size, F> make_lazy_inner_storage(Size, F f)
{
    return {{}, f};
}
template <class R, class F>
struct storage_access : F
{
......@@ -278,6 +298,14 @@ struct reducer_base
});
}
// Like inner(), but defers computation: f is applied per element when the
// resulting storage is consumed instead of being evaluated eagerly.
template <class F>
__device__ auto lazy_inner(F f) const
{
    return this->inner_sliced([=](auto n, auto&&... xs) {
        return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
    });
}
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
......@@ -396,25 +424,6 @@ struct block_large
index idx;
Slicer slice;
template <class Size, class F>
struct inner_storage : inner_storage_tag
{
using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
F f;
constexpr Size rsize() const { return {}; }
template <class U, class V>
constexpr auto operator()(U j, V d) const
{
return f(j, d);
}
};
template <class Size, class F>
static constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
{
return {{}, {f}};
}
template <class Op, class T, class Read, class N, class... Ts>
__device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const
{
......@@ -439,7 +448,7 @@ struct block_large
template <class R, class F, class N, class... Ts>
__device__ auto inner_impl(F f, N n, Ts&&... xs) const
{
return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
}
};
......@@ -469,25 +478,6 @@ struct lane
index idx;
Slicer slice;
template <class Size, class F>
struct inner_storage : inner_storage_tag
{
using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
F f;
constexpr Size rsize() const { return {}; }
template <class U, class V>
constexpr auto operator()(U j, V d) const
{
return f(j, d);
}
};
template <class Size, class F>
static constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
{
return {{}, {f}};
}
template <class Op, class T, class Read, class N, class U, class... Us>
__device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const
{
......@@ -518,7 +508,7 @@ struct lane
template <class R, class F, class N, class... Ts>
__device__ auto inner_impl(F f, N n, Ts&&... xs) const
{
return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
}
};
template <class Slicer>
......@@ -577,5 +567,21 @@ simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOu
});
}
// Driver for a fused reduction kernel: runs the reduction algorithm Algo over
// the Reduced shape and writes f's result for each output index. If f yields
// inner storage (element-wise values), every element is copied to the output;
// otherwise the scalar result is written once per output index.
template <class Algo, class Reduced, class Output, class F>
__device__ void fused_reduce(Output output, F f)
{
    Algo::template run<Reduced>([&](auto out_idx, auto r) {
        auto result = f(r, out_idx);
        if constexpr(reduce::is_inner_storage<decltype(result)>{})
        {
            // Element-wise result: assign each computed element into output
            r.inner([&](auto& y, auto x) { y = x; })(output, result);
        }
        else
        {
            // Scalar result: write the single reduced value for this index
            r.outer([&] { output[out_idx] = implicit_conversion(result); });
        }
    });
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
......@@ -218,7 +218,15 @@ using common_type_t = typename common_type<Ts...>::type;
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
constexpr unsigned long int_max(unsigned long n) { return (1u << (n * 8)) - 1; }
// Returns the largest value representable by an unsigned integer of n bytes.
// Left-shifting a 64-bit value by 64 bits is undefined behavior, so the
// 8-byte case is handled by converting -1 (all bits set) instead of shifting.
constexpr unsigned long int_max(unsigned long n)
{
    return n == sizeof(int64_t) ? static_cast<unsigned long>(-1) : (1ul << (n * 8)) - 1;
}
template <class T,
MIGRAPHX_REQUIRES(is_integral<T>{} or is_floating_point<T>{} or
......@@ -228,9 +236,9 @@ constexpr T numeric_max()
if constexpr(is_integral<T>{})
{
if constexpr(is_unsigned<T>{})
return int_max(sizeof(T)) * 2;
else
return int_max(sizeof(T));
else
return int_max(sizeof(T)) / 2;
}
else if constexpr(is_same<T, double>{})
return __DBL_MAX__;
......
......@@ -135,7 +135,7 @@ constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
return vec<T, N>{x};
else
{
MIGRAPHX_ASSERT((i + N) < vec_size<T>());
MIGRAPHX_ASSERT((i + N) <= vec_size<T>());
vec<vec_type<T>, N> result = {0};
for(int j = 0; j < N; j++)
{
......
......@@ -83,7 +83,8 @@ struct miopen_apply
auto& ctx = get_context();
int8_x4_format = get_int8_x4_format(ctx);
compute_fp32 = get_compute_fp32_flag();
offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
// TODO: Set Offload copy based on root modules' compile options
offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
add_generic_op("contiguous");
......
......@@ -30,6 +30,7 @@
#include <mlir-c/BuiltinTypes.h>
#include <mlir-c/Diagnostics.h>
#include <mlir-c/Dialect/MIGraphX.h>
#include <mlir-c/Dialect/Rock.h>
#include <mlir-c/IntegerSet.h>
#include <mlir-c/Pass.h>
#include <mutex>
......@@ -55,12 +56,16 @@
#include <migraphx/permutation.hpp>
#include <deque>
#include <variant>
#include <fstream>
#include <sstream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MLIR);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_DB);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_CFG);
#ifdef MIGRAPHX_MLIR
template <class T, class F, F f> // NOLINT
......@@ -124,6 +129,8 @@ using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags,
using mlir_region = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRegion, mlirRegionDestroy);
using mlir_block = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockDestroy);
using mlir_pass_manager = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy);
using mlir_tuning_table = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningTable,
mlirRockTuningTableDestroy);
std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; }
......@@ -157,10 +164,10 @@ std::string mlir_print(F f, T x)
return ss.str();
}
const std::unordered_set<std::string>& get_xdlops_archs()
bool has_xdlops(const std::string& target_arch)
{
static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
return supported_archs;
const auto device_name = trim(split_string(target_arch, ':').front());
return (starts_with(device_name, "gfx9") and device_name >= "gfx908");
}
struct mlir_program
......@@ -190,10 +197,14 @@ struct mlir_program
result = mlirF64TypeGet(ctx.get());
else if(as.is_integral())
{
if(as.is_signed())
result = mlirIntegerTypeSignedGet(ctx.get(), as.size() * 8);
else
result = mlirIntegerTypeGet(ctx.get(), as.size() * 8);
// Note: rocMLIR use signless integer type for tensors types. This
// will translate to signed implementation for current supported
// operations.
if(as.is_unsigned())
{
MIGRAPHX_THROW("Unsupported type: " + std::to_string(as.type_enum()));
}
result = mlirIntegerTypeGet(ctx.get(), as.size() * 8);
}
else
MIGRAPHX_THROW("Unsupported type: " + std::to_string(as.type_enum()));
......@@ -313,7 +324,8 @@ struct mlir_program
std::string,
value,
std::vector<value>,
MlirType>;
MlirType,
MlirAttribute>;
using named_attribute_t = std::pair<std::string_view, attribute_t>;
MlirNamedAttribute name_attribute(const named_attribute_t& na) const
......@@ -455,7 +467,7 @@ struct mlir_program
auto ops = create_operation_state("func.func");
ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
{"sym_name", std::string("main")},
{"sym_name", sym_name},
{"kernel", std::string("mixr")},
{"arch", target_arch}});
ops.add_region(std::move(region));
......@@ -470,13 +482,17 @@ struct mlir_program
{
if(ins->name() == "@return")
return "func.return";
if(ins->name() == "@literal")
{
return "tosa.const";
}
return "migraphx." + ins->name();
}
static value get_operator_value(const operation& op)
{
auto v = op.to_value();
if(op.name() == "convolution")
if(op.name() == "convolution" or op.name() == "quant_convolution")
{
// Adjust symmetrical padding
if(v.at("padding").size() == v.at("stride").size())
......@@ -498,31 +514,53 @@ struct mlir_program
return ins->get_shape();
}
// Derives the MLIR function symbol name from the module contents: the first
// convolution or dot instruction yields "mlir_<op>"; otherwise "main".
static std::string get_symbol_name(const module& m)
{
    for(auto ins : iterator_for(m))
    {
        const auto& op_name = ins->name();
        if(op_name == "convolution" or op_name == "dot")
            return "mlir_" + op_name;
    }
    return "main";
}
void parse(const module& m)
{
sym_name = get_symbol_name(m);
auto mbody = mlirModuleGetBody(mmodule.get());
std::unordered_map<instruction_ref, MlirValue> ins_map;
auto fbody = insert(mbody, m, ins_map);
for(auto ins : iterator_for(m))
{
if(ins->name() == "@param")
continue;
if(ins->name() == "contiguous")
{
ins_map[ins] = ins_map[ins->inputs().at(0)];
continue;
}
auto name = get_name(ins);
auto ops = create_operation_state(name);
ops.add_attribute_value(get_operator_value(ins->get_operator()));
if(ins->name() != "@return")
ops.add_results({get_shape(ins)});
if(ins->name() == "convolution")
if(ins->name() == "@literal")
{
literal r = ins->get_literal();
MlirType tensor_type = make_tensor(ins->get_shape());
MlirAttribute mlir_value_attr =
mlirDenseElementsAttrRawBufferGet(tensor_type, r.get_shape().bytes(), r.data());
ops.add_attributes({{"value", mlir_value_attr}});
}
if(ins->name() == "convolution" or ins->name() == "dot")
{
pp =
problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
// check if HW supports xdlops
auto target_chip = trim(split_string(target_arch, ':').front());
bool xdlops = contains(get_xdlops_archs(), target_chip);
std::string tuned = get_tune_params(xdlops);
if(not tuned.empty())
ops.add_attributes({{"perf_config", tuned}});
if(xdlops)
if(has_xdlops(target_arch))
ops.add_attributes({{"xdlopsV2", true}});
}
......@@ -542,15 +580,19 @@ struct mlir_program
code_object_op compile() MIGRAPHX_TIDY_CONST
{
mlir_pass_manager pm{mlirPassManagerCreate(ctx.get())};
mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())};
mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
// 1st pipeline to call
mlirMIGraphXAddHighLevelPipeline(pm.get());
mlirMIGraphXAddHighLevelPipeline(pm_front.get());
mlirPassManagerRun(pm_front.get(), mmodule.get());
// 2nd pipeline to call
mlirMIGraphXAddBackendPipeline(pm.get(), target_arch.c_str());
mlirPassManagerRun(pm.get(), mmodule.get());
get_module_tuned();
mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str());
mlirPassManagerRun(pm_back.get(), mmodule.get());
code_object_op op{};
op.symbol_name = "main";
op.symbol_name = sym_name;
op.code_object = get_binary();
std::tie(op.global, op.local) = get_launch_params();
return op;
......@@ -578,7 +620,74 @@ struct mlir_program
MIGRAPHX_THROW("Failed to compile mlir program");
}
std::string get_tune_params(bool xdlops) { return get_mlir_perf_for_conv(pp, xdlops); }
std::string get_tune_params(bool xdlops) const { return get_mlir_perf_for_conv(pp, xdlops); }
// This function appends to tuning cfg file that could be
// used with rocMLIR tuning scripts.
// Appends the problem key to a tuning cfg file usable with rocMLIR tuning
// scripts. The base path is read from MIGRAPHX_MLIR_TUNING_CFG and suffixed
// with ".conv" or ".gemm" based on the problem kind. No-op when the env var
// is unset or the key is missing/malformed.
void dump_tuning_cfg(const char* prob_config) const
{
    // mlirRockTuningGetKey may hand us nothing useful — bail out early
    if(prob_config == nullptr)
        return;
    std::string tuning_cfg_path = string_value_of(MIGRAPHX_MLIR_TUNING_CFG{});
    if(tuning_cfg_path.empty())
        return;
    std::vector<std::string> tokens = split_string(prob_config, '\t');
    // Guard against a malformed key that lacks the expected tab-separated fields
    if(tokens.size() < 2)
        return;
    const std::string& prob = tokens[1];
    if(starts_with(prob, "conv"))
    {
        tuning_cfg_path += ".conv";
    }
    else
    {
        tuning_cfg_path += ".gemm";
    }
    std::ofstream tuning_cfg(tuning_cfg_path, std::ios::app);
    tuning_cfg << prob << std::endl;
}
// Creates the rocMLIR tuning table and populates it from the TSV database
// pointed to by MIGRAPHX_MLIR_TUNING_DB (one entry per line: arch\tprob\tperf).
// Malformed lines are skipped. Emits a warning when no database is configured.
static mlir_tuning_table create_tuning_table()
{
    mlir_tuning_table tuning_table{mlirRockTuningTableCreate()};
    std::string tuning_db_path = string_value_of(MIGRAPHX_MLIR_TUNING_DB{});
    if(!tuning_db_path.empty())
    {
        std::ifstream tuning_db_tsv(tuning_db_path);
        if(tuning_db_tsv)
        {
            std::string line;
            while(std::getline(tuning_db_tsv, line))
            {
                std::vector<std::string> tokens = split_string(line, '\t');
                // Skip malformed rows instead of indexing out of range
                if(tokens.size() < 3)
                    continue;
                std::string arch        = tokens[0];
                const std::string& prob = tokens[1];
                const std::string& perf = tokens[2];
                // Table key is "<arch>\t<prob>"
                std::string key = arch.append("\t").append(prob);
                mlirRockTuningUpdateTable(tuning_table.get(), key.c_str(), perf.c_str(), 1.0);
            }
        }
    }
    else
    {
        std::cerr
            << "WARNING: MLIR tuning db not found. Please set MIGRAPHX_MLIR_TUNING_DB for "
               "optimal performance."
            << std::endl;
    }
    return tuning_table;
}
// Applies tuned parameters from the shared tuning table to the current MLIR
// module. On a miss, logs the problem key, records it to the tuning cfg file
// (if enabled) and returns false so default parameters are used.
bool get_module_tuned() const
{
    // The table is loaded once and shared across all compilations
    static mlir_tuning_table tuning_table = create_tuning_table();
    if(!mlirRockTuningSetFromTable(tuning_table.get(), mmodule.get()))
    {
        const char* prob_config = mlirRockTuningGetKey(tuning_table.get(), mmodule.get());
        // Note: was missing a separating space before the key; the unused
        // std::stringstream local has been removed
        std::cerr << "fails to set param on " << prob_config << std::endl;
        dump_tuning_cfg(prob_config);
        return false;
    }
    return true;
}
mlir_context ctx;
MlirLocation location;
......@@ -586,6 +695,7 @@ struct mlir_program
problem_params pp;
std::deque<std::string> strings{};
std::string target_arch;
std::string sym_name;
};
std::string dump_mlir(const module& m)
......@@ -645,12 +755,13 @@ code_object_op compile_mlir(const context&, module m, const std::vector<instruct
{
adjust_param_shapes(m, inputs);
const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
if(trace)
std::cout << m << std::endl;
// Serialize compilation with a mutex while LLVM thread support is disabled.
static std::mutex g_mlirc_mutex; // NOLINT
const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
if(trace)
std::cout << m << std::endl;
mlir_program mp;
mp.find_target();
mp.parse(m);
......
......@@ -47,32 +47,17 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s)
return rb;
}
const std::unordered_set<std::string>& get_rocblas_fp32_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
return supported_archs;
}
bool get_compute_fp32_flag()
{
bool compute_fp32 = false;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
const auto device_name = trim(split_string(get_device_name(), ':').front());
if(contains(get_rocblas_fp32_archs(), device_name))
compute_fp32 = true;
#endif
return compute_fp32;
return (starts_with(device_name, "gfx9") and device_name >= "gfx908");
}
bool get_int8_x4_format(context& ctx)
{
bool int8_x4_format = true;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags flag;
rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
#endif
return int8_x4_format;
return flag == rocblas_gemm_flags_pack_int8x4;
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -26,7 +26,6 @@
#include <migraphx/check_context.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
#include <migraphx/eliminate_contiguous.hpp>
#include <migraphx/eliminate_data_type.hpp>
......@@ -34,6 +33,7 @@
#include <migraphx/eliminate_layout.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/fuse_pointwise.hpp>
#include <migraphx/fuse_reduce.hpp>
#include <migraphx/inline_module.hpp>
#include <migraphx/insert_pad.hpp>
#include <migraphx/layout_nhwc.hpp>
......@@ -41,7 +41,7 @@
#include <migraphx/normalize_ops.hpp>
#include <migraphx/optimize_module.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/promote_literals.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/rewrite_gelu.hpp>
......@@ -49,15 +49,16 @@
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/split_single_dyn_dim.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/gpu/compile_ops.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/fuse_ck.hpp>
#include <migraphx/gpu/fuse_mlir.hpp>
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/gpu/prefuse_ops.hpp>
......@@ -73,8 +74,11 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_FAST_GELU)
struct id_pass
{
std::string name() const { return "id"; }
......@@ -98,14 +102,17 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
unsupported_types.erase(shape::type_t::bool_type);
unsupported_types.erase(shape::type_t::int8_type);
unsupported_types.erase(shape::type_t::uint8_type);
unsupported_types.erase(shape::type_t::int32_type);
unsupported_types.erase(shape::type_t::tuple_type);
// clang-format off
return
{
split_single_dyn_dim{},
dead_code_elimination{},
normalize_ops{},
dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{},
enable_pass(not mlir_enabled(), rewrite_quantization{}),
dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type},
simplify_reshapes{},
......@@ -119,20 +126,21 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
inline_module{},
rewrite_pooling{},
dead_code_elimination{},
rewrite_gelu{},
enable_pass(not enabled(MIGRAPHX_DISABLE_FAST_GELU{}), rewrite_gelu{}),
optimize_module{},
enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
dead_code_elimination{},
prefuse_ops{},
dead_code_elimination{},
auto_contiguous{},
// enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{true}),
simplify_reshapes{},
propagate_constant{},
optimize_module{},
fuse_pointwise{},
dead_code_elimination{},
enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
dead_code_elimination{},
enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
enable_pass(enabled(MIGRAPHX_ENABLE_CK{}), fuse_ck{}),
dead_code_elimination{},
fuse_mlir{&ctx},
enable_pass(mlir_enabled(), fuse_mlir{&ctx}),
dead_code_elimination{},
lowering{&ctx, options.offload_copy},
eliminate_contiguous{"gpu::contiguous"},
......@@ -151,7 +159,9 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
compile_ops{&ctx},
compile_ops{&ctx, options.exhaustive_tune},
dead_code_elimination{},
promote_literals{},
dead_code_elimination{},
write_literals{&ctx},
schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})},
......
......@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/time_op.hpp>
#include <migraphx/context.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/time.hpp>
......@@ -30,7 +30,6 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsigned long seed = 0)
{
......@@ -69,7 +68,6 @@ time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
return std::make_pair(host_time / n, device_time / n);
}
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -31,10 +31,9 @@ set_target_properties(migraphx_ref PROPERTIES EXPORT_NAME ref)
rocm_set_soversion(migraphx_ref ${MIGRAPHX_SO_VERSION})
find_path(BLAZE_INCLUDE blaze/Blaze.h)
find_package(Threads)
rocm_clang_tidy_check(migraphx_ref)
target_link_libraries(migraphx_ref migraphx Threads::Threads)
target_link_libraries(migraphx_ref PUBLIC migraphx)
target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)
......
......@@ -132,109 +132,6 @@ auto visit_quantize(T&& x, Ts&&... xs)
};
}
// Reference (CPU) implementation of convolution. Op is the migraphx
// convolution operator (op::convolution or op::quant_convolution) whose
// attributes (padding, padding_mode, stride, dilation, group) drive the
// computation.
template <class Op>
struct ref_convolution : auto_register_op<ref_convolution<Op>>
{
    ref_convolution() = default;

    ref_convolution(Op pop) : op(std::move(pop)) {}

    Op op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "ref::" + op.name(); }

    shape compute_shape(const std::vector<shape>& inputs) const
    {
        return op.normalize_compute_shape(inputs);
    }

    // Computes the convolution of args[0] (input) with args[1] (weights).
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        std::vector<std::size_t> padding;
        if(op.padding_mode != op::padding_mode_t::default_)
        {
            // Auto-padding (same_upper/same_lower): derive padding from the
            // actual input/weight sizes, then recompute the padded output shape
            auto input_lens   = args[0].get_shape().lens();
            auto weights_lens = args[1].get_shape().lens();
            padding =
                op.padding_mode == op::same_upper
                    ? calc_dyn_auto_pad(input_lens, weights_lens, op.stride, op.dilation, true)
                    : calc_dyn_auto_pad(input_lens, weights_lens, op.stride, op.dilation, false);
            output_shape = compute_padded_shape(
                args[0].get_shape(), args[1].get_shape(), padding, op.stride, op.dilation);
        }
        else
        {
            padding = op.padding;
            // Dynamic output shape: compute the concrete shape from the
            // actual argument shapes
            if(output_shape.dynamic())
            {
                output_shape =
                    op.normalize_compute_shape({args.at(0).get_shape(), args.at(1).get_shape()});
            }
        }

        argument result{output_shape};
        visit_quantize(result, args[0], args[1])([&](auto output, auto input, auto weights) {
            auto in_lens  = input.get_shape().lens();
            auto wei_lens = weights.get_shape().lens();
            auto wei_n    = wei_lens[0]; // number of output channels
            auto wei_c    = wei_lens[1]; // input channels per group
            // Convolution window size: weight dims without the output-channel dim
            std::vector<std::size_t> win_size(wei_lens.begin() + 1, wei_lens.end());
            // One task per output element
            par_for(output_shape.elements(), [&](auto i) {
                auto idx_o = output_shape.multi(i);
                auto w     = idx_o[1]; // output channel for this element
                auto n_dim = idx_o.size();
                // Top-left corner of the window in input coordinates
                // (may be negative inside the padding region)
                std::vector<std::ptrdiff_t> win_start;
                for(std::size_t dim = 2; dim < n_dim; ++dim)
                {
                    auto d_2 = dim - 2;
                    win_start.push_back(std::ptrdiff_t(idx_o[dim] * op.stride[d_2]) -
                                        std::ptrdiff_t(padding[d_2]));
                }
                const auto group_id = w / (wei_n / op.group);
                shape win_shape{output_shape.type(), win_size};

                // Accumulate in double regardless of the element type
                double acc = 0.0;
                shape_for_each(win_shape, [&](auto idx_win) {
                    auto k           = idx_win[0];
                    const auto in_ch = group_id * wei_c + k;
                    std::vector<std::ptrdiff_t> idx(idx_o.begin(), idx_o.end());
                    idx[1] = in_ch;
                    // Translate window coordinates into input coordinates
                    std::transform(idx_win.begin() + 1,
                                   idx_win.end(),
                                   win_start.begin(),
                                   idx.begin() + 2,
                                   [](std::ptrdiff_t ii, std::ptrdiff_t jj) { return ii + jj; });
                    std::vector<std::ptrdiff_t> idx_wei(idx_o.size());
                    idx_wei[0] = w;
                    std::copy(idx_win.begin(), idx_win.end(), idx_wei.begin() + 1);
                    // Only accumulate positions inside the input bounds; positions
                    // falling into the padding region contribute zero
                    if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
                       std::equal(idx.begin(),
                                  idx.end(),
                                  in_lens.begin(),
                                  in_lens.end(),
                                  std::less<std::ptrdiff_t>{}))
                    {
                        acc +=
                            input(idx.begin(), idx.end()) * weights(idx_wei.begin(), idx_wei.end());
                    }
                });
                output[i] = acc;
            });
        });
        return result;
    }
};
struct ref_im2col
{
op::im2col op;
......@@ -564,11 +461,8 @@ struct ref_apply
void init()
{
apply_map["convolution"] = extend_op<ref_convolution<op::convolution>, op::convolution>();
apply_map["dot"] = extend_op<ref_gemm, op::dot>();
apply_map["quant_dot"] = extend_op<ref_quant_gemm, op::quant_dot>();
apply_map["quant_convolution"] =
extend_op<ref_convolution<op::quant_convolution>, op::quant_convolution>();
apply_map["dot"] = extend_op<ref_gemm, op::dot>();
apply_map["quant_dot"] = extend_op<ref_quant_gemm, op::quant_dot>();
apply_map["im2col"] = extend_op<ref_im2col, op::im2col>();
apply_map["logsoftmax"] = extend_op<ref_softmax<op::logsoftmax>, op::logsoftmax>();
apply_map["lrn"] = extend_op<ref_lrn, op::lrn>();
......
......@@ -46,6 +46,7 @@ std::vector<std::string> get_op_parsers()
op_parser_map().end(),
std::back_inserter(result),
[&](auto&& p) { return p.first; });
std::sort(result.begin(), result.end());
return result;
}
......
......@@ -22,6 +22,7 @@
* THE SOFTWARE.
*/
#include <migraphx/tf/tf_parser.hpp>
#include <migraphx/tf/op_parser.hpp>
#include <iostream>
#include <fstream>
#include <unordered_map>
......@@ -62,5 +63,7 @@ program parse_tf(const std::string& name, const tf_options& options)
return std::move(parser.prog);
}
// Lists the names of all TensorFlow operators the parser supports.
std::vector<std::string> get_tf_operators()
{
    auto names = tf::get_op_parsers();
    return names;
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -28,6 +28,7 @@
#include <migraphx/stringutils.hpp>
#include <migraphx/value.hpp>
#include <migraphx/optional.hpp>
#include <migraphx/hash.hpp>
#include <unordered_map>
#include <utility>
......@@ -519,6 +520,38 @@ std::ostream& operator<<(std::ostream& os, const value& d)
return os;
}
// Generic overload: seed the hash with the key, then fold in the payload.
template <class T>
std::size_t value_hash(const std::string& key, const T& x)
{
    auto seed = hash_value(key);
    hash_combine(seed, x);
    return seed;
}
// Null payloads carry no data, so only the key participates in the hash.
std::size_t value_hash(const std::string& key, std::nullptr_t) { return hash_value(key); }
// Array payload: seed with the key, then fold in each element in order so
// that element order affects the result.
std::size_t value_hash(const std::string& key, const std::vector<value>& x)
{
    auto seed = hash_value(key);
    for(const auto& element : x)
    {
        hash_combine(seed, element);
    }
    return seed;
}
// Binary payload: seed with the key, then fold in each byte in order.
std::size_t value_hash(const std::string& key, const value::binary& x)
{
    auto seed = hash_value(key);
    for(const auto& byte : x)
    {
        hash_combine(seed, byte);
    }
    return seed;
}
// Hashes this value by dispatching on the held alternative; the key is always
// mixed in by the value_hash overloads. Returns 0 if no alternative is
// visited.
std::size_t value::hash() const
{
    std::size_t result = 0;
    this->visit_value(
        [&](const auto& contents) { result = value_hash(this->get_key(), contents); });
    return result;
}
void value::debug_print(bool show_type) const
{
if(show_type)
......
......@@ -110,7 +110,7 @@ function(add_test_executable TEST_NAME)
add_test_command(${TEST_NAME} ${TEST_COMMAND})
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx migraphx_onnx)
target_link_libraries(${TEST_NAME} migraphx migraphx_onnx migraphx_ref)
target_include_directories(${TEST_NAME} PUBLIC include)
endfunction(add_test_executable)
......@@ -134,6 +134,9 @@ if(MIGRAPHX_ENABLE_GPU)
COST 10
RESOURCE_LOCK gpu
)
if(MIGRAPHX_USE_HIPRTC)
target_compile_definitions(test_gpu_${BASE_NAME} PUBLIC -DMIGRAPHX_USE_HIPRTC)
endif()
target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu migraphx_kernels)
endforeach()
endif()
......@@ -163,7 +166,7 @@ foreach(ONNX_TEST ${ONNX_TESTS})
set(TEST_NAME test_${BASE_NAME})
add_executable(${TEST_NAME} ${ONNX_TEST})
rocm_clang_tidy_check(${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx_onnx)
target_link_libraries(${TEST_NAME} migraphx_onnx migraphx_ref)
target_include_directories(${TEST_NAME} PUBLIC include)
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> WORKING_DIRECTORY ${TEST_ONNX_DIR})
add_dependencies(tests ${TEST_NAME})
......@@ -187,6 +190,25 @@ if(MIGRAPHX_ENABLE_PYTHON)
add_subdirectory(py)
endif()
# multitarget test
# Builds one test executable per source file under multi_target/, linked
# against every backend (migraphx_all_targets). Only enabled when the GPU,
# CPU, and FPGA targets are all built, since these tests exercise execution
# across multiple targets.
# NOTE(review): ${CONFIGURE_DEPENDS} is expanded as a variable here rather
# than passed as the CONFIGURE_DEPENDS keyword; confirm it is set elsewhere,
# otherwise it expands to nothing and the glob is not refreshed on
# reconfigure.
if(MIGRAPHX_ENABLE_GPU AND MIGRAPHX_ENABLE_CPU AND MIGRAPHX_ENABLE_FPGA)
set(TEST_MULTI_TARGET_DIR ${CMAKE_CURRENT_SOURCE_DIR}/multi_target)
file(GLOB MULTI_TARGET_TESTS ${CONFIGURE_DEPENDS} ${TEST_MULTI_TARGET_DIR}/*.cpp)
foreach(MULTI_TARGET_TEST ${MULTI_TARGET_TESTS})
get_filename_component(BASE_NAME ${MULTI_TARGET_TEST} NAME_WE)
set(TEST_NAME test_${BASE_NAME})
add_executable(${TEST_NAME} ${MULTI_TARGET_TEST})
rocm_clang_tidy_check(${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx migraphx_onnx migraphx_tf migraphx_all_targets)
target_include_directories(${TEST_NAME} PUBLIC include)
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> WORKING_DIRECTORY ${TEST_MULTI_TARGET_DIR})
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
endforeach()
endif()
function(test_header NAME HEADER)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
"#include <${HEADER}>\nint main() {}\n"
......@@ -218,3 +240,11 @@ test_headers(migraphx/ref ${CMAKE_SOURCE_DIR}/src/targets/ref/include/migraphx/r
# Check that each enabled target's public headers are self-contained
# (compile on their own) via the test_headers helper defined above.
if(MIGRAPHX_ENABLE_GPU)
test_headers(migraphx/gpu ${CMAKE_SOURCE_DIR}/src/targets/gpu/include/migraphx/gpu/*.hpp)
endif()
if(MIGRAPHX_ENABLE_CPU)
test_headers(migraphx/cpu ${CMAKE_SOURCE_DIR}/src/targets/cpu/include/migraphx/cpu/*.hpp)
endif()
if(MIGRAPHX_ENABLE_FPGA)
test_headers(migraphx/fpga ${CMAKE_SOURCE_DIR}/src/targets/fpga/include/migraphx/fpga/*.hpp)
endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment