Unverified Commit 23cb7917 authored by Brian Pickrell's avatar Brian Pickrell Committed by GitHub
Browse files

Merge branch 'develop' into blas_tuning

parents b5fcc0bc ea32ca70
......@@ -218,7 +218,15 @@ using common_type_t = typename common_type<Ts...>::type;
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
constexpr unsigned long int_max(unsigned long n) { return (1u << (n * 8)) - 1; }
constexpr unsigned long int_max(unsigned long n)
{
// Note, left shift cannot be used to get the maximum value of int64_type or
// uint64_type because it is undefined behavior to left shift 64 bits for
// these types
if(n == sizeof(int64_t))
return -1;
return (1ul << (n * 8)) - 1;
}
template <class T,
MIGRAPHX_REQUIRES(is_integral<T>{} or is_floating_point<T>{} or
......@@ -228,9 +236,9 @@ constexpr T numeric_max()
if constexpr(is_integral<T>{})
{
if constexpr(is_unsigned<T>{})
return int_max(sizeof(T)) * 2;
else
return int_max(sizeof(T));
else
return int_max(sizeof(T)) / 2;
}
else if constexpr(is_same<T, double>{})
return __DBL_MAX__;
......
......@@ -135,7 +135,7 @@ constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
return vec<T, N>{x};
else
{
MIGRAPHX_ASSERT((i + N) < vec_size<T>());
MIGRAPHX_ASSERT((i + N) <= vec_size<T>());
vec<vec_type<T>, N> result = {0};
for(int j = 0; j < N; j++)
{
......
......@@ -22,12 +22,19 @@
* THE SOFTWARE.
*/
#include <iterator>
#include <migraphx/gpu/lowering.hpp>
#include <utility>
#include <functional>
#include <algorithm>
#include <map>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/if_op.hpp>
......@@ -35,17 +42,12 @@
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/gemm.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <utility>
#include <functional>
#include <algorithm>
#include <map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -54,6 +56,7 @@ namespace gpu {
struct miopen_apply
{
module* mod = nullptr;
module_pass_manager* mpm = nullptr;
const lowering* pass = nullptr;
std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
instruction_ref last{};
......@@ -83,7 +86,7 @@ struct miopen_apply
auto& ctx = get_context();
int8_x4_format = get_int8_x4_format(ctx);
compute_fp32 = get_compute_fp32_flag();
offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
offload_copy = (mod == mpm->get_root_module()) ? pass->offload_copy : false;
add_generic_op("contiguous");
......@@ -103,7 +106,7 @@ struct miopen_apply
add_extend_op("topk");
add_convolution_op("convolution");
add_convolution_op("deconvolution");
add_convolution_op("convolution_backwards");
add_convolution_op("quant_convolution");
add_gemm_op<op::dot>("dot");
add_gemm_op<op::quant_dot>("quant_dot");
......@@ -375,7 +378,10 @@ struct miopen_apply
}
};
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
void lowering::apply(module_pass_manager& mpm) const
{
miopen_apply{&mpm.get_module(), &mpm, this}.apply();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -36,7 +36,10 @@
#include <mutex>
#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3
#warning "Incompatible version of rocMLIR library used, disabling"
// Only undefine when not using cppcheck
#ifndef CPPCHECK
#undef MIGRAPHX_MLIR
#endif
#else
#include <mlir-c/RegisterRocMLIR.h>
#endif
......@@ -50,8 +53,10 @@
#include <migraphx/ranges.hpp>
#include <migraphx/gpu/code_object_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/perfdb.hpp>
#include <migraphx/gpu/tuning_config.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/permutation.hpp>
#include <deque>
......@@ -122,6 +127,9 @@ struct mlir_handle
#define MIGRAPHX_MANAGE_MLIR_HANDLE(T, F) migraphx::gpu::mlir_handle<T, decltype(&F), &F> // NOLINT
using mlir_context = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirContext, mlirContextDestroy);
using mlir_thread_pool = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirLlvmThreadPool, mlirLlvmThreadPoolDestroy);
using mlir_dialect_registry = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirDialectRegistry,
mlirDialectRegistryDestroy);
using mlir_module = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirModule, mlirModuleDestroy);
using mlir_operation = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOperation, mlirOperationDestroy);
using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags,
......@@ -131,6 +139,10 @@ using mlir_block = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockD
using mlir_pass_manager = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy);
using mlir_tuning_table = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningTable,
mlirRockTuningTableDestroy);
using mlir_tuning_space = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningSpace,
mlirRockTuningSpaceDestroy);
using mlir_tuning_param = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningParam,
mlirRockTuningParamDestroy);
std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; }
......@@ -164,25 +176,41 @@ std::string mlir_print(F f, T x)
return ss.str();
}
const std::unordered_set<std::string>& get_xdlops_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
return supported_archs;
}
struct mlir_program
{
mlir_program()
: ctx(mlirContextCreate()),
: ctx(mlirContextCreateWithRegistry(get_dialect_registry().get(),
/*threadingEnable=*/false)),
location(mlirLocationUnknownGet(ctx.get())),
mmodule(mlirModuleCreateEmpty(location))
{
MlirDialectRegistry registry = mlirDialectRegistryCreate();
mlirRegisterRocMLIRDialects(registry);
mlirContextAppendDialectRegistry(ctx.get(), registry);
mlirContextSetThreadPool(ctx.get(), get_thread_pool().get());
mlirContextLoadAllAvailableDialects(ctx.get());
mlirDialectRegistryDestroy(registry);
mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
}
static mlir_dialect_registry& get_dialect_registry()
{
static std::once_flag init_guard;
static mlir_dialect_registry the_registry;
// The MLIR registration functions (for dialects and passes) are not
// necessarily thread-safe and need to be executed exactly once
// (especially since they eventually call non-thread-safe LLVM
// initilizations).
std::call_once(init_guard, [&]() {
the_registry = mlirDialectRegistryCreate();
mlirRegisterRocMLIRDialects(the_registry.get());
mlirRegisterRocMLIRPasses();
});
return the_registry;
}
static mlir_thread_pool& get_thread_pool()
{
// To save on overhead, we create one LLVM thread pool and reuse it
// across all MLIR contexts as recommended by MLIR upstream.
// Note that this is thread-safe as of C++11.
static mlir_thread_pool the_pool = mlirLlvmThreadPoolCreate();
return the_pool;
}
MlirType make_type(shape::type_t t) const
......@@ -244,8 +272,6 @@ struct mlir_program
MlirAttribute attribute(std::int64_t i) const
{
if(i < 0)
MIGRAPHX_THROW("MLIR cant handle negative values since they are ambiguous");
return mlirIntegerAttrGet(mlirIntegerTypeGet(ctx.get(), 64), i);
}
MlirAttribute attribute(std::uint64_t i) const
......@@ -324,7 +350,8 @@ struct mlir_program
std::string,
value,
std::vector<value>,
MlirType>;
MlirType,
MlirAttribute>;
using named_attribute_t = std::pair<std::string_view, attribute_t>;
MlirNamedAttribute name_attribute(const named_attribute_t& na) const
......@@ -365,14 +392,20 @@ struct mlir_program
mlir_operation_state& add_attributes(const std::vector<named_attribute_t>& named_attrs)
{
auto attributes = prog->name_attributes(named_attrs);
if(not attributes.empty())
{
mlirOperationStateAddAttributes(&op_state, attributes.size(), attributes.data());
}
return *this;
}
mlir_operation_state& add_attribute_value(const value& v)
{
auto attributes = prog->name_attributes(v);
if(not attributes.empty())
{
mlirOperationStateAddAttributes(&op_state, attributes.size(), attributes.data());
}
return *this;
}
......@@ -395,13 +428,19 @@ struct mlir_program
return shape{r.type(), r.lens()};
});
auto x = prog->make_tensors(reshaped);
if(not x.empty())
{
mlirOperationStateAddResults(&op_state, x.size(), x.data());
}
return *this;
}
mlir_operation_state& add_operands(const std::vector<MlirValue>& inputs)
{
if(not inputs.empty())
{
mlirOperationStateAddOperands(&op_state, inputs.size(), inputs.data());
}
return *this;
}
......@@ -411,7 +450,10 @@ struct mlir_program
std::transform(regions.begin(), regions.end(), mregions.begin(), [](const auto& r) {
return r.get();
});
if(not mregions.empty())
{
mlirOperationStateAddOwnedRegions(&op_state, mregions.size(), mregions.data());
}
mlir_operation op(mlirOperationCreate(&op_state));
// Release memory since mlir_operation owns it
for(auto& r : regions)
......@@ -468,7 +510,8 @@ struct mlir_program
ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
{"sym_name", sym_name},
{"kernel", std::string("mixr")},
{"arch", target_arch}});
{"arch", target_arch},
{"num_cu", num_cu}});
ops.add_region(std::move(region));
insert(body, std::move(ops));
......@@ -481,6 +524,10 @@ struct mlir_program
{
if(ins->name() == "@return")
return "func.return";
if(ins->name() == "@literal")
{
return "tosa.const";
}
return "migraphx." + ins->name();
}
......@@ -511,14 +558,7 @@ struct mlir_program
static std::string get_symbol_name(const module& m)
{
for(auto ins : iterator_for(m))
{
if(ins->name() == "convolution" or ins->name() == "dot")
{
return "mlir_" + ins->name();
}
}
return "main";
return "mlir_" + gen::generate_name_from_ops(m);
}
void parse(const module& m)
......@@ -532,20 +572,28 @@ struct mlir_program
{
if(ins->name() == "@param")
continue;
if(ins->name() == "contiguous")
{
ins_map[ins] = ins_map[ins->inputs().at(0)];
continue;
}
auto name = get_name(ins);
auto ops = create_operation_state(name);
ops.add_attribute_value(get_operator_value(ins->get_operator()));
if(ins->name() != "@return")
ops.add_results({get_shape(ins)});
if(ins->name() == "@literal")
{
literal r = ins->get_literal();
MlirType tensor_type = make_tensor(ins->get_shape());
MlirAttribute mlir_value_attr =
mlirDenseElementsAttrRawBufferGet(tensor_type, r.get_shape().bytes(), r.data());
ops.add_attributes({{"value", mlir_value_attr}});
}
if(ins->name() == "convolution" or ins->name() == "dot")
{
pp =
problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
// check if HW supports xdlops
auto target_chip = trim(split_string(target_arch, ':').front());
bool xdlops = contains(get_xdlops_archs(), target_chip);
if(xdlops)
ops.add_attributes({{"xdlopsV2", true}});
}
std::vector<MlirValue> inputs;
......@@ -562,18 +610,30 @@ struct mlir_program
}
}
code_object_op compile() MIGRAPHX_TIDY_CONST
void run_high_level_pipeline() MIGRAPHX_TIDY_CONST
{
mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())};
mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
// 1st pipeline to call
mlirMIGraphXAddHighLevelPipeline(pm_front.get());
mlirPassManagerRun(pm_front.get(), mmodule.get());
mlirPassManagerRunOnOp(pm_front.get(), mlirModuleGetOperation(mmodule.get()));
}
// 2nd pipeline to call
get_module_tuned();
void run_backend_pipeline() MIGRAPHX_TIDY_CONST
{
mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str());
mlirPassManagerRun(pm_back.get(), mmodule.get());
mlirPassManagerRunOnOp(pm_back.get(), mlirModuleGetOperation(mmodule.get()));
}
code_object_op compile(const value& solution) MIGRAPHX_TIDY_CONST
{
// 1st pipeline to call
run_high_level_pipeline();
if(solution.is_null())
get_module_tuned();
else
set_tuning(solution);
// 2nd pipeline to call
run_backend_pipeline();
code_object_op op{};
op.symbol_name = sym_name;
......@@ -582,7 +642,12 @@ struct mlir_program
return op;
}
void find_target() { target_arch = get_device_name(); }
void set_gpu_properties(const context& migraphx_ctx)
{
const auto& device = migraphx_ctx.get_current_device();
target_arch = device.get_device_name();
num_cu = device.get_cu_count();
}
std::pair<std::size_t, std::size_t> get_launch_params() const
{
......@@ -596,7 +661,7 @@ struct mlir_program
value::binary get_binary() const
{
int size = 0;
size_t size = 0;
mlirGetBinary(mmodule.get(), &size, nullptr);
value::binary result(size);
if(mlirGetBinary(mmodule.get(), &size, reinterpret_cast<char*>(result.data())))
......@@ -604,14 +669,52 @@ struct mlir_program
MIGRAPHX_THROW("Failed to compile mlir program");
}
void set_tuning(const value& v) MIGRAPHX_TIDY_CONST
{
const auto* str = v.if_string();
if(str == nullptr)
MIGRAPHX_THROW("mlir tuning solutions must be strings");
if(not mlirRockTuningSetFromStr(mmodule.get(), make_mlir_string_ref(*str)))
MIGRAPHX_THROW("Failed setting tuning key: " + *str);
}
tuning_config get_tuning_config() MIGRAPHX_TIDY_CONST
{
tuning_config tc;
run_high_level_pipeline();
mlir_tuning_space params{
mlirRockTuningSpaceCreate(mmodule.get(), RocmlirTuningParamSetKindFull)};
for(auto i : range(mlirRockTuningGetNumParams(params.get())))
{
mlir_tuning_param param{mlirRockTuningParamCreate()};
if(not mlirRockTuningParamGet(params.get(), i, param.get()))
MIGRAPHX_THROW("Incorrect mlir tuning parameter: " + std::to_string(i));
std::array<char, ROCMLIR_TUNING_KEY_BUFSZ> perf_key;
size_t perf_key_bytes =
mlirRockTuningParamToString(param.get(), perf_key.data(), perf_key.size());
if(perf_key_bytes > perf_key.size())
MIGRAPHX_THROW("Tuning perf key was " + std::to_string(perf_key_bytes) +
" bytes and thus too long");
tc.solutions.emplace_back(perf_key.begin(), perf_key.begin() + perf_key_bytes);
}
std::array<char, ROCMLIR_TUNING_KEY_BUFSZ> tuning_key;
size_t tuning_key_bytes =
mlirRockTuningGetKey(mmodule.get(), tuning_key.data(), tuning_key.size());
if(tuning_key_bytes > tuning_key.size())
MIGRAPHX_THROW("Tuning table key was " + std::to_string(tuning_key_bytes) +
" bytes and thus too long");
tc.problem = std::string(tuning_key.begin(), tuning_key.begin() + tuning_key_bytes);
return tc;
}
std::string get_tune_params(bool xdlops) const { return get_mlir_perf_for_conv(pp, xdlops); }
// This function appends to tuning cfg file that could be
// used with rocMLIR tuning scripts.
void dump_tuning_cfg(const char* prob_config) const
void dump_tuning_cfg(const std::string& prob_config) const
{
std::string tuning_cfg_path = string_value_of(MIGRAPHX_MLIR_TUNING_CFG{});
if(!tuning_cfg_path.empty())
if(not tuning_cfg_path.empty())
{
std::vector<std::string> tokens = split_string(prob_config, '\t');
std::string prob = tokens[1];
......@@ -628,46 +731,66 @@ struct mlir_program
}
}
static mlir_tuning_table create_tuning_table()
static std::pair<mlir_tuning_table, bool> load_tuning_table()
{
mlir_tuning_table tuning_table{mlirRockTuningTableCreate()};
bool found_table = false;
std::string tuning_db_path = string_value_of(MIGRAPHX_MLIR_TUNING_DB{});
if(!tuning_db_path.empty())
if(not tuning_db_path.empty())
{
std::ifstream tuning_db_tsv(tuning_db_path);
if(tuning_db_tsv)
{
found_table = true;
std::string line;
while(std::getline(tuning_db_tsv, line))
{
std::vector<std::string> tokens = split_string(line, '\t');
std::string arch = tokens[0];
std::string prob = tokens[1];
std::string perf = tokens[2];
std::string key = arch.append("\t").append(prob);
mlirRockTuningUpdateTable(tuning_table.get(), key.c_str(), perf.c_str(), 1.0);
std::string num_cu = tokens[1];
std::string prob = tokens[2];
std::string perf = tokens[3];
std::string key = arch.append("\t").append(num_cu).append("\t").append(prob);
mlirRockTuningUpdateTable(tuning_table.get(),
make_mlir_string_ref(key),
make_mlir_string_ref(perf),
1.0);
}
}
}
else
{
found_table = false;
std::cerr
<< "WARNING: MLIR tuning db not found. Please set MIGRAPHX_MLIR_TUNING_DB for "
"optimal performance."
<< std::endl;
}
return tuning_table;
return std::make_pair(std::move(tuning_table), found_table);
}
bool get_module_tuned() const
{
static mlir_tuning_table tuning_table = create_tuning_table();
if(!mlirRockTuningSetFromTable(tuning_table.get(), mmodule.get()))
static std::pair<mlir_tuning_table, bool> tuning_table = load_tuning_table();
if(not mlirRockTuningSetFromTable(tuning_table.first.get(), mmodule.get()))
{
std::array<char, ROCMLIR_TUNING_KEY_BUFSZ> prob_config;
size_t prob_config_bytes =
mlirRockTuningGetKey(mmodule.get(), prob_config.data(), prob_config.size());
if(prob_config_bytes >= prob_config.size())
{
const char* prob_config = mlirRockTuningGetKey(tuning_table.get(), mmodule.get());
std::stringstream key(prob_config);
std::cerr << "fails to set param on" << prob_config << std::endl;
dump_tuning_cfg(prob_config);
std::cerr << "MLIR tuning key overflowed buffer, needed " << prob_config_bytes
<< " bytes" << std::endl;
return false;
}
std::string prob_config_str(prob_config.begin(),
prob_config.begin() + prob_config_bytes);
if(tuning_table.second)
{
std::cerr << "NOTE: MLIR tuning table did not include a key for " << prob_config_str
<< std::endl;
}
dump_tuning_cfg(prob_config_str);
return false;
}
return true;
......@@ -678,7 +801,8 @@ struct mlir_program
mlir_module mmodule;
problem_params pp;
std::deque<std::string> strings{};
std::string target_arch;
std::string target_arch = "";
std::size_t num_cu = 0;
std::string sym_name;
};
......@@ -690,14 +814,14 @@ std::string dump_mlir(const module& m)
return mlir_print(&mlirOperationPrint, mod_op);
}
void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
void adjust_param_shapes(module& m, const std::vector<shape>& inputs)
{
auto names = m.get_parameter_names();
std::sort(names.begin(), names.end());
for(auto i : range(names.size()))
{
const auto& name = names[i];
const auto& input = inputs[i]->get_shape();
const auto& input = inputs[i];
auto param = m.get_parameter(name);
if(input.standard())
continue;
......@@ -735,23 +859,25 @@ void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
}
}
code_object_op compile_mlir(const context&, module m, const std::vector<instruction_ref>& inputs)
code_object_op compile_mlir(const context& migraphx_ctx,
module m,
const std::vector<instruction_ref>& inputs,
const value& solution)
{
adjust_param_shapes(m, inputs);
adjust_param_shapes(m, to_shapes(inputs));
const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
if(trace)
std::cout << m << std::endl;
// set mutex while llvm thread support is disabled.
static std::mutex g_mlirc_mutex; // NOLINT
const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
mlir_program mp;
mp.find_target();
mp.set_gpu_properties(migraphx_ctx);
mp.parse(m);
auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
if(trace)
std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl;
auto co = mp.compile();
auto co = mp.compile(solution);
co.expected_inputs = to_shapes(inputs);
co.output = m.get_output_shapes().front();
return co;
}
......@@ -772,6 +898,17 @@ instruction_ref insert_mlir(module& m,
return m.insert_instruction(ins, co, refs);
}
tuning_config
get_tuning_config_mlir(const context& migraphx_ctx, module m, const std::vector<shape>& inputs)
{
adjust_param_shapes(m, inputs);
mlir_program mp;
mp.set_gpu_properties(migraphx_ctx);
mp.parse(m);
return mp.get_tuning_config();
}
#else
std::string dump_mlir(const module&) { return {}; }
......@@ -783,20 +920,27 @@ void use(T&)
// Disabling clang-tidy warning on non-real useage.
// NOLINTBEGIN(performance-unnecessary-value-param)
code_object_op compile_mlir(const context&, module, const std::vector<instruction_ref>&)
code_object_op
compile_mlir(const context&, module, const std::vector<instruction_ref>&, const value&)
{
return {};
}
// NOLINTEND(performance-unnecessary-value-param)
instruction_ref
// cppcheck-suppress funcArgNamesDifferent
insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector<instruction_ref>&)
{
use(co);
use(m);
return m.end();
}
tuning_config get_tuning_config_mlir(const context&, module, const std::vector<shape>&)
{
return {};
}
// NOLINTEND(performance-unnecessary-value-param)
#endif
} // namespace gpu
......
......@@ -47,32 +47,24 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s)
return rb;
}
const std::unordered_set<std::string>& get_rocblas_fp32_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
return supported_archs;
}
bool get_compute_fp32_flag()
{
bool compute_fp32 = false;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
const auto device_name = trim(split_string(get_device_name(), ':').front());
if(contains(get_rocblas_fp32_archs(), device_name))
compute_fp32 = true;
#endif
return compute_fp32;
return (starts_with(device_name, "gfx9") and device_name >= "gfx908");
}
bool get_int8_x4_format(context& ctx)
{
bool int8_x4_format = true;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
#if ROCBLAS_VERSION_MAJOR >= 3
(void)(ctx);
return false;
#else
// int8x4 packed format is only available starting from rocblas-v2.38 and it is deprecated in
// v3.0 and will be removed in v4.0
rocblas_gemm_flags flag;
rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
return flag == rocblas_gemm_flags_pack_int8x4;
#endif
return int8_x4_format;
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -57,6 +57,7 @@
#include <migraphx/gpu/concat_gpu_opt.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/fuse_ck.hpp>
#include <migraphx/gpu/fuse_mlir.hpp>
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/gpu/prefuse_ops.hpp>
......@@ -72,9 +73,12 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
#ifndef _WIN32
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
#endif
struct id_pass
{
std::string name() const { return "id"; }
......@@ -98,16 +102,17 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
unsupported_types.erase(shape::type_t::bool_type);
unsupported_types.erase(shape::type_t::int8_type);
unsupported_types.erase(shape::type_t::uint8_type);
unsupported_types.erase(shape::type_t::int32_type);
unsupported_types.erase(shape::type_t::tuple_type);
// clang-format off
return
{
enable_pass(options.split_single_dyn_dim, split_single_dyn_dim{}),
enable_pass(options.split_single_dyn_dim, dead_code_elimination{}),
split_single_dyn_dim{},
dead_code_elimination{},
normalize_ops{},
dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{},
enable_pass(not mlir_enabled(), rewrite_quantization{}),
dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type},
simplify_reshapes{},
......@@ -121,7 +126,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
inline_module{},
rewrite_pooling{},
dead_code_elimination{},
rewrite_gelu{},
enable_pass(options.fast_math, rewrite_gelu{}),
optimize_module{},
enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
dead_code_elimination{},
......@@ -129,11 +134,15 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
auto_contiguous{},
optimize_module{},
enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
fuse_pointwise{},
dead_code_elimination{},
enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
dead_code_elimination{},
fuse_mlir{&ctx},
#ifndef _WIN32
enable_pass(enabled(MIGRAPHX_ENABLE_CK{}), fuse_ck{}),
#endif
dead_code_elimination{},
enable_pass(mlir_enabled(), fuse_mlir{&ctx}),
dead_code_elimination{},
lowering{&ctx, options.offload_copy},
eliminate_contiguous{"gpu::contiguous"},
......@@ -150,7 +159,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
compile_ops{&ctx},
compile_ops{&ctx, options.exhaustive_tune},
dead_code_elimination{},
promote_literals{},
dead_code_elimination{},
......
......@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/time_op.hpp>
#include <migraphx/context.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/time.hpp>
......@@ -30,12 +30,11 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsigned long seed = 0)
{
std::vector<argument> args;
std::transform(shapes.begin(), shapes.end(), std::back_inserter(args), [&](auto& s) {
std::transform(shapes.begin(), shapes.end(), std::back_inserter(args), [&](const auto& s) {
return to_gpu(generate_argument(s, seed++));
});
return args;
......@@ -69,7 +68,6 @@ time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
return std::make_pair(host_time / n, device_time / n);
}
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -37,6 +37,8 @@ target_link_libraries(migraphx_ref PUBLIC migraphx)
target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)
migraphx_generate_export_header(migraphx_ref)
rocm_install_targets(
TARGETS migraphx_ref
INCLUDE
......
......@@ -25,6 +25,7 @@
#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
#include <migraphx/config.hpp>
#include <migraphx/ref/export.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -24,14 +24,14 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
#define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
#include <migraphx/ref/context.hpp>
#include <migraphx/program.hpp>
#include <migraphx/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace ref {
struct lowering
struct MIGRAPHX_REF_EXPORT lowering
{
std::string name() const { return "ref::lowering"; }
void apply(module& m) const;
......
......@@ -35,7 +35,7 @@ inline namespace MIGRAPHX_INLINE_NS {
struct pass;
namespace ref {
struct target
struct MIGRAPHX_REF_EXPORT target
{
std::string name() const;
std::vector<pass> get_passes(migraphx::context& ctx, const compile_options&) const;
......
......@@ -27,7 +27,7 @@
#include <migraphx/dfor.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/deconvolution.hpp>
#include <migraphx/op/convolution_backwards.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
......
......@@ -42,8 +42,9 @@ target_compile_options(tf-proto PRIVATE -w)
target_link_libraries(tf-proto PRIVATE ${PROTOBUF_LIBRARY})
set_target_properties(tf-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
file(GLOB TF_SRCS ${CONFIGURE_DEPENDS} *.cpp)
file(GLOB TF_SRCS CONFIGURE_DEPENDS *.cpp)
add_library(migraphx_tf ${TF_SRCS})
migraphx_generate_export_header(migraphx_tf)
target_include_directories(migraphx_tf PRIVATE include)
set_target_properties(migraphx_tf PROPERTIES EXPORT_NAME tf)
rocm_set_soversion(migraphx_tf ${MIGRAPHX_SO_VERSION})
......
......@@ -46,6 +46,7 @@ std::vector<std::string> get_op_parsers()
op_parser_map().end(),
std::back_inserter(result),
[&](auto&& p) { return p.first; });
std::sort(result.begin(), result.end());
return result;
}
......
......@@ -52,7 +52,6 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
auto x_type = args[0]->get_shape().type();
// unsqueeze tensors of shape (C) to broadcast correctly
auto rt = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
auto scale_unsqueeze =
......@@ -64,11 +63,11 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
auto var_unsqueeze =
info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[4]);
auto numer = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
auto x_sub_mean = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
auto denom = info.add_broadcastable_binary_op("pow", var_eps, rt);
auto div0 = info.add_broadcastable_binary_op("div", numer, denom);
auto r0 = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
auto rsqrt = info.add_instruction(make_op("rsqrt"), var_eps);
auto mul0 = info.add_broadcastable_binary_op("mul", scale_unsqueeze, rsqrt);
auto r0 = info.add_broadcastable_binary_op("mul", x_sub_mean, mul0);
return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
}
};
......
......@@ -22,6 +22,7 @@
* THE SOFTWARE.
*/
#include <migraphx/tf/tf_parser.hpp>
#include <migraphx/tf/op_parser.hpp>
#include <iostream>
#include <fstream>
#include <unordered_map>
......@@ -62,5 +63,7 @@ program parse_tf(const std::string& name, const tf_options& options)
return std::move(parser.prog);
}
std::vector<std::string> get_tf_operators() { return tf::get_op_parsers(); }
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -338,7 +338,7 @@ void tf_parser::parse_node(const std::string& name)
std::string input_name = input;
// if input has trailing `:0` index then remove it
auto multi_out_idx = input.find(':');
if(multi_out_idx != std::string::npos && input.substr(multi_out_idx + 1) == "0")
if(multi_out_idx != std::string::npos and input.substr(multi_out_idx + 1) == "0")
{
input_name = input.substr(0, multi_out_idx);
}
......
......@@ -28,6 +28,7 @@
#include <migraphx/stringutils.hpp>
#include <migraphx/value.hpp>
#include <migraphx/optional.hpp>
#include <migraphx/hash.hpp>
#include <unordered_map>
#include <utility>
......@@ -284,7 +285,7 @@ bool value::contains(const std::string& pkey) const
}
std::size_t value::size() const
{
auto* a = if_array_impl(x);
const auto* a = if_array_impl(x);
if(a == nullptr)
return 0;
return a->size();
......@@ -519,6 +520,38 @@ std::ostream& operator<<(std::ostream& os, const value& d)
return os;
}
template <class T>
std::size_t value_hash(const std::string& key, const T& x)
{
std::size_t h = hash_value(key);
hash_combine(h, x);
return h;
}
std::size_t value_hash(const std::string& key, std::nullptr_t) { return hash_value(key); }
std::size_t value_hash(const std::string& key, const std::vector<value>& x)
{
std::size_t h = hash_value(key);
for(const auto& v : x)
hash_combine(h, v);
return h;
}
std::size_t value_hash(const std::string& key, const value::binary& x)
{
std::size_t h = hash_value(key);
for(const auto& v : x)
hash_combine(h, v);
return h;
}
std::size_t value::hash() const
{
std::size_t h = 0;
this->visit_value([&](const auto& a) { h = value_hash(this->get_key(), a); });
return h;
}
void value::debug_print(bool show_type) const
{
if(show_type)
......
......@@ -35,7 +35,7 @@ bool verify_args(const std::string& name,
bool passed = true;
visit_all(ref_arg, target_arg)([&](auto ref, auto target) {
double error;
passed = verify_range(ref, target, tolerance, &error);
passed = verify::verify_range(ref, target, tolerance, &error);
if(not passed)
{
// TODO: Check for nans
......@@ -45,27 +45,27 @@ bool verify_args(const std::string& name,
std::cout << "ref:" << ref << std::endl;
if(target.size() < 32)
std::cout << "target:" << target << std::endl;
if(range_zero(ref))
if(verify::range_zero(ref))
std::cout << "Ref data is all zeros" << std::endl;
if(range_zero(target))
if(verify::range_zero(target))
std::cout << "Target data is all zeros" << std::endl;
auto mxdiff = max_diff(ref, target);
auto mxdiff = verify::max_diff(ref, target);
std::cout << "Max diff: " << mxdiff << std::endl;
auto idx = mismatch_idx(ref, target, float_equal);
if(idx < range_distance(ref))
auto idx = verify::mismatch_idx(ref, target, float_equal);
if(idx < verify::range_distance(ref))
{
std::cout << "Mismatch at " << idx << ": " << ref[idx] << " != " << target[idx]
<< std::endl;
}
auto ref_nan_idx = find_idx(ref, not_finite);
auto ref_nan_idx = find_idx(ref, verify::not_finite);
if(ref_nan_idx >= 0)
std::cout << "Non finite number found in ref at " << ref_nan_idx << ": "
<< ref[ref_nan_idx] << std::endl;
auto target_nan_idx = find_idx(target, not_finite);
auto target_nan_idx = find_idx(target, verify::not_finite);
if(target_nan_idx >= 0)
std::cout << "Non finite number found in target at " << target_nan_idx << ": "
<< target[target_nan_idx] << std::endl;
......@@ -73,27 +73,27 @@ bool verify_args(const std::string& name,
}
else
{
if(range_zero(ref))
if(verify::range_zero(ref))
std::cout << "Ref data is all zeros" << std::endl;
if(range_zero(target))
if(verify::range_zero(target))
std::cout << "Target data is all zeros" << std::endl;
// auto mxdiff = max_diff(ref, target);
// std::cout << "Max diff: " << mxdiff << std::endl;
// auto idx = mismatch_idx(ref, target, float_equal);
// if(idx < range_distance(ref))
// if(idx < verify::range_distance(ref))
// {
// std::cout << "Mismatch at " << idx << ": " << ref[idx] << " != " << target[idx]
// << std::endl;
// }
auto ref_nan_idx = find_idx(ref, not_finite);
auto ref_nan_idx = find_idx(ref, verify::not_finite);
if(ref_nan_idx >= 0)
std::cout << "Non finite number found in ref at " << ref_nan_idx << ": "
<< ref[ref_nan_idx] << std::endl;
auto target_nan_idx = find_idx(target, not_finite);
auto target_nan_idx = find_idx(target, verify::not_finite);
if(target_nan_idx >= 0)
std::cout << "Non finite number found in target at " << target_nan_idx << ": "
<< target[target_nan_idx] << std::endl;
......
......@@ -24,8 +24,6 @@
cmake_policy(SET CMP0057 NEW)
include(CTest)
find_package(Threads REQUIRED)
include(ProcessorCount)
ProcessorCount(N)
......@@ -100,21 +98,15 @@ endfunction()
function(add_test_executable TEST_NAME)
add_executable(${TEST_NAME} EXCLUDE_FROM_ALL ${ARGN})
target_link_libraries(${TEST_NAME} ${CMAKE_THREAD_LIBS_INIT})
# Cmake does not add flags correctly for gcc
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS -pthread LINK_FLAGS -pthread)
endif()
set(TEST_COMMAND ${TEST_NAME})
add_test_command(${TEST_NAME} ${TEST_COMMAND})
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx migraphx_onnx migraphx_ref)
target_link_libraries(${TEST_NAME} Threads::Threads migraphx migraphx_onnx migraphx_ref)
target_include_directories(${TEST_NAME} PUBLIC include)
endfunction(add_test_executable)
file(GLOB TESTS ${CONFIGURE_DEPENDS} *.cpp)
file(GLOB TESTS CONFIGURE_DEPENDS *.cpp)
foreach(TEST ${TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
......@@ -124,7 +116,7 @@ endforeach()
if(MIGRAPHX_ENABLE_GPU)
# gpu tests
file(GLOB GPU_TESTS ${CONFIGURE_DEPENDS} gpu/*.cpp)
file(GLOB GPU_TESTS CONFIGURE_DEPENDS gpu/*.cpp)
foreach(TEST ${GPU_TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
......@@ -134,13 +126,16 @@ if(MIGRAPHX_ENABLE_GPU)
COST 10
RESOURCE_LOCK gpu
)
if(MIGRAPHX_USE_HIPRTC)
target_compile_definitions(test_gpu_${BASE_NAME} PUBLIC -DMIGRAPHX_USE_HIPRTC)
endif()
target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu migraphx_kernels)
endforeach()
endif()
if(MIGRAPHX_ENABLE_FPGA)
# fpga tests
file(GLOB FPGA_TESTS ${CONFIGURE_DEPENDS} fpga/*.cpp)
file(GLOB FPGA_TESTS CONFIGURE_DEPENDS fpga/*.cpp)
foreach(TEST ${FPGA_TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
......@@ -187,12 +182,36 @@ if(MIGRAPHX_ENABLE_PYTHON)
add_subdirectory(py)
endif()
# multitarget test
if(MIGRAPHX_ENABLE_GPU AND MIGRAPHX_ENABLE_CPU AND MIGRAPHX_ENABLE_FPGA)
set(TEST_MULTI_TARGET_DIR ${CMAKE_CURRENT_SOURCE_DIR}/multi_target)
file(GLOB MULTI_TARGET_TESTS CONFIGURE_DEPENDS ${TEST_MULTI_TARGET_DIR}/*.cpp)
foreach(MULTI_TARGET_TEST ${MULTI_TARGET_TESTS})
get_filename_component(BASE_NAME ${MULTI_TARGET_TEST} NAME_WE)
set(TEST_NAME test_${BASE_NAME})
add_executable(${TEST_NAME} ${MULTI_TARGET_TEST})
rocm_clang_tidy_check(${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx migraphx_onnx migraphx_tf migraphx_all_targets)
target_include_directories(${TEST_NAME} PUBLIC include)
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> WORKING_DIRECTORY ${TEST_MULTI_TARGET_DIR})
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
endforeach()
endif()
function(test_header NAME HEADER)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
"#include <${HEADER}>\nint main() {}\n"
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp "
#include <${HEADER}>
int main() {}\n"
)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-static-include-${NAME}.cpp
"#include <${HEADER}>\n"
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-static-include-${NAME}.cpp "
#include <${HEADER}>
#if defined(min) || defined(max) || defined(near) || defined(far)
#error \"Do not include windows.h in header files\"
#endif
\n"
)
add_test_executable(${NAME}
${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
......@@ -201,14 +220,14 @@ function(test_header NAME HEADER)
endfunction()
function(test_headers PREFIX)
file(GLOB HEADERS ${CONFIGURE_DEPENDS} ${ARGN})
file(GLOB HEADERS CONFIGURE_DEPENDS ${ARGN})
foreach(HEADER ${HEADERS})
file(RELATIVE_PATH HEADER_REL ${CMAKE_SOURCE_DIR} ${HEADER})
string(MAKE_C_IDENTIFIER ${HEADER_REL} TEST_NAME)
get_filename_component(BASE_NAME ${HEADER} NAME_WE)
test_header(header_${TEST_NAME} ${PREFIX}/${BASE_NAME}.hpp)
target_link_libraries(header_${TEST_NAME} migraphx_all_targets)
target_link_libraries(header_${TEST_NAME} migraphx migraphx_onnx migraphx_tf migraphx_all_targets)
endforeach()
endfunction()
......@@ -225,3 +244,4 @@ if(MIGRAPHX_ENABLE_FPGA)
test_headers(migraphx/fpga ${CMAKE_SOURCE_DIR}/src/targets/fpga/include/migraphx/fpga/*.hpp)
endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment