Unverified commit 0662a9a3, authored by Brian Pickrell, committed by GitHub

Merge branch 'develop' into dyn_resize_gather

parents b74d3a8f 35e5298e
/*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -41,6 +41,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
+       if(args.empty())
+           MIGRAPHX_THROW("PARSE_MULTINOMIAL: no arguments given");
        int dtype = 6;
        if(contains(info.attributes, "dtype"))
            dtype = info.attributes.at("dtype").i();
@@ -49,35 +52,90 @@ struct parse_multinomial : op_parser<parse_multinomial>
        size_t sample_size = 1;
        if(contains(info.attributes, "sample_size"))
            sample_size = info.attributes.at("sample_size").i();
+       else
+           MIGRAPHX_THROW("PARSE_MULTINOMIAL: sample_size not given");

+       // Use logarithmic math to scale probabilities while avoiding division by very
+       // small numbers. Scaling by the maximum makes very tiny ranges more tractable;
+       // any constant factor gives an equivalent distribution since the Multinomial op
+       // normalizes at runtime.

        // Subtract the per-batch maximum log-probability, making the per-batch max 0
        auto maxes =
            info.add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), args[0]);
-       auto mb_maxes = info.add_instruction(
-           migraphx::make_op("multibroadcast", {{"out_lens", args[0]->get_shape().lens()}}),
-           maxes);
-       auto cdf = info.add_instruction(migraphx::make_op("sub"), args[0], mb_maxes);
+       auto cdf = info.add_common_op("sub", args[0], maxes);

        // Take the element-wise exponent to get probabilities in the range (0, 1]
        cdf = info.add_instruction(migraphx::make_op("exp"), cdf);
-       // Compute the cumulative density function
+       // Compute the cumulative distribution function
        cdf = info.add_instruction(
            migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);

-       // Pre-compute random distribution
-       std::mt19937 gen(std::chrono::high_resolution_clock::now().time_since_epoch().count());
+       instruction_ref seed_input;
        if(contains(info.attributes, "seed"))
-           gen.seed(info.attributes.at("seed").f());
+       {
+           float seed = info.attributes.at("seed").f();
+           migraphx::shape s{migraphx::shape::float_type, {1}};
+           std::vector<float> data = {seed};
+           seed_input = info.add_literal(migraphx::literal(s, data));
+       }
+       else
+       {
+           seed_input = info.add_instruction(migraphx::make_op("random_seed"));
+       }
+       instruction_ref randoms;

+       shape s0 = args[0]->get_shape();
+       if(s0.dynamic())
+       {
+           // Dynamic batch_size will be taken from args[0]. The input argument to this should
+           // have a second dimension of sample_size.
+           std::vector<shape::dynamic_dimension> dyn_dim_set;
+           dyn_dim_set.emplace_back(s0.dyn_dims().front());
+           dyn_dim_set.emplace_back(shape::dynamic_dimension{sample_size, sample_size});
+           // read the input dimensions
+           auto dim_of =
+               info.add_instruction(migraphx::make_op("dimensions_of", {{"end", 2}}), args[0]);
+           // The next two operations insert the value sample_size into the second array position
+           // make an argument of (1, 0)
+           shape s(shape::int64_type, {2});
+           std::vector<int64_t> data1{1, 0};
+           auto l1        = info.add_literal(s, data1);
+           auto batch_arg = info.add_instruction(migraphx::make_op("mul"), dim_of, l1);
+           std::vector<int64_t> data2(2, 0);
+           // make an argument of (0, sample_size)
+           data2[1]         = sample_size;
+           auto l2          = info.add_literal(s, data2);
+           auto alloc_shape = info.add_instruction(migraphx::make_op("add"), batch_arg, l2);
+           // alloc_shape should contain the input-based shape dimensions as its values at runtime,
+           // and its own shape is {2}
+           // compile_shape is the shape used when compiling the Allocate op, and may be dynamic
+           migraphx::shape compile_shape =
+               migraphx::shape(s0.type(), {s0.dyn_dims().front(), {sample_size, sample_size}});

-           std::uniform_real_distribution<> dis(0.0, 1.0);
-           size_t batch_size = args[0]->get_shape().lens().front();
-           migraphx::shape dist_shape{migraphx::shape::float_type, {batch_size, sample_size}};
+           // Allocate on-device storage for the random values
+           auto alloc = info.add_instruction(
+               migraphx::make_op("allocate", {{"shape", to_value(compile_shape)}}), alloc_shape);
+           randoms = info.add_instruction(migraphx::make_op("random_uniform"), seed_input, alloc);
+       }
+       else
+       {
+           // Use a literal. The array populated by random_uniform may have any shape, as long as
+           // its number of elements is batch_size * sample_size.
+           size_t batch_size = s0.lens().front();
+           auto rand_dummy   = info.add_literal(
+               migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});

-           std::vector<float> random_dist(batch_size * sample_size);
-           std::generate(random_dist.begin(), random_dist.end(), [&]() { return dis(gen); });
-           auto dist_lit = info.add_literal(migraphx::literal{dist_shape, random_dist});
+           randoms =
+               info.add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
+       }

        return info.add_instruction(
-           migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, dist_lit);
+           migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, randoms);
    }
};
...
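For orientation, here is a minimal host-side sketch of the sampling scheme the parser assembles above: shift each row by its maximum, exponentiate, take a running sum, then binary-search uniform draws against the unnormalized CDF. This is an illustration under those assumptions only; sample_multinomial is a hypothetical helper, not part of MIGraphX.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>

// Sample `sample_size` class indices for one row of unnormalized log-probabilities.
std::vector<int> sample_multinomial(const std::vector<float>& logits,
                                    std::size_t sample_size,
                                    std::mt19937& gen)
{
    // Subtract the row maximum so exp() stays in (0, 1]
    float m = *std::max_element(logits.begin(), logits.end());
    std::vector<float> cdf(logits.size());
    std::transform(logits.begin(), logits.end(), cdf.begin(),
                   [&](float x) { return std::exp(x - m); });
    // Running (unnormalized) cumulative distribution
    std::partial_sum(cdf.begin(), cdf.end(), cdf.begin());

    std::uniform_real_distribution<float> dis(0.0f, 1.0f);
    std::vector<int> out(sample_size);
    for(auto& o : out)
    {
        // Scale the draw by the total mass instead of normalizing the CDF
        float r = dis(gen) * cdf.back();
        o = static_cast<int>(std::upper_bound(cdf.begin(), cdf.end(), r) - cdf.begin());
    }
    return out;
}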
@@ -181,6 +181,76 @@ static std::string get_nearest_mode(const onnx_parser::attribute_map& attr)
    return nearest_mode;
}

+static std::vector<double> get_scales(const onnx_parser::attribute_map& attr)
+{
+    std::vector<double> scales;
+    if(contains(attr, "scales"))
+    {
+        copy(attr.at("scales").floats(), std::back_inserter(scales));
+    }
+    return scales;
+}
+
+static void parse_args(const std::vector<instruction_ref>& args,
+                       const std::vector<size_t>& in_lens,
+                       const std::string& op_name,
+                       std::vector<double>& vec_scale,
+                       std::vector<std::size_t>& out_lens)
+{
+    for(const auto& arg : args)
+    {
+        if(arg->name() == "undefined" or arg == args.front())
+        {
+            continue;
+        }
+        // skipped empty input
+        auto lens = arg->get_shape().lens();
+        if(lens.empty())
+        {
+            continue;
+        }
+        auto type = arg->get_shape().type();
+        // output size
+        if(type == shape::int64_type)
+        {
+            auto arg_out_s = arg->eval();
+            check_arg_empty(arg_out_s,
+                            "PARSE_" + op_name + ": dynamic output size is not supported!");
+            arg_out_s.visit([&](const auto& ol) { out_lens.assign(ol.begin(), ol.end()); });
+            if(out_lens.size() != in_lens.size())
+            {
+                MIGRAPHX_THROW("PARSE_" + op_name +
+                               ": specified output size does not match input size");
+            }
+            // compute the scale
+            vec_scale.resize(in_lens.size());
+            std::transform(in_lens.begin(),
+                           in_lens.end(),
+                           out_lens.begin(),
+                           vec_scale.begin(),
+                           [](auto iss, auto oss) { return 1.0 * oss / iss; });
+        }
+        else
+        {
+            // scale input
+            if(lens[0] == in_lens.size())
+            {
+                auto arg_scale = arg->eval();
+                check_arg_empty(arg_scale,
+                                "PARSE_" + op_name + ": dynamic input scale is not supported!");
+                arg_scale.visit([&](const auto& v) { vec_scale.assign(v.begin(), v.end()); });
+            }
+        }
+    }
+}

struct parse_resize : op_parser<parse_resize>
{
    std::vector<op_desc> operators() const { return {{"Resize"}, {"Upsample"}}; }
@@ -309,7 +379,7 @@ struct parse_resize : op_parser<parse_resize>
        std::vector<size_t> out_lens(in_s.ndim());

        // scale
-       std::vector<double> vec_scale;
+       std::vector<double> vec_scale = get_scales(info.attributes);

        // Look at inputs and infer either output size or scale, depending on input type
        for(const auto& arg : args)
@@ -381,8 +451,6 @@ struct parse_resize : op_parser<parse_resize>
            }
        }

-       // Dynamic batch: Only args[0] can have a dynamic shape, only the 0'th
-       // dimension--batch size--can be non-fixed, and the only resize mode allowed is "nearest"
        if(args[0]->get_shape().dynamic())
        {
            return dynamic_nearest_parse(out_lens, vec_scale, opd, info, args);
...
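As a concrete illustration of the size/scale relationship parse_args relies on (a standalone sketch, not MIGraphX code): given the input lens and an explicit output size, the per-axis scale is 1.0 * out / in; going the other way, the output size is obtained by truncating scale * in.

#include <cstddef>
#include <vector>

int main()
{
    std::vector<std::size_t> in_lens  = {1, 3, 10, 10};
    std::vector<std::size_t> out_lens = {1, 3, 20, 15};

    // Scales derived from an explicit output size, as in parse_args above
    std::vector<double> vec_scale(in_lens.size());
    for(std::size_t i = 0; i < in_lens.size(); ++i)
        vec_scale[i] = 1.0 * out_lens[i] / in_lens[i]; // {1, 1, 2, 1.5}

    // With scales given instead, the output size would be the truncated product:
    // out = static_cast<std::size_t>(scale * in)
    return 0;
}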
@@ -46,6 +46,9 @@ struct parse_slice : op_parser<parse_slice>
    void always_insert(instruction_ref arg) { op_args.insert(op_args.begin(), arg); }

+   /**
+    * Either insert argument into `this->op_args` or return the constant value of the argument
+    */
    std::vector<int64_t> insert(instruction_ref arg)
    {
        std::vector<int64_t> result;
@@ -137,23 +140,22 @@ struct parse_slice : op_parser<parse_slice>
        sd.always_insert(args.at(0));

        // If axes arg is not given, the default is all of them.
-       if(sd.op.axes.empty() and sd.op_args.size() < 3)
+       if(sd.op.axes.empty() and sd.op_args.size() <= 3)
        {
            std::vector<int64_t> axes(args[0]->get_shape().ndim());
            std::iota(axes.begin(), axes.end(), int64_t{0});
            sd.op.axes = axes;
        }

-       if(not sd.steps.empty())
+       if(std::any_of(sd.steps.begin(), sd.steps.end(), [](auto s) { return s != 1; }))
        {
            if(sd.op.starts.empty() or sd.op.ends.empty())
-               MIGRAPHX_THROW("PARSE_SLICE: steps and variable starts and ends is not supported");
+               MIGRAPHX_THROW(
+                   "PARSE_SLICE: steps and variable starts and/or ends is not supported");
            if(sd.op.axes.empty())
                MIGRAPHX_THROW("PARSE_SLICE: steps and variable axes is not supported");
        }
-       assert(sd.steps.empty() or sd.steps.size() == sd.op.axes.size());

        // If any axes have negative step, prepare to add a "reverse" op
        for(auto i : range(sd.steps.size()))
        {
...
@@ -68,13 +68,34 @@ struct parse_split : op_parser<parse_split>
        // no split attribute, input is equally divided
        else
        {
-           if((lens[tuned_axis] % info.num_outputs) != 0)
-           {
-               MIGRAPHX_THROW("PARSE_SPLIT: input cannot be equally divided into " +
-                              std::to_string(info.num_outputs) + " splits!");
-           }
-           auto dl = lens[tuned_axis] / info.num_outputs;
-           vec_splits.resize(info.num_outputs, dl);
+           std::size_t num_outputs = info.num_outputs;
+           // the num_outputs attribute seems to be redundant since we already have
+           // node_info::num_outputs, but we can still perform an error check
+           if(contains(info.attributes, "num_outputs"))
+           {
+               num_outputs =
+                   parser.parse_value(info.attributes.at("num_outputs")).at<std::size_t>();
+               if(num_outputs != info.num_outputs)
+               {
+                   MIGRAPHX_THROW("PARSE_SPLIT: num_outputs attribute " +
+                                  std::to_string(num_outputs) +
+                                  " doesn't match actual number of outputs " +
+                                  std::to_string(info.num_outputs) + "!");
+               }
+           }
+           if(lens[tuned_axis] % num_outputs == 0)
+           {
+               std::size_t chunk_size = lens[tuned_axis] / num_outputs;
+               vec_splits.resize(num_outputs, chunk_size);
+           }
+           else
+           {
+               std::size_t chunk_size      = lens[tuned_axis] / num_outputs + 1;
+               std::size_t last_chunk_size = lens[tuned_axis] - chunk_size * (num_outputs - 1);
+               vec_splits.resize(num_outputs - 1, chunk_size);
+               vec_splits.push_back(last_chunk_size);
+           }
        }

        if(std::accumulate(vec_splits.begin(), vec_splits.end(), int64_t(0)) !=
...
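The unequal-split arithmetic above is easiest to check on a concrete case; this standalone sketch (not MIGraphX code) reproduces it for an axis of length 10 split into 3 outputs, yielding chunks {4, 4, 2}.

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::size_t axis_len    = 10;
    std::size_t num_outputs = 3;

    std::vector<std::size_t> vec_splits;
    if(axis_len % num_outputs == 0)
    {
        vec_splits.assign(num_outputs, axis_len / num_outputs);
    }
    else
    {
        // Ceiling-sized chunks, with the remainder going to the last output
        std::size_t chunk_size      = axis_len / num_outputs + 1;                    // 4
        std::size_t last_chunk_size = axis_len - chunk_size * (num_outputs - 1);     // 2
        vec_splits.assign(num_outputs - 1, chunk_size);
        vec_splits.push_back(last_chunk_size);
    }
    for(auto s : vec_splits)
        std::cout << s << " "; // prints: 4 4 2
}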
@@ -936,7 +936,7 @@ void program::perf_report(std::ostream& os,
    os << std::endl;

    os << "Batch size: " << batch << std::endl;
-   os << "Rate: " << rate * batch << "inferences/sec" << std::endl;
+   os << "Rate: " << rate * batch << " inferences/sec" << std::endl;
    os << "Total time: " << total_time << "ms" << std::endl;
    os << "Total instructions time: " << total_instruction_time << "ms" << std::endl;
    os << "Overhead time: " << overhead_time << "ms"
...
@@ -472,7 +472,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
            map_dyn_input_dims,
            bool skip_unknown_operators,
            bool print_program_on_error,
-           int64_t max_loop_iterations) {
+           int64_t max_loop_iterations,
+           int64_t limit_max_iterations) {
            migraphx::onnx_options options;
            options.default_dim_value     = default_dim_value;
            options.default_dyn_dim_value = default_dyn_dim_value;
@@ -481,6 +482,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
            options.skip_unknown_operators = skip_unknown_operators;
            options.print_program_on_error = print_program_on_error;
            options.max_loop_iterations    = max_loop_iterations;
+           options.limit_max_iterations   = limit_max_iterations;
            return migraphx::parse_onnx(filename, options);
        },
        "Parse onnx file",
@@ -492,7 +494,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
            std::unordered_map<std::string, std::vector<migraphx::shape::dynamic_dimension>>(),
        py::arg("skip_unknown_operators") = false,
        py::arg("print_program_on_error") = false,
-       py::arg("max_loop_iterations") = 10);
+       py::arg("max_loop_iterations") = 10,
+       py::arg("limit_max_iterations") = std::numeric_limits<uint16_t>::max());
    m.def(
        "parse_onnx_buffer",
...
@@ -147,8 +147,8 @@ void quantize_int8(program& prog,
    run_passes(prog,
               {quantize_int8_pass{ins_names, *int8_quant_params},
-               optimize_module{},
                simplify_qdq{},
+               optimize_module{},
                dead_code_elimination{}});
}
...
@@ -47,7 +47,7 @@ void apply_quantizelinear(module& m, instruction_ref ins)
            ins, make_op("convert", {{"target_type", y_scale->get_shape().type()}}), x);
    }

    auto div            = m.insert_instruction(ins, make_op("div"), x, y_scale);
-   auto add_zero_point = m.insert_instruction(ins, make_op("round"), div);
+   auto add_zero_point = m.insert_instruction(ins, make_op("nearbyint"), div);
    if(ins->inputs().size() == 3)
    {
...
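The round-to-nearbyint change only matters at halfway points: std::round rounds halves away from zero, while std::nearbyint honors the current floating-point rounding mode, which defaults to round-half-to-even and lines up with the rounding QuantizeLinear expects. A small standalone comparison:

#include <cfenv>
#include <cmath>
#include <iostream>

int main()
{
    // Default rounding mode is FE_TONEAREST (ties go to the even value)
    std::fesetround(FE_TONEAREST);
    for(double v : {0.5, 1.5, 2.5, -0.5, -1.5})
    {
        std::cout << v << ": round=" << std::round(v)
                  << " nearbyint=" << std::nearbyint(v) << '\n';
        // e.g. 2.5: round=3, nearbyint=2
    }
}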
@@ -24,6 +24,7 @@
#include <migraphx/simplify_dyn_ops.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/make_op.hpp>
+#include <migraphx/literal.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
@@ -131,10 +132,53 @@ struct find_const_4in_slice
    }
};

+/**
+ * Simplify dimensions_of to a literal when the input argument has a static shape
+ * or the dynamic dimensions from `start` to `end` are fixed.
+ */
+struct find_static_dimensions_of
+{
+    auto matcher() const { return match::name("dimensions_of")(); }
+
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto ins                 = mr.result;
+        auto input               = ins->inputs().at(0);
+        auto dimensions_of_value = ins->get_operator().to_value();
+        auto start               = dimensions_of_value.at("start").to<std::size_t>();
+        auto end                 = dimensions_of_value.at("end").to<std::size_t>();
+        if(input->get_shape().dynamic())
+        {
+            // check if dynamic dimensions from start to end are fixed
+            auto dds = input->get_shape().dyn_dims();
+            if(std::any_of(dds.begin() + start, dds.begin() + end, [](auto dd) {
+                   return not dd.is_fixed();
+               }))
+            {
+                return;
+            }
+        }
+        std::size_t output_ndim = end - start;
+        std::vector<int64_t> vec_shape(output_ndim);
+        migraphx::shape s(migraphx::shape::int64_type, {output_ndim});
+        std::vector<std::size_t> input_lens = input->get_shape().to_static(1).lens();
+        std::transform(input_lens.begin() + start,
+                       input_lens.begin() + end,
+                       vec_shape.begin(),
+                       [](auto i) { return int64_t(i); });
+        migraphx::shape output_shape{migraphx::shape::int64_type, {end - start}};
+        auto lit_ins = m.add_literal(migraphx::literal{output_shape, vec_shape});
+        m.replace_instruction(ins, lit_ins);
+    }
+};

void simplify_dyn_ops::apply(module& m) const
{
-   match::find_matches(
-       m, find_static_2in_broadcasts{}, find_const_3in_slice{}, find_const_4in_slice{});
+   match::find_matches(m,
+                       find_static_2in_broadcasts{},
+                       find_static_dimensions_of{},
+                       find_const_3in_slice{},
+                       find_const_4in_slice{});
}

} // namespace MIGRAPHX_INLINE_NS
...
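Concretely, the new matcher replaces a dimensions_of whose relevant dimensions are known at compile time with an int64 literal of those dimensions. The sketch below (a hypothetical helper, not the pass itself) shows the value it would produce; for lens {2, 3, 4} with start = 0 and end = 2 the literal holds {2, 3}.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Slice the static lens from [start, end) into the int64 values the literal would hold.
std::vector<int64_t> dims_literal(const std::vector<std::size_t>& lens,
                                  std::size_t start,
                                  std::size_t end)
{
    std::vector<int64_t> out(end - start);
    std::transform(lens.begin() + start, lens.begin() + end, out.begin(),
                   [](std::size_t d) { return static_cast<int64_t>(d); });
    return out;
}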
@@ -647,8 +647,8 @@ struct find_broadcast_transpose
    {
        auto transpose      = r.result;
        auto transpose_lens = transpose->get_shape().lens();
        auto bcast_ins      = r.instructions["bcast_ins"];
        auto input          = bcast_ins->inputs().front();
        // scalar transformation does not need extra transpose
        if(not input->get_shape().scalar())
        {
...
# ####################################################################################
# The MIT License (MIT)
#
-# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -37,8 +37,7 @@ if(NOT TARGET MIOpen)
    message(SEND_ERROR "Cant find miopen")
endif()

-if(NOT WIN32)
-    # TODO: re-enable when CK is ported to Windows
+if(MIGRAPHX_USE_COMPOSABLEKERNEL)
    find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library)
endif()
@@ -52,10 +51,10 @@ file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")

-if(WIN32)
-    # TODO: re-enable when CK is ported to Windows
+if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
    list(REMOVE_ITEM KERNEL_FILES
        ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm.hpp
+       ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
        ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck.hpp)
endif()
@@ -103,9 +102,10 @@ rocm_clang_tidy_check(kernel_file_check)
file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)

-if(WIN32)
-    # TODO: re-enable when CK is ported to Windows
-    list(REMOVE_ITEM JIT_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp)
+if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
+    list(REMOVE_ITEM JIT_GPU_SRCS
+        ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm_softmax_gemm.cpp)
endif()

add_library(migraphx_gpu
@@ -128,8 +128,6 @@ add_library(migraphx_gpu
    gather.cpp
    gemm_impl.cpp
    hip.cpp
-   int8_conv_pack.cpp
-   int8_gemm_pack.cpp
    kernel.cpp
    lowering.cpp
    logsoftmax.cpp
@@ -140,7 +138,6 @@ add_library(migraphx_gpu
    no_device.cpp
    nonzero.cpp
    pack_args.cpp
-   pack_int8_args.cpp
    prefuse_ops.cpp
    pad.cpp
    perfdb.cpp
@@ -184,7 +181,6 @@ register_migraphx_gpu_ops(hip_
register_migraphx_gpu_ops(miopen_
    abs
    contiguous
-   int8_conv_pack
    lrn
    pooling
)
@@ -192,10 +188,6 @@ register_op(migraphx_gpu
    HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
    OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
    INCLUDES migraphx/gpu/context.hpp)
-register_op(migraphx_gpu
-   HEADER migraphx/gpu/int8_gemm_pack.hpp
-   OPERATORS gpu::hip_int8_gemm_pack_a gpu::hip_int8_gemm_pack_b
-   INCLUDES migraphx/gpu/context.hpp)
register_op(migraphx_gpu
    HEADER migraphx/gpu/gemm.hpp
    OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
@@ -239,24 +231,28 @@ else()
        string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
    endforeach()

-   message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
+   message(STATUS "Hip compiler flags: \"${HIP_COMPILER_FLAGS}\"")
    target_compile_definitions(migraphx_gpu PRIVATE
-       "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
-       "-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}"
+       -DMIGRAPHX_HIP_COMPILER="${CMAKE_CXX_COMPILER}"
+       -DMIGRAPHX_HIP_COMPILER_FLAGS="${HIP_COMPILER_FLAGS}"
    )
    if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
        execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
        string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
-       target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
+       target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_HIP_COMPILER_LAUNCHER="${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
    endif()
endif()

# Check miopen find mode api
include(CheckLibraryExists)
get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
+get_target_property(ROCBLAS_LOCATION roc::rocblas LOCATION)
check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
+# Beta API for automated GEMM tuning
+check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_TUNING_BETA_FEATURE_API)

set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")
@@ -279,10 +275,16 @@ else()
    message(STATUS "MIOpen does not have find mode api")
endif()

+if(HAS_ROCBLAS_TUNING_BETA_FEATURE_API)
+    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_TUNING_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS)
+    message(STATUS "MIGraphx is using Beta API of rocBLAS")
+else()
+    message(STATUS "rocBLAS does not have User Tuning Beta API")
+endif()

target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
-if(NOT WIN32)
-    # TODO: re-enable when CK is ported to Windows
+if(MIGRAPHX_USE_COMPOSABLEKERNEL)
    target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library)
endif()
...
@@ -284,16 +284,20 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
bool is_hip_clang_compiler()
{
-   static const auto result = ends_with(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER), "clang++");
+   static const auto result = fs::path{MIGRAPHX_HIP_COMPILER}.stem() == "clang++";
    return result;
}

+#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
bool has_compiler_launcher()
{
-   static const auto result = fs::exists(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER));
+   static const auto result = fs::exists(MIGRAPHX_HIP_COMPILER_LAUNCHER);
    return result;
}
+#endif

src_compiler assemble(src_compiler compiler)
{
    compiler.out_ext = ".S";
@@ -306,8 +310,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
{
    assert(not srcs.empty());
    if(not is_hip_clang_compiler())
-       MIGRAPHX_THROW("Unknown hip compiler: " +
-                      std::string(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER)));
+       MIGRAPHX_THROW("Unknown hip compiler: " MIGRAPHX_HIP_COMPILER);

    if(params.find("-std=") == std::string::npos)
        params += " --std=c++17";
@@ -323,14 +326,14 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
        params += " -DMIGRAPHX_DEBUG";
    params += " -Wno-unused-command-line-argument -Wno-cuda-compat ";
-   params += MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_FLAGS);
+   params += MIGRAPHX_HIP_COMPILER_FLAGS;

    src_compiler compiler;
    compiler.flags    = params;
-   compiler.compiler = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER);
+   compiler.compiler = MIGRAPHX_HIP_COMPILER;
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
    if(has_compiler_launcher())
-       compiler.launcher = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER);
+       compiler.launcher = MIGRAPHX_HIP_COMPILER_LAUNCHER;
#endif

    if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
    {
@@ -354,7 +357,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
bool hip_has_flags(const std::vector<std::string>& flags)
{
    src_compiler compiler;
-   compiler.compiler = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER);
+   compiler.compiler = MIGRAPHX_HIP_COMPILER;
    compiler.flags =
        join_strings(flags, " ") + " -x hip -c --offload-arch=gfx900 --cuda-device-only";
...
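The MIGRAPHX_STRINGIZE removals pair with the CMake change above: quoting the value inside the -D definition makes the macro expand to a string literal, so the code can use it directly instead of stringizing a bare token sequence. A minimal sketch of the two styles, using made-up macro names and paths:

#include <iostream>

#define STRINGIZE_IMPL(x) #x
#define STRINGIZE(x) STRINGIZE_IMPL(x)

// Old style: -DHIP_COMPILER_TOKENS=/usr/bin/clang++ passes bare tokens,
// so the code must stringize before it can treat the value as a string.
#define HIP_COMPILER_TOKENS /usr/bin/clang++

// New style: -DHIP_COMPILER_STR="/usr/bin/clang++" (note the quotes kept by CMake)
// already expands to a string literal.
#define HIP_COMPILER_STR "/usr/bin/clang++"

int main()
{
    std::cout << STRINGIZE(HIP_COMPILER_TOKENS) << '\n'; // needs the extra macro hop
    std::cout << HIP_COMPILER_STR << '\n';               // usable directly
}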
@@ -60,9 +60,8 @@ struct miopen_op
};
MIGRAPHX_REGISTER_OP(miopen_op);

-std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool format) const
+std::size_t compile_miopen::compile(operation& op, instruction_ref ins) const
{
-   op.from_value({{"int8_x4_format", format}});
    auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs()));
    return v.get<std::size_t>("workspace", 0);
}
@@ -70,25 +69,15 @@ std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool for
void compile_miopen::apply(module& m) const
{
    assert(ctx);
-   const bool int8_x4_format = get_int8_x4_format(any_cast<migraphx::gpu::context>(*ctx));
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "gpu::miopen_op")
            continue;
        auto op        = any_cast<miopen_op>(ins->get_operator()).op;
        std::size_t ws = 0;
-       try
-       {
-           // for the regular convolution and convolution_backwards, this try would always succeed
-           ws = compile(op, ins, int8_x4_format);
-       }
-       catch(migraphx::exception&)
-       {
-           // In case no solver supports the default format, retry using the other format.
-           ws = compile(op, ins, not int8_x4_format);
-       }
-       auto inputs = ins->inputs();
-       auto alloc  = m.insert_instruction(
+       ws          = compile(op, ins);
+       auto inputs = ins->inputs();
+       auto alloc  = m.insert_instruction(
            ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}}));
        inputs.insert(std::prev(inputs.end()), alloc);
...
@@ -168,6 +168,7 @@ struct compile_plan
    }
    const compiled_result& benchmark(problem_cache& pc) const
    {
+       const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{});
        if(results.empty())
            MIGRAPHX_THROW("No configs to tune");
        if(results.size() == 1)
@@ -178,9 +179,10 @@ struct compile_plan
        }
        if(not config)
            MIGRAPHX_THROW("Multiple kernels without config");
-       std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
-                 << std::endl;
-       if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+       if(trace_level > 0)
+           std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
+                     << std::endl;
+       if(trace_level > 1)
            std::cout << "Problem: " << config->problem << std::endl;
        std::vector<double> times;
        times.reserve(results.size());
@@ -189,22 +191,23 @@ struct compile_plan
            config->solutions.begin(),
            std::back_inserter(times),
            [&](const auto& cr, const auto& solution) {
-               if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+               if(trace_level > 1)
                    std::cout << "Benchmarking solution: " << solution << std::endl;
                if(not cr.has_value())
                {
-                   if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+                   if(trace_level > 1)
                        std::cout << "No binary" << std::endl;
                    return std::numeric_limits<double>::max();
                }
                auto t = time_op(
                    *ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20);
-               if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+               if(trace_level > 1)
                    std::cout << t << "ms" << std::endl;
                return t;
            });
        auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
-       std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;
+       if(trace_level > 0)
+           std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;
        pc.insert(preop.name(), config->problem, config->solutions.at(i));
        if(not results[i].has_value())
            MIGRAPHX_THROW("No valid tuned compilation.");
...
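The benchmark trace now treats MIGRAPHX_TRACE_BENCHMARKING as an integer level rather than an on/off switch: level 1 prints the per-op summary and the fastest solution, level 2 additionally prints per-solution details. A rough standalone sketch of that pattern (trace_level below is a stand-in for the value_of helper, not MIGraphX code):

#include <cstdlib>
#include <iostream>

// Read an integer trace level from the environment; 0 when unset or non-numeric.
static int trace_level()
{
    const char* v = std::getenv("MIGRAPHX_TRACE_BENCHMARKING");
    return v == nullptr ? 0 : std::atoi(v);
}

int main()
{
    const int level = trace_level();
    if(level > 0)
        std::cout << "Benchmarking gemm: 42 configs\n";
    if(level > 1)
        std::cout << "Benchmarking solution: 7\n";
}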
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto comp_shape    = arg.get_shape();
    auto out_lens      = comp_shape.lens();
    auto dim_0         = out_lens.size() - 2;
    auto dim_1         = out_lens.size() - 1;
    std::size_t lda    = comp_shape.strides()[dim_0];
    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = comp_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);
                std::size_t i_m    = idx[dim_1];
                std::size_t i_k    = idx[dim_0];
                std::size_t offset = ii / m_size * m_size;
                out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
                    in_ptr[i_m + i_k * lda + offset];
            });
        });
    });
}

void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
    auto trans_shape = arg.get_shape();
    auto out_lens    = trans_shape.lens();
    auto dim_0       = trans_shape.lens().size() - 2;
    auto dim_1       = trans_shape.lens().size() - 1;
    std::size_t ldb  = trans_shape.strides()[dim_1];
    auto wrap_lens   = out_lens;
    std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
    shape comp_shape{trans_shape.type(), wrap_lens};
    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = comp_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);
                std::size_t i_n    = idx[dim_1];
                std::size_t i_k    = idx[dim_0];
                std::size_t offset = ii / m_size * m_size;
                out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
                    in_ptr[i_n + i_k * ldb + offset];
            });
        });
    });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
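For readers wondering what the deleted kernels above did: they repacked int8 matrices into the 4-wide interleaved layout that older rocBLAS int8 GEMM paths expected, with groups of four consecutive K values per column stored contiguously. Below is a rough host-side sketch of the pack-A index math, for illustration only; pack_int8_a is a hypothetical helper, not MIGraphX code.

#include <cstddef>
#include <cstdint>
#include <vector>

// Repack a row-major {K, M} int8 matrix so each column's K values are grouped in blocks of 4.
std::vector<int8_t> pack_int8_a(const std::vector<int8_t>& in, std::size_t k, std::size_t m)
{
    const std::size_t nb  = 4; // pack width along K
    const std::size_t lda = m; // row stride of the row-major input
    // The packed buffer rounds K up to a multiple of 4 so partial groups still fit
    const std::size_t k_padded = (k + nb - 1) / nb * nb;
    std::vector<int8_t> out(k_padded * m, 0);
    for(std::size_t i_k = 0; i_k < k; ++i_k)
        for(std::size_t i_m = 0; i_m < m; ++i_m)
            out[i_k % nb + (i_m + (i_k / nb) * lda) * nb] = in[i_m + i_k * lda];
    return out;
}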
@@ -42,7 +42,7 @@ struct compile_miopen
    context* ctx = nullptr;
    std::string name() const { return "gpu::compile_miopen"; }
    void apply(module& m) const;
-   std::size_t compile(operation& op, instruction_ref ins, bool format) const;
+   std::size_t compile(operation& op, instruction_ref ins) const;
};

} // namespace gpu
...
@@ -57,7 +57,6 @@ template <class Op>
struct miopen_convolution
{
    Op op;
-   bool int8_x4_format = false;
    shared<convolution_descriptor> cd = nullptr;
    miopenConvFwdAlgorithm_t algo{};
#ifdef MIGRAPHX_HAS_FIND_2_API
@@ -74,7 +73,6 @@ struct miopen_convolution
                         f(self.solution_object, "solution_object"),
#endif
                         f(self.algo, "algo"),
-                        f(self.int8_x4_format, "int8_x4_format"),
                         f(self.solution_id, "solution_id"));
    }
@@ -94,9 +92,9 @@ struct miopen_convolution
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
-       auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
-       auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
+       auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
+       auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
        auto y_desc = make_tensor(reshape_if_1d(output_shape));

        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
        auto workspace_size        = args[2].get_shape().bytes();
@@ -162,8 +160,8 @@ struct miopen_convolution
    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        shape workspace_shape{};
-       auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-       auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+       auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+       auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
        auto y_desc = make_tensor(reshape_if_1d(output_shape));

        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
@@ -179,13 +177,8 @@ struct miopen_convolution
        workspace_shape = shape{shape::int8_type, {workspace_size}};

-       auto x_shape = inputs[0];
-       auto w_shape = inputs[1];
-       if(int8_x4_format)
-       {
-           x_shape = pack_int8_shape(x_shape);
-           w_shape = pack_int8_shape(w_shape);
-       }
+       const auto& x_shape = inputs[0];
+       const auto& w_shape = inputs[1];

#ifdef MIGRAPHX_HAS_FIND_2_API
        {
@@ -327,8 +320,8 @@ struct miopen_convolution
                           ": workspace has changed during finalization.");
        }

-       auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-       auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+       auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+       auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
        auto y_desc = make_tensor(reshape_if_1d(output_shape));

        auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
@@ -347,21 +340,6 @@ struct miopen_convolution
    {
        return shapes.size() - 1;
    }
-   inline shape pack_int8_shape(const shape& s) const
-   {
-       if(s.type() != shape::int8_type)
-       {
-           return s;
-       }
-       auto lens    = s.lens();
-       auto strides = s.strides();
-       lens[1]      = (lens[1] + 3) / 4 * 4;
-       strides[0]   = strides[1] * lens[1];
-       return {s.type(), lens, strides};
-   }
};

} // namespace gpu
...
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_a(hipStream_t stream,
                                             const argument& result,
                                             const argument& arg);

void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_b(hipStream_t stream,
                                             const argument& result,
                                             const argument& arg);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -40,9 +40,8 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct context;

-void blas_shape(const shape& s);
shape transpose_batch(const shape& s, unsigned trans_batch);
+void blas_shape(const shape& s);

template <class Op>
struct rocblas_gemm
@@ -50,9 +49,9 @@ struct rocblas_gemm
    Op op;
    float alpha = 1;
    float beta  = 0;
-   bool int8_x4_format  = true;
    bool compute_fp32    = false;
    unsigned trans_batch = 0;
+   int32_t solution_idx = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -60,9 +59,9 @@ struct rocblas_gemm
        return pack_join(migraphx::reflect(self.op, f),
                         pack(f(self.alpha, "alpha"),
                              f(self.beta, "beta"),
-                             f(self.int8_x4_format, "int8_x4_format"),
                              f(self.compute_fp32, "compute_fp32"),
-                             f(self.trans_batch, "trans_batch")));
+                             f(self.trans_batch, "trans_batch"),
+                             f(self.solution_idx, "solution_idx")));
    }

    std::string name() const
@@ -78,6 +77,8 @@ struct rocblas_gemm
    {
        std::vector<shape> in_shapes(inputs);
        in_shapes.pop_back();
+       // When the input shapes are A, B, C the GEMM equation is C = αAB + βC, where α and β
+       // are scalars
        check_shapes{in_shapes, *this}.has(2, 3);
        blas_shape(inputs[0]);
        blas_shape(inputs[1]);
@@ -113,17 +114,12 @@ struct rocblas_gemm
        {
            if(this->name() == "gpu::gemm")
            {
-               gemm(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
+               gemm_compute(ctx, output_shape, args, alpha, beta, compute_fp32, solution_idx);
            }
            else
            {
-               gemm(ctx,
-                    output_shape,
-                    args,
-                    int32_t(alpha),
-                    int32_t(beta),
-                    int8_x4_format,
-                    compute_fp32);
+               gemm_compute(
+                   ctx, output_shape, args, int32_t(alpha), int32_t(beta), compute_fp32, solution_idx);
            }
            return args.back();
        }
@@ -132,6 +128,33 @@ struct rocblas_gemm
        {
            return shapes.size() - 1;
        }
+   void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& input_shapes)
+   {
+#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
+       if(enabled(MIGRAPHX_ENABLE_GEMM_TUNING{}) or ctx.get_exhaustive_tune_flag())
+       {
+           if(this->name() == "gpu::gemm")
+           {
+               solution_idx = gemm_finalize(
+                   ctx, output_shape, input_shapes, alpha, beta, compute_fp32, solution_idx);
+           }
+           else
+           {
+               solution_idx = gemm_finalize(ctx,
+                                            output_shape,
+                                            input_shapes,
+                                            int32_t(alpha),
+                                            int32_t(beta),
+                                            compute_fp32,
+                                            solution_idx);
+           }
+       }
+#else
+       // suppress compiler warnings
+       (void)ctx, (void)output_shape, (void)input_shapes;
+#endif
+   }
};

} // namespace gpu
...
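The comment added to compute_shape spells out the underlying BLAS contract; for reference, here is a naive single-threaded version of C = alpha * A * B + beta * C in row-major layout (illustrative only; rocBLAS and the tuned solutions selected above do not compute it this way).

#include <cstddef>
#include <vector>

// Naive GEMM: C (m x n) = alpha * A (m x k) * B (k x n) + beta * C, all row-major.
void gemm_reference(std::size_t m, std::size_t n, std::size_t k,
                    float alpha, const std::vector<float>& a,
                    const std::vector<float>& b, float beta, std::vector<float>& c)
{
    for(std::size_t i = 0; i < m; ++i)
    {
        for(std::size_t j = 0; j < n; ++j)
        {
            float acc = 0.0f;
            for(std::size_t p = 0; p < k; ++p)
                acc += a[i * k + p] * b[p * n + j];
            c[i * n + j] = alpha * acc + beta * c[i * n + j];
        }
    }
}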