Commit 7d344568 authored by charlie's avatar charlie
Browse files

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_dim_onnx_parser

parents 69e4955e e16faac2
...@@ -20,7 +20,6 @@ struct cpu_copy : reduce_dims_base, auto_register_op<cpu_copy> ...@@ -20,7 +20,6 @@ struct cpu_copy : reduce_dims_base, auto_register_op<cpu_copy>
return inputs.at(1); return inputs.at(1);
} }
argument argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{ {
argument result = get_arg(args, args.size() - 1); argument result = get_arg(args, args.size() - 1);
......
...@@ -26,7 +26,6 @@ struct cpu_gather : auto_register_op<cpu_gather> ...@@ -26,7 +26,6 @@ struct cpu_gather : auto_register_op<cpu_gather>
} }
argument argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{ {
std::size_t nelements = output_shape.elements(); std::size_t nelements = output_shape.elements();
......
...@@ -323,7 +323,6 @@ struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>> ...@@ -323,7 +323,6 @@ struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
return {s.type(), s.lens()}; return {s.type(), s.lens()};
} }
argument argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{ {
argument result = get_arg(args, args.size() - 1); argument result = get_arg(args, args.size() - 1);
...@@ -362,7 +361,6 @@ struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>> ...@@ -362,7 +361,6 @@ struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
} }
argument argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{ {
argument result = get_arg(args, args.size() - 1); argument result = get_arg(args, args.size() - 1);
......
...@@ -131,6 +131,7 @@ add_library(migraphx_gpu ...@@ -131,6 +131,7 @@ add_library(migraphx_gpu
clip.cpp clip.cpp
code_object_op.cpp code_object_op.cpp
compile_ops.cpp compile_ops.cpp
compile_gen.cpp
compile_hip.cpp compile_hip.cpp
compile_hip_code_object.cpp compile_hip_code_object.cpp
compiler.cpp compiler.cpp
......
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/stringutils.hpp>
#include <algorithm>
#include <numeric>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace gen {
static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
{
    // Half-precision data is only packed as half2, so when every input is
    // half we offer just a width of 2; otherwise try 4 first, then 2.
    const bool has_non_half = std::any_of(inputs.begin(), inputs.end(), [](const auto& s) {
        return s.type() != shape::half_type;
    });
    if(has_non_half)
        return {4, 2};
    return {2};
}
vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
{
    // Candidate vector widths, largest first.
    const auto candidates = vector_sizes(inputs);
    // The usable width is the minimum, over all inputs, of the widest
    // candidate each input supports along `axis`. Every per-input result is
    // bounded by candidates.front(), so that is a safe starting value.
    std::size_t vec = candidates.front();
    for(const auto& input : inputs)
    {
        const auto stride     = input.strides()[axis];
        const auto len        = input.lens()[axis];
        std::size_t supported = 1;
        // Only unit-stride or broadcast (stride 0) axes can be vectorized.
        if(stride == 0 or stride == 1)
        {
            if(len == 1 and input.elements() > candidates.front())
            {
                // Length-1 axis on a larger tensor: it can absorb any width.
                supported = candidates.front();
            }
            else
            {
                // Otherwise take the first candidate that divides the length.
                auto fit = std::find_if(candidates.begin(), candidates.end(), [&](auto width) {
                    return (len % width) == 0;
                });
                if(fit != candidates.end())
                    supported = *fit;
            }
        }
        vec = std::min(vec, supported);
    }
    return {vec, axis};
}
std::string vectorize::str() const
{
    // Render as a kernel transformer expression, e.g. "vectorize<4, 1>()".
    std::string result = "vectorize<";
    result += to_string(size);
    result += ", ";
    result += to_string(axis);
    result += ">()";
    return result;
}
preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
{
    // LDS budget available for preloaded arguments.
    const std::size_t max_lds_bytes = 4096;
    // Flag each broadcast input (stride 0 along `axis`) and tally the bytes
    // those flagged inputs would occupy, in a single pass.
    std::vector<bool> flags;
    flags.reserve(inputs.size());
    std::size_t bytes = 0;
    for(const auto& input : inputs)
    {
        const bool is_broadcast = input.strides()[axis] == 0;
        flags.push_back(is_broadcast);
        if(is_broadcast)
            bytes += input.bytes();
    }
    // If everything fits in LDS, preload all flagged inputs; otherwise
    // preload nothing.
    // TODO: Try to partially preload items
    if(bytes >= max_lds_bytes)
        std::fill(flags.begin(), flags.end(), false);
    return {flags};
}
std::string preload::str() const
{
    // Render as "auto_preload<false, b0, b1, ...>(idx)". The last entry of
    // `args` is deliberately skipped — presumably it is the output argument,
    // which is never preloaded (TODO confirm with callers). Note this assumes
    // `args` is non-empty.
    std::vector<std::string> flags;
    for(auto it = args.begin(); it != std::prev(args.end()); ++it)
        flags.push_back(*it ? "true" : "false");
    return "auto_preload<false, " + join_strings(flags, ", ") + ">(idx)";
}
// Returns true when at least one argument is flagged for preloading.
// Uses std::any_of, which short-circuits on the first set flag, instead of
// std::accumulate with logical_or, which always scans the whole vector.
bool preload::is_preloading() const
{
    return std::any_of(args.begin(), args.end(), [](bool b) { return b; });
}
std::size_t find_fast_axis(const std::vector<shape>& inputs)
{
auto permutation = find_permutation(inputs);
auto it = std::max_element(permutation.begin(), permutation.end());
return it - permutation.begin();
}
std::string make_transformer_args(std::vector<std::string> transformers)
{
    // Joins already-stringified transformer expressions (e.g. the results of
    // vectorize::str() / preload::str()) into a comma-separated list suitable
    // for splicing into a kernel source template.
    return join_strings(std::move(transformers), ", ");
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -134,7 +134,6 @@ struct hiprtc_program ...@@ -134,7 +134,6 @@ struct hiprtc_program
std::vector<char> buffer(n); std::vector<char> buffer(n);
MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data())); MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
assert(buffer.back() == 0); assert(buffer.back() == 0);
// cppcheck-suppress returnDanglingLifetime
return {buffer.begin(), buffer.end() - 1}; return {buffer.begin(), buffer.end() - 1};
} }
......
...@@ -119,8 +119,21 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over) ...@@ -119,8 +119,21 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
}; };
} }
// Picks a power-of-two block size for n work items: the largest power of two
// that is <= max_block_size and <= n, but never smaller than 64 (the loop
// starts at 128 and the final halving bounds the result below at 64).
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
    std::size_t candidate = 128;
    while(candidate <= max_block_size and candidate <= n)
        candidate <<= 1u;
    return candidate >> 1u;
}
operation compile_hip_code_object(const std::string& content, hip_compile_options options) operation compile_hip_code_object(const std::string& content, hip_compile_options options)
{ {
assert(options.global > 0);
assert(options.local > 0);
assert(not options.inputs.empty());
assert(options.inputs.size() == options.virtual_inputs.size() or
options.virtual_inputs.empty());
std::vector<src_file> srcs; std::vector<src_file> srcs;
std::transform(migraphx_kernels().begin(), std::transform(migraphx_kernels().begin(),
migraphx_kernels().end(), migraphx_kernels().end(),
......
...@@ -681,7 +681,7 @@ struct miopen_fusion ...@@ -681,7 +681,7 @@ struct miopen_fusion
struct miopen_conv_bias struct miopen_conv_bias
{ {
op::convolution op; op::convolution op;
fusion f = {}; fusion fp = {};
fusion::op_t conv = {}; fusion::op_t conv = {};
fusion::op_t bias = {}; fusion::op_t bias = {};
...@@ -705,19 +705,19 @@ struct miopen_conv_bias ...@@ -705,19 +705,19 @@ struct miopen_conv_bias
float beta = 0; float beta = 0;
miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit()); miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit()); miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
return f.execute(ctx, fargs, args[0], args[4]); return fp.execute(ctx, fargs, args[0], args[4]);
} }
void finalize(context& ctx, const shape&, const std::vector<shape>& inputs) void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
{ {
f = fusion(inputs[0]); fp = fusion(inputs[0]);
conv = f.create_conv(op, inputs[1]); conv = fp.create_conv(op, inputs[1]);
bias = f.create_bias(inputs[3]); bias = fp.create_bias(inputs[3]);
if(not f.compile(ctx)) if(not fp.compile(ctx))
MIGRAPHX_THROW("Failed to compile fusion plan"); MIGRAPHX_THROW("Failed to compile fusion plan");
} }
shape get_workspace(context& ctx) { return f.get_workspace(ctx); } shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{ {
return shapes.size() - 1; return shapes.size() - 1;
...@@ -728,7 +728,7 @@ MIGRAPHX_REGISTER_OP(miopen_conv_bias) ...@@ -728,7 +728,7 @@ MIGRAPHX_REGISTER_OP(miopen_conv_bias)
struct miopen_conv_bias_relu struct miopen_conv_bias_relu
{ {
op::convolution op; op::convolution op;
fusion f = {}; fusion fp = {};
fusion::op_t conv = {}; fusion::op_t conv = {};
fusion::op_t bias = {}; fusion::op_t bias = {};
fusion::op_t relu = {}; fusion::op_t relu = {};
...@@ -754,18 +754,18 @@ struct miopen_conv_bias_relu ...@@ -754,18 +754,18 @@ struct miopen_conv_bias_relu
miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit()); miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit()); miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0); miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0);
return f.execute(ctx, fargs, args[0], args[4]); return fp.execute(ctx, fargs, args[0], args[4]);
} }
void finalize(context& ctx, const shape&, const std::vector<shape>& inputs) void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
{ {
f = fusion(inputs[0]); fp = fusion(inputs[0]);
conv = f.create_conv(op, inputs[1]); conv = fp.create_conv(op, inputs[1]);
bias = f.create_bias(inputs[3]); bias = fp.create_bias(inputs[3]);
relu = f.create_relu(); relu = fp.create_relu();
f.compile(ctx); fp.compile(ctx);
} }
shape get_workspace(context& ctx) { return f.get_workspace(ctx); } shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{ {
return shapes.size() - 1; return shapes.size() - 1;
...@@ -875,7 +875,6 @@ struct find_conv_pointwise ...@@ -875,7 +875,6 @@ struct find_conv_pointwise
{ {
if(i.name()[0] == '@') if(i.name()[0] == '@')
continue; continue;
auto inputs = to_shapes(i.inputs());
op.ops.push_back({{i.get_operator()}}); op.ops.push_back({{i.get_operator()}});
} }
std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins}; std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins};
...@@ -966,7 +965,7 @@ struct find_gemm_pointwise ...@@ -966,7 +965,7 @@ struct find_gemm_pointwise
inputs.pop_back(); inputs.pop_back();
inputs.push_back(c_ins); inputs.push_back(c_ins);
inputs.push_back(gemm_ins->inputs().back()); inputs.push_back(ins->inputs().back());
gemm.beta = 1; gemm.beta = 1;
m.replace_instruction(ins, gemm, inputs); m.replace_instruction(ins, gemm, inputs);
......
...@@ -34,6 +34,10 @@ struct code_object_op ...@@ -34,6 +34,10 @@ struct code_object_op
f(self.output, "output")); f(self.output, "output"));
} }
value attributes() const { return {{"group", group()}}; }
std::string group() const { return "gpu::code_object::" + symbol_name; }
std::string name() const { return "gpu::code_object"; } std::string name() const { return "gpu::code_object"; }
shape compute_shape(std::vector<shape> inputs) const; shape compute_shape(std::vector<shape> inputs) const;
argument argument
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#include <migraphx/config.hpp>
#include <string>
#include <unordered_map>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct shape;
namespace gpu {
namespace gen {
// Describes how kernel arguments are vectorized along one axis; rendered via
// str() as a transformer expression for generated kernel source.
struct vectorize
{
    // Vector width to use (1 means no vectorization).
    std::size_t size = 1;
    // Axis along which elements are vectorized.
    std::size_t axis = 0;
    // Picks the widest vector size supported by all inputs along `axis`.
    static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
    // Renders as a kernel transformer expression, e.g. "vectorize<4, 1>()".
    std::string str() const;
};
// Describes which kernel arguments should be preloaded; rendered via str()
// as a transformer expression for generated kernel source.
struct preload
{
    // Per-argument flags: true means that argument is preloaded.
    std::vector<bool> args = {};
    // Flags broadcast arguments (stride 0 along `axis`) for preloading,
    // provided they fit the preload byte budget.
    static preload broadcasts(std::size_t axis, const std::vector<shape>& inputs);
    // True if any argument is flagged for preloading.
    bool is_preloading() const;
    // Renders as a kernel transformer expression ("auto_preload<...>(idx)").
    std::string str() const;
};
std::size_t find_fast_axis(const std::vector<shape>& inputs);
std::string make_transformer_args(std::vector<std::string> transformers);
// Convenience overload: stringifies each transformer via its str() member
// and forwards the results to the std::vector<std::string> overload.
template <class... Ts>
std::string make_transformer_args(Ts... xs)
{
    return make_transformer_args({xs.str()...});
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
...@@ -46,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1); ...@@ -46,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
operation compile_hip_code_object(const std::string& content, hip_compile_options options); operation compile_hip_code_object(const std::string& content, hip_compile_options options);
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP #define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
#include <migraphx/shape.hpp> #include <migraphx/shape.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/op/quant_convolution.hpp> #include <migraphx/op/quant_convolution.hpp>
#include <migraphx/gpu/miopen.hpp> #include <migraphx/gpu/miopen.hpp>
...@@ -14,6 +15,7 @@ struct context; ...@@ -14,6 +15,7 @@ struct context;
struct miopen_quant_convolution struct miopen_quant_convolution
{ {
op::quant_convolution op; op::quant_convolution op;
bool int8_x4_format = false;
shared<convolution_descriptor> cd; shared<convolution_descriptor> cd;
miopenConvFwdAlgorithm_t algo{}; miopenConvFwdAlgorithm_t algo{};
miopenHandle_t handle = nullptr; miopenHandle_t handle = nullptr;
...@@ -22,7 +24,8 @@ struct miopen_quant_convolution ...@@ -22,7 +24,8 @@ struct miopen_quant_convolution
static auto reflect(Self& self, F f) static auto reflect(Self& self, F f)
{ {
// TODO: Add algo // TODO: Add algo
return op::quant_convolution::reflect(self.op, f); return pack_join(migraphx::reflect(self.op, f),
pack(f(self.int8_x4_format, "int8_x4_format")));
} }
std::string name() const { return "gpu::quant_convolution"; } std::string name() const { return "gpu::quant_convolution"; }
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <migraphx/gpu/context.hpp> #include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp> #include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp> #include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/cpp_generator.hpp> #include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp> #include <migraphx/ranges.hpp>
...@@ -17,6 +18,8 @@ namespace migraphx { ...@@ -17,6 +18,8 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
using namespace migraphx::gpu::gen; // NOLINT
static const char* const pointwise_kernel = R"__migraphx__( static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp> #include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp> #include <migraphx/kernels/pointwise.hpp>
...@@ -30,7 +33,7 @@ extern "C" { ...@@ -30,7 +33,7 @@ extern "C" {
__global__ void ${kernel}(${params}) __global__ void ${kernel}(${params})
{ {
auto idx = make_index(); auto idx = make_index();
pointwise(idx, auto_preload<${preloads}>(idx), vectorize<${vec_size}, ${axis}>())(${lambda}, ${args}); pointwise(idx, ${transformers})(${lambda}, ${args});
} }
} }
...@@ -62,75 +65,6 @@ struct pointwise_compiler : compiler<pointwise_compiler> ...@@ -62,75 +65,6 @@ struct pointwise_compiler : compiler<pointwise_compiler>
else else
return 1; return 1;
} }
static std::size_t find_fast_axis(const std::vector<shape>& inputs)
{
auto permutation = find_permutation(inputs);
auto it = std::max_element(permutation.begin(), permutation.end());
return it - permutation.begin();
}
static std::vector<bool> preload(std::size_t axis, const std::vector<shape>& inputs)
{
const std::size_t max_lds_bytes = 4096;
std::vector<bool> result;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(result),
[&](const shape& input) { return input.strides()[axis] == 0; });
auto bytes = std::inner_product(inputs.begin(),
inputs.end(),
result.begin(),
std::size_t{0},
std::plus<>{},
[](const shape& s, bool b) -> std::size_t {
if(b)
return s.bytes();
return 0;
});
if(bytes < max_lds_bytes)
return result;
// TODO: Try to partially preload items
std::fill(result.begin(), result.end(), false);
return result;
}
static std::string preload_str(const std::vector<bool>& bs)
{
std::vector<std::string> bool_strs;
std::transform(bs.begin(), std::prev(bs.end()), std::back_inserter(bool_strs), [](bool b) {
if(b)
return "true";
return "false";
});
return "false, " + join_strings(bool_strs, ", ");
}
static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
{
// If all inputs is half then only use half2
if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
return s.type() == shape::half_type;
}))
return {2};
return {4, 2};
}
static auto vectorize_elements(std::size_t axis, const std::vector<shape>& inputs)
{
auto sizes = vector_sizes(inputs);
std::vector<std::size_t> max_vec_size;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(max_vec_size),
[&](const auto& input) -> std::size_t {
auto stride = input.strides()[axis];
auto len = input.lens()[axis];
if(stride != 0 and stride != 1)
return 1;
auto it = std::find_if(
sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
if(it != sizes.end())
return *it;
return 1;
});
return *std::min_element(max_vec_size.begin(), max_vec_size.end());
}
operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
{ {
hip_compile_options options; hip_compile_options options;
...@@ -139,23 +73,20 @@ struct pointwise_compiler : compiler<pointwise_compiler> ...@@ -139,23 +73,20 @@ struct pointwise_compiler : compiler<pointwise_compiler>
options.virtual_inputs = reduce_dims(inputs); options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal"; options.params = "-Wno-float-equal";
auto axis = find_fast_axis(options.virtual_inputs); auto axis = find_fast_axis(options.virtual_inputs);
auto vec_size = vectorize_elements(axis, options.virtual_inputs); auto vec = vectorize::elements(axis, options.virtual_inputs);
auto preloads = preload(axis, options.virtual_inputs); auto preloads = preload::broadcasts(axis, options.virtual_inputs);
auto is_preloading =
std::accumulate(preloads.begin(), preloads.end(), false, std::logical_or<>{});
options.kernel_name = v.get("kernel", "kernel"); options.kernel_name = v.get("kernel", "kernel");
options.set_launch_params(v, options.set_launch_params(
v,
compute_global_for(ctx, compute_global_for(ctx,
options.output.elements() / vec_size, options.output.elements() / vec.size,
oversubscribe_if(not is_preloading))); oversubscribe_if(not preloads.is_preloading())));
auto src = interpolate_string(pointwise_kernel, auto src = interpolate_string(pointwise_kernel,
{{"kernel", options.kernel_name}, {{"kernel", options.kernel_name},
{"params", enum_params(inputs.size(), "void * private_p")}, {"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")}, {"args", enum_params(inputs.size(), "private_p")},
{"lambda", v.at("lambda").to<std::string>()}, {"lambda", v.at("lambda").to<std::string>()},
{"vec_size", std::to_string(vec_size)}, {"transformers", make_transformer_args(preloads, vec)},
{"axis", std::to_string(axis)},
{"preloads", preload_str(preloads)},
{"preamble", v.get("preamble", std::string{})}}); {"preamble", v.get("preamble", std::string{})}});
return compile_hip_code_object(src, options); return compile_hip_code_object(src, options);
} }
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <migraphx/gpu/context.hpp> #include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp> #include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp> #include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/cpp_generator.hpp> #include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp> #include <migraphx/ranges.hpp>
...@@ -16,9 +17,12 @@ namespace migraphx { ...@@ -16,9 +17,12 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
using namespace migraphx::gpu::gen; // NOLINT
static const char* const simple_reduce_kernel = R"__migraphx__( static const char* const simple_reduce_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp> #include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/reduce.hpp> #include <migraphx/kernels/reduce.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <args.hpp> #include <args.hpp>
namespace migraphx { namespace migraphx {
...@@ -26,9 +30,10 @@ namespace migraphx { ...@@ -26,9 +30,10 @@ namespace migraphx {
${preamble} ${preamble}
extern "C" { extern "C" {
__global__ void kernel(void* input_p, void* output_p) __global__ void reduce_kernel(void* input_p, void* output_p)
{ {
make_tensors()(input_p, output_p)([](auto input, auto output) {
transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
simple_reduce<reduce::${algo}>(${reduction}, ${init}, input, output, ${read}, ${write}); simple_reduce<reduce::${algo}>(${reduction}, ${init}, input, output, ${read}, ${write});
}); });
...@@ -40,14 +45,6 @@ __global__ void kernel(void* input_p, void* output_p) ...@@ -40,14 +45,6 @@ __global__ void kernel(void* input_p, void* output_p)
)__migraphx__"; )__migraphx__";
constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024)
{
size_t block_size = 128;
while(block_size <= max_block_size and block_size <= n)
block_size *= 2;
return block_size / 2;
}
static std::size_t get_reduce_elements(const std::vector<shape>& inputs) static std::size_t get_reduce_elements(const std::vector<shape>& inputs)
{ {
return inputs.front().elements() / inputs.back().elements(); return inputs.front().elements() / inputs.back().elements();
...@@ -101,25 +98,34 @@ struct reduce_compiler : compiler<reduce_compiler> ...@@ -101,25 +98,34 @@ struct reduce_compiler : compiler<reduce_compiler>
operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
{ {
hip_compile_options options; hip_compile_options options;
auto reduce_elements = get_reduce_elements(inputs); options.inputs = inputs;
auto algo = v.get("algo", get_reduce_algo(inputs)); options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
auto faxis = find_fast_axis({options.virtual_inputs.front()});
vectorize vec{};
// Vectorize if the axis is a reduction axis
if(options.virtual_inputs.back().lens()[faxis] == 1)
{
vec = vectorize::elements(faxis, options.virtual_inputs);
}
auto relements = get_reduce_elements(options.virtual_inputs) / vec.size;
auto nelements = options.virtual_inputs.back().elements();
auto algo = v.get("algo", get_reduce_algo(options.virtual_inputs));
if(algo == "block") if(algo == "block")
{ {
auto block_size = compute_block_size(reduce_elements, 256); auto block_size = compute_block_size(relements, 256);
options.set_launch_params( options.set_launch_params(
v, compute_global_for(ctx, inputs.back().elements() * block_size, 256), block_size); v, compute_global_for(ctx, nelements * block_size, 256), block_size);
} }
else if(algo == "lane") else if(algo == "lane")
{ {
options.set_launch_params(v, compute_global_for(ctx, inputs.back().elements(), 256)); options.set_launch_params(v, compute_global_for(ctx, nelements, 256));
} }
else else
{ {
MIGRAPHX_THROW("Unknown reduce algo: " + algo); MIGRAPHX_THROW("Unknown reduce algo: " + algo);
} }
options.inputs = inputs; options.kernel_name = "reduce_kernel";
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
std::string identity = "[](auto x) { return x; }"; std::string identity = "[](auto x) { return x; }";
auto src = interpolate_string(simple_reduce_kernel, auto src = interpolate_string(simple_reduce_kernel,
{{"reduction", v.at("reduction").to<std::string>()}, {{"reduction", v.at("reduction").to<std::string>()},
...@@ -127,6 +133,7 @@ struct reduce_compiler : compiler<reduce_compiler> ...@@ -127,6 +133,7 @@ struct reduce_compiler : compiler<reduce_compiler>
{"read", v.get("read", identity)}, {"read", v.get("read", identity)},
{"write", v.get("write", identity)}, {"write", v.get("write", identity)},
{"algo", algo}, {"algo", algo},
{"transformers", make_transformer_args(vec)},
{"preamble", v.get("preamble", std::string{})}}); {"preamble", v.get("preamble", std::string{})}});
options.params += "-Wno-float-equal"; options.params += "-Wno-float-equal";
return compile_hip_code_object(src, options); return compile_hip_code_object(src, options);
......
...@@ -59,6 +59,8 @@ void launch_kernel(hipFunction_t fun, ...@@ -59,6 +59,8 @@ void launch_kernel(hipFunction_t fun,
void* kernargs, void* kernargs,
std::size_t size) std::size_t size)
{ {
assert(global > 0);
assert(local > 0);
void* config[] = { void* config[] = {
// HIP_LAUNCH_PARAM_* are macros that do horrible things // HIP_LAUNCH_PARAM_* are macros that do horrible things
#ifdef MIGRAPHX_USE_CLANG_TIDY #ifdef MIGRAPHX_USE_CLANG_TIDY
......
...@@ -163,9 +163,12 @@ struct block ...@@ -163,9 +163,12 @@ struct block
__device__ auto reduce(Op op, T init, Read read) const __device__ auto reduce(Op op, T init, Read read) const
{ {
return sliced(slicer, [=](auto x, auto... xs) { return sliced(slicer, [=](auto x, auto... xs) {
return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) { return vec_reduce(block_reduce(idx,
return read(x[j], xs[j]...); op,
}); init,
x.get_shape().elements(),
[&](auto j) { return read(x[j], xs[j]...); }),
op);
}); });
} }
......
...@@ -146,5 +146,19 @@ constexpr auto vec_packed_transform(Ts... xs) ...@@ -146,5 +146,19 @@ constexpr auto vec_packed_transform(Ts... xs)
}; };
} }
// Horizontally reduces the lanes of a vector value into a single scalar by
// folding them with `op`. Non-vector values (vec_size < 2) are returned
// unchanged.
template <class T, class Op>
constexpr auto vec_reduce(T x, Op op)
{
    if constexpr(vec_size<T>() < 2)
        return x;
    else
    {
        // Seed with lane 0, then fold in the remaining lanes left-to-right.
        vec_type<T> result = x[0];
        for(int i = 1; i < vec_size<T>(); i++)
            result = op(result, x[i]);
        return result;
    }
}
} // namespace migraphx } // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP #endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
...@@ -213,7 +213,9 @@ template <index_int N, index_int Axis, class T> ...@@ -213,7 +213,9 @@ template <index_int N, index_int Axis, class T>
__device__ __host__ auto vectorize_tensor(T x) __device__ __host__ auto vectorize_tensor(T x)
{ {
constexpr auto shape = get_shape_c<T>{}; constexpr auto shape = get_shape_c<T>{};
if constexpr(shape.strides[Axis] == 0) if constexpr(shape.lens[Axis] == 1)
return x;
else if constexpr(shape.strides[Axis] == 0)
return tensor_step<N>(x, _c<Axis>); return tensor_step<N>(x, _c<Axis>);
else else
return as_vec<N>(x, _c<Axis>); return as_vec<N>(x, _c<Axis>);
......
...@@ -365,8 +365,22 @@ struct miopen_apply ...@@ -365,8 +365,22 @@ struct miopen_apply
{ {
apply_map.emplace("quant_convolution", [=](instruction_ref ins) { apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
auto&& op = any_cast<op::quant_convolution>(ins->get_operator()); auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
auto conv = miopen_quant_convolution{op, make_conv(op)}; shape ws;
auto ws = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs())); miopen_quant_convolution conv;
auto compile_quant_conv_with_format = [&](bool format) {
conv = miopen_quant_convolution{op, format, make_conv(op)};
ws = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
};
try
{
compile_quant_conv_with_format(int8_x4_format);
}
catch(migraphx::exception&)
{
// In case no solver supports the default format, retry using the other format.
compile_quant_conv_with_format(!int8_x4_format);
}
auto args = ins->inputs(); auto args = ins->inputs();
auto workspace = insert_allocation(ins, ws, "workspace"); auto workspace = insert_allocation(ins, ws, "workspace");
......
...@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset) ...@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset)
auto pad_k = (k + 3) / 4 * 4; auto pad_k = (k + 3) / 4 * 4;
auto pad_lens = lens; auto pad_lens = lens;
pad_lens[lens.size() + offset] = pad_k; pad_lens[lens.size() + offset] = pad_k;
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
auto ret_ins = ins; auto ret_ins = ins;
if(pad_k != k) if(pad_k != k)
{ {
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
pad_dims[lens.size() + offset] = pad_k - k; pad_dims[lens.size() + offset] = pad_k - k;
shape ps{s.type(), pad_lens}; shape ps{s.type(), pad_lens};
auto ins_out = auto ins_out =
...@@ -118,7 +118,7 @@ void pack_int8_args::apply(module& m) const ...@@ -118,7 +118,7 @@ void pack_int8_args::apply(module& m) const
assert(val.contains("int8_x4_format")); assert(val.contains("int8_x4_format"));
if(not val.at("int8_x4_format").to<bool>()) if(not val.at("int8_x4_format").to<bool>())
{ {
return; continue;
} }
auto inputs = ins->inputs(); auto inputs = ins->inputs();
auto lens = inputs.at(0)->get_shape().lens(); auto lens = inputs.at(0)->get_shape().lens();
...@@ -156,6 +156,12 @@ void pack_int8_args::apply(module& m) const ...@@ -156,6 +156,12 @@ void pack_int8_args::apply(module& m) const
} }
else if(ins->name() == "gpu::quant_convolution") else if(ins->name() == "gpu::quant_convolution")
{ {
auto val = ins->get_operator().to_value();
if(not val.at("int8_x4_format").to<bool>())
{
continue;
}
auto inputs = ins->inputs(); auto inputs = ins->inputs();
auto packed_x = m.insert_instruction( auto packed_x = m.insert_instruction(
ins, ins,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment