Commit 522da00b authored by Shucai Xiao, committed by mvermeulen

Cpu conv softmax simplify impl (#397)

* simplify cpu implementation of the convolution, softmax, and logsoftmax

* clang format

* fix cppcheck error

* improve code coverage
parent 1f4f6d44
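
The change collapses the per-operator CPU kernels into wrappers templated on the operator: cpu_convolution<Op> covers op::convolution and op::quant_convolution, and cpu_softmax<Op> covers op::softmax and op::logsoftmax, with the runtime name built as "cpu::" + op.name(). Below is a minimal standalone sketch of that pattern; the names softmax_op, logsoftmax_op, and cpu_softmax_like are illustrative stand-ins, not the MIGraphX types (the real hooks are the op::softmax::output() and op::logsoftmax::output() lambdas added in the first two hunks of this diff).

```cpp
// Illustrative sketch only: a stripped-down model of the wrapper pattern used
// in this change, with made-up operator structs standing in for the real ops.
#include <cmath>
#include <iostream>
#include <string>

struct softmax_op
{
    std::string name() const { return "softmax"; }
    // final step applied to an exp-shifted value x and its per-batch sum y
    auto output() const
    {
        return [](double x, double y) { return x / y; };
    }
};

struct logsoftmax_op
{
    std::string name() const { return "logsoftmax"; }
    auto output() const
    {
        return [](double x, double y) { return std::log(x / y); };
    }
};

// One CPU wrapper serves both operators; only the final lambda differs.
template <class Op>
struct cpu_softmax_like
{
    Op op;
    std::string name() const { return "cpu::" + op.name(); }
    double finalize(double exp_shifted, double batch_sum) const
    {
        return op.output()(exp_shifted, batch_sum);
    }
};

int main()
{
    cpu_softmax_like<softmax_op> s{};
    cpu_softmax_like<logsoftmax_op> ls{};
    std::cout << s.name() << " " << s.finalize(1.0, 4.0) << "\n";   // cpu::softmax 0.25
    std::cout << ls.name() << " " << ls.finalize(1.0, 4.0) << "\n"; // cpu::logsoftmax log(0.25)
}
```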
@@ -30,6 +30,11 @@ struct logsoftmax
         }
         return inputs.at(0);
     }
+
+    auto output() const
+    {
+        return [=](auto x, auto y) { return std::log(x / y); };
+    }
 };
 } // namespace op
...
@@ -30,6 +30,11 @@ struct softmax
         }
         return inputs.at(0);
     }
+
+    auto output() const
+    {
+        return [=](auto x, auto y) { return x / y; };
+    }
 };
 } // namespace op
...
@@ -166,61 +166,10 @@ struct cpu_lrn
     }
 };
 
+template <class Op>
 struct cpu_convolution
 {
-    op::convolution op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "cpu::convolution"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
-            auto in   = input.get_shape().lens();
-            auto in_h = in[2];
-            auto in_w = in[3];
-
-            auto wei   = weights.get_shape().lens();
-            auto wei_n = wei[0];
-            auto wei_c = wei[1];
-            auto wei_h = wei[2];
-            auto wei_w = wei[3];
-
-            par_dfor(output_shape.lens()[0],
-                     output_shape.lens()[1],
-                     output_shape.lens()[2],
-                     output_shape.lens()[3])(
-                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
-                    const auto start_x  = i * op.stride[0] - op.padding[0];
-                    const auto start_y  = j * op.stride[1] - op.padding[1];
-                    const auto group_id = w / (wei_n / op.group);
-
-                    double acc = 0;
-                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
-                        const auto in_x  = start_x + x;
-                        const auto in_y  = start_y + y;
-                        const auto in_ch = group_id * wei_c + k;
-                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
-                        {
-                            acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
-                        }
-                    });
-                    output(o, w, i, j) = acc;
-                });
-        });
-        return result;
-    }
-};
-
-struct cpu_quant_convolution
-{
-    op::quant_convolution op;
+    Op op;
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
@@ -228,47 +177,45 @@ struct cpu_quant_convolution
         return migraphx::reflect(self.op, f);
     }
 
-    std::string name() const { return "cpu::quant_convolution"; }
+    std::string name() const { return "cpu::" + op.name(); }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
     argument compute(context&, shape output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        auto output = result.get<int32_t>();
-        visit_all(args[0], args[1])([&](auto input, auto weights) {
-            auto in   = input.get_shape().lens();
-            auto in_h = in[2];
-            auto in_w = in[3];
-
-            auto wei   = weights.get_shape().lens();
-            auto wei_n = wei[0];
-            auto wei_c = wei[1];
-            auto wei_h = wei[2];
-            auto wei_w = wei[3];
-
-            par_dfor(output_shape.lens()[0],
-                     output_shape.lens()[1],
-                     output_shape.lens()[2],
-                     output_shape.lens()[3])(
-                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
-                    const auto start_x  = i * op.stride[0] - op.padding[0];
-                    const auto start_y  = j * op.stride[1] - op.padding[1];
-                    const auto group_id = w / (wei_n / op.group);
-
-                    int32_t acc = 0;
-                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
-                        const auto in_x  = start_x + x;
-                        const auto in_y  = start_y + y;
-                        const auto in_ch = group_id * wei_c + k;
-                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
-                        {
-                            acc += static_cast<int32_t>(input(o, in_ch, in_x, in_y)) *
-                                   weights(w, k, x, y);
-                        }
-                    });
-                    output(o, w, i, j) = acc;
-                });
-        });
+        result.visit([&](auto output) {
+            using type = typename decltype(output)::value_type;
+            visit_all(args[0], args[1])([&](auto input, auto weights) {
+                auto in   = input.get_shape().lens();
+                auto in_h = in[2];
+                auto in_w = in[3];
+
+                auto wei   = weights.get_shape().lens();
+                auto wei_n = wei[0];
+                auto wei_c = wei[1];
+                auto wei_h = wei[2];
+                auto wei_w = wei[3];
+
+                par_dfor(output_shape.lens()[0],
+                         output_shape.lens()[1],
+                         output_shape.lens()[2],
+                         output_shape.lens()[3])(
+                    [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
+                        const auto start_x  = i * op.stride[0] - op.padding[0];
+                        const auto start_y  = j * op.stride[1] - op.padding[1];
+                        const auto group_id = w / (wei_n / op.group);
+
+                        type acc = type{0};
+                        dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
+                            const auto in_x  = start_x + x;
+                            const auto in_y  = start_y + y;
+                            const auto in_ch = group_id * wei_c + k;
+                            if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
+                                acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
+                        });
+                        output(o, w, i, j) = acc;
+                    });
+            });
+        });
         return result;
     }
 };
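
The merged cpu_convolution<Op> above deduces its accumulator from the output tensor (result.visit plus decltype(output)::value_type), so the same loop accumulates in int32_t for quantized convolution and in the output's float type for regular convolution (the old float path accumulated in double). A rough sketch of the deduction idea, using hypothetical helper names rather than the MIGraphX visitors:

```cpp
// Sketch only: the accumulator type follows the output element type, so one
// templated loop serves both a float convolution and an int8 -> int32 one.
#include <cstdint>
#include <iostream>
#include <vector>

template <class Out, class In, class Wei>
void accumulate_one(Out& out, const std::vector<In>& input, const std::vector<Wei>& weights)
{
    using type = Out; // stands in for `typename decltype(output)::value_type`
    type acc   = type{0};
    for(std::size_t i = 0; i < input.size(); ++i)
        acc += input[i] * weights[i]; // int8 * int8 promotes to int, accumulated in int32_t
    out = acc;
}

int main()
{
    float f_out = 0.0f;
    accumulate_one(f_out, std::vector<float>{0.5f, 1.5f}, std::vector<float>{2.0f, 2.0f});

    int32_t q_out = 0;
    accumulate_one(q_out, std::vector<int8_t>{3, 4}, std::vector<int8_t>{5, 6});

    std::cout << f_out << " " << q_out << "\n"; // 4 39
}
```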
@@ -620,41 +567,25 @@ struct cpu_unary
     {
         check_shapes{inputs}.has(1);
         auto s = inputs.at(0);
-        if(s.packed())
-        {
-            return s;
-        }
-        else
-        {
-            return {s.type(), s.lens()};
-        }
+        return {s.type(), s.lens()};
     }
 
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        result.visit([&](auto output) {
-            args[0].visit([&](auto input) {
-                if(input.get_shape().standard())
-                {
-                    std::transform(input.begin(), input.end(), output.begin(), op.fcn());
-                }
-                else
-                {
-                    shape_for_each(output.get_shape(), [&](const auto& idx) {
-                        output(idx.begin(), idx.end()) = op.fcn()(input(idx.begin(), idx.end()));
-                    });
-                }
-            });
+        visit_all(result, args[0])([&](auto output, auto input) {
+            assert(input.get_shape().standard());
+            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
         });
         return result;
     }
 };
 
+template <class Op>
 struct cpu_softmax
 {
-    op::softmax op;
+    Op op;
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
@@ -662,7 +593,7 @@ struct cpu_softmax
         return migraphx::reflect(self.op, f);
     }
 
-    std::string name() const { return "cpu::softmax"; }
+    std::string name() const { return "cpu::" + op.name(); }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
@@ -701,70 +632,8 @@ struct cpu_softmax
                 for(std::size_t j = 0; j < n_dims; ++j)
                 {
                     idx[op.axis] = j;
-                    output(idx.begin(), idx.end()) /= batch_sum[i];
-                }
-            });
-        });
-
-        return result;
-    }
-};
-
-struct cpu_logsoftmax
-{
-    op::logsoftmax op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "cpu::logsoftmax"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        auto batch_lens     = output_shape.lens();
-        std::size_t n_dims  = batch_lens[op.axis];
-        batch_lens[op.axis] = 1;
-        shape batch_shape{shape::int32_type, batch_lens};
-
-        // use a parallel implementation to acheive better performance
-        // one thread for one batch
-        visit_all(result, args[0])([&](auto output, auto input) {
-            using value_type = typename decltype(input)::value_type;
-            std::vector<value_type> batch_max(batch_shape.elements(),
-                                              std::numeric_limits<value_type>::lowest());
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            par_for(batch_shape.elements(), [&](auto i) {
-                auto idx = batch_shape.multi(i);
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
-                }
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    std::size_t index = output_shape.index(idx);
-                    output[index]     = input[index] - batch_max[i];
-                }
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    batch_sum[i] += std::exp(output(idx.begin(), idx.end()));
-                }
-                batch_sum[i] = std::log(batch_sum[i]);
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    output(idx.begin(), idx.end()) -= batch_sum[i];
-                }
+                    output(idx.begin(), idx.end()) =
+                        op.output()(output(idx.begin(), idx.end()), batch_sum[i]);
+                }
             });
         });
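
The unified cpu_softmax<Op> keeps the shared steps (subtract the per-batch max, exponentiate, accumulate the batch sum) and applies op.output() only at the end: x / y for softmax and std::log(x / y) for logsoftmax. A small standalone check of the identity this relies on, log(exp(x_i - m) / s) = (x_i - m) - log(s), which is exactly what the removed cpu_logsoftmax computed explicitly (this snippet is not part of the PR's test suite):

```cpp
// Numeric sanity check of the softmax/logsoftmax unification.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

int main()
{
    std::vector<double> x{1.0, 2.0, 3.0};
    double m = *std::max_element(x.begin(), x.end());

    // shared pipeline: e_i = exp(x_i - m), s = sum of e_i
    std::vector<double> e(x.size());
    double s = 0;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        e[i] = std::exp(x[i] - m);
        s += e[i];
    }

    for(std::size_t i = 0; i < x.size(); ++i)
    {
        double softmax    = e[i] / s;           // op::softmax::output()
        double logsoftmax = std::log(e[i] / s); // op::logsoftmax::output()
        // matches the classic (x_i - max) - log(sum) formulation
        assert(std::abs(logsoftmax - ((x[i] - m) - std::log(s))) < 1e-12);
        assert(std::abs(std::exp(logsoftmax) - softmax) < 1e-12);
    }
    return 0;
}
```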
@@ -794,17 +663,18 @@ struct cpu_apply
     {
         apply_map["batch_norm_inference"] =
             extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
+        apply_map["convolution"] = extend_op<cpu_convolution<op::convolution>, op::convolution>();
         apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
         apply_map["quant_dot"]   = extend_op<cpu_quant_gemm, op::quant_dot>();
-        apply_map["quant_convolution"] = extend_op<cpu_quant_convolution, op::quant_convolution>();
+        apply_map["quant_convolution"] =
+            extend_op<cpu_convolution<op::quant_convolution>, op::quant_convolution>();
         apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
         apply_map["im2col"]     = extend_op<cpu_im2col, op::im2col>();
         apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
+        apply_map["logsoftmax"] = extend_op<cpu_softmax<op::logsoftmax>, op::logsoftmax>();
         apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
         apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
-        apply_map["softmax"]    = extend_op<cpu_softmax, op::softmax>();
+        apply_map["softmax"]    = extend_op<cpu_softmax<op::softmax>, op::softmax>();
     }
 
     void apply()
...
@@ -24,26 +24,6 @@ namespace gpu {
 
 struct context;
 
-struct miopen_softmax
-{
-    op::softmax op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "miopen::softmax"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-
 struct hip_softmax
 {
     op::softmax op;
...
@@ -6,31 +6,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
-shape miopen_softmax::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(2).standard();
-    return op.compute_shape({inputs.at(0)});
-}
-
-argument miopen_softmax::compute(context& ctx,
-                                 const shape& output_shape,
-                                 const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenSoftmaxForward(ctx.get_stream().get_miopen(),
-                         &alpha,
-                         x_desc.get(),
-                         args[0].implicit(),
-                         &beta,
-                         y_desc.get(),
-                         args[1].implicit());
-
-    return args[1];
-}
-
 shape hip_softmax::compute_shape(const std::vector<shape>& inputs) const
 {
     check_shapes{inputs, *this}.has(2).standard();
...
@@ -1329,6 +1329,33 @@ TEST_CASE(quant_dot_3args_general)
         EXPECT(migraphx::verify_range(m, gold));
     }
 
+    {
+        migraphx::program p;
+        migraphx::shape m1_shape{migraphx::shape::int8_type, {3, 4}};
+        migraphx::shape m2_shape{migraphx::shape::int8_type, {4, 5}};
+        migraphx::shape m3_shape{migraphx::shape::int32_type, {3, 5}};
+        std::vector<int8_t> data1(3 * 4);
+        std::vector<int8_t> data2(4 * 5);
+        std::vector<int> data3(3 * 5);
+        std::iota(data1.begin(), data1.end(), 0);
+        std::iota(data2.begin(), data2.end(), 0);
+        std::iota(data3.begin(), data3.end(), 0);
+
+        auto l1 = p.add_literal(migraphx::literal{m1_shape, data1});
+        auto l2 = p.add_literal(migraphx::literal{m2_shape, data2});
+        auto l3 = p.add_literal(migraphx::literal{m3_shape, data3});
+        p.add_instruction(migraphx::op::quant_dot{1, 0}, l1, l2, l3);
+        std::vector<int> gold = {
+            70, 76, 82, 88, 94, 190, 212, 234, 256, 278, 310, 348, 386, 424, 462};
+        p.compile(migraphx::cpu::target{});
+        auto result = p.eval({});
+        std::vector<float> m;
+        result.visit([&](auto output) { m.assign(output.begin(), output.end()); });
+        EXPECT(migraphx::verify_range(m, gold));
+    }
+
     {
         migraphx::program p;
         migraphx::shape m1_shape{migraphx::shape::int8_type, {8, 2}};
...