Commit e3950e2c authored by Shucai Xiao

Check in the initial GPU implementation of the int8 gemm.

parent ae21ecbf
CMakeLists.txt:
@@ -32,6 +32,7 @@ add_library(migraphx_device
     device/pad.cpp
     device/gather.cpp
     device/sub.cpp
+    device/pack.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
...
device/pack.cpp (new file):

#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/pack.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

// Repack a matrix into the int8x4 layout rocBLAS int8 gemm consumes:
// the four consecutive k-values used by one dot-product step are stored
// in adjacent bytes. pack_a is applied to the operand rocBLAS sees as A.
void pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape = result.get_shape();
    auto dim_0        = output_shape.lens().size() - 2;
    std::size_t ldb   = output_shape.strides()[dim_0];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(output_shape.lens().size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(output_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                const std::size_t nb = 4; // int8x4: pack 4 k-values per group
                auto idx        = desc.multi(ii);
                std::size_t i_m = idx[0];
                std::size_t i_k = idx[1];
                // gather 4 consecutive k-values of element i_m into adjacent bytes
                out_ptr[i_k % nb + (i_m + (i_k / nb) * ldb) * nb] = in_ptr[i_m + i_k * ldb];
            });
        });
    });
}

// Same repacking for the operand rocBLAS sees as B; only the
// leading-dimension stride is taken from the other axis.
void pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape = result.get_shape();
    auto dim_1        = output_shape.lens().size() - 1;
    std::size_t lda   = output_shape.strides()[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(output_shape.lens().size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(output_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                const std::size_t nb = 4; // int8x4: pack 4 k-values per group
                auto idx        = desc.multi(ii);
                std::size_t i_n = idx[0];
                std::size_t i_k = idx[1];
                out_ptr[i_k % nb + (i_n + (i_k / nb) * lda) * nb] = in_ptr[i_n + i_k * lda];
            });
        });
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
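For reference, the index arithmetic above interleaves the k dimension in groups of nb = 4 so the four int8 values consumed together sit in adjacent bytes. A minimal host-side sketch of the same mapping (the 2x8 sizes, ld = m, and the identity fill are made up for illustration, not taken from the commit):

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    const std::size_t m = 2, k = 8, ld = m, nb = 4; // nb = int8x4 group size
    std::vector<int> in(m * k), out(m * k, -1);
    for(std::size_t i = 0; i < in.size(); ++i)
        in[i] = static_cast<int>(i); // column-major: in[i_m + i_k * ld]

    // identical index arithmetic to the gs_launch body above
    for(std::size_t i_m = 0; i_m < m; ++i_m)
        for(std::size_t i_k = 0; i_k < k; ++i_k)
            out[i_k % nb + (i_m + (i_k / nb) * ld) * nb] = in[i_m + i_k * ld];

    // prints: 0 2 4 6 / 1 3 5 7 / 8 10 12 14 / 9 11 13 15
    // each run of 4 entries holds 4 consecutive k-values of one row
    for(std::size_t i = 0; i < out.size(); ++i)
        std::printf("%d%c", out[i], (i + 1) % nb == 0 ? '\n' : ' ');
    return 0;
}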
migraphx/gpu/device/pack.hpp (new file):

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void pack_a(hipStream_t stream, const argument& result, const argument& arg);
void pack_b(hipStream_t stream, const argument& result, const argument& arg);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
migraphx/gpu/quant_gemm.hpp:
@@ -17,7 +17,16 @@ struct miopen_quant_gemm
     shape compute_shape(const std::vector<shape>& inputs) const;
     argument
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+};
+
+struct hip_pack
+{
+    std::string name() const { return "gpu::gemm_pack"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };

 } // namespace gpu
...
lowering.cpp:
@@ -97,7 +97,6 @@ struct miopen_apply
     add_generic_op<hip_min>("min");
     add_extend_op<miopen_gemm, op::dot>("dot");
-    add_extend_op<miopen_quant_gemm, op::quant_dot>("quant_dot");
     add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
     add_extend_op<hip_concat, op::concat>("concat");
     add_extend_op<miopen_softmax, op::softmax>("softmax");
@@ -110,6 +109,7 @@ struct miopen_apply
     add_quant_convolution_op();
     add_pooling_op();
     add_batch_norm_inference_op();
+    add_quant_gemm_op();
 }

 void apply()
@@ -263,6 +263,35 @@ struct miopen_apply
             output);
         });
     }
+    void add_quant_gemm_op()
+    {
+        apply_map.emplace("quant_dot", [=](instruction_ref ins) {
+            auto&& op   = any_cast<op::quant_dot>(ins->get_operator());
+            auto output = insert_allocation(ins, ins->get_shape());
+
+            std::vector<instruction_ref> refs = ins->inputs();
+            refs.push_back(output);
+
+            // rocBLAS int8 gemm needs packed operands, so up to two extra
+            // buffers are allocated. The swap puts the packed buffer in the
+            // operand slot and moves the original tensor to the back, where
+            // compute() reads it as the packing source.
+            auto shape_a = refs.at(0)->get_shape();
+            if(shape_a.transposed())
+            {
+                auto pack_a = insert_allocation(ins, shape_a);
+                refs.push_back(pack_a);
+                std::swap(refs.back(), refs.at(0));
+            }
+
+            auto shape_b = refs.at(1)->get_shape();
+            if(!shape_b.transposed())
+            {
+                auto pack_b = insert_allocation(ins, shape_b);
+                refs.push_back(pack_b);
+                std::swap(refs.back(), refs.at(1));
+            }
+
+            return prog->replace_instruction(ins, miopen_quant_gemm{op}, refs);
+        });
+    }
 };

 void lowering::apply(program& p) const { miopen_apply{&p, ctx}.apply(); }
...
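The swap trick above is easy to misread, so here is a hypothetical standalone trace of it, with strings standing in for instruction_refs (all names invented for illustration). It shows the argument order miopen_quant_gemm::compute receives when both operands need packing:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    // stand-ins for the refs vector built in add_quant_gemm_op
    std::vector<std::string> refs = {"A", "B", "output"};

    refs.push_back("pack_a_buffer");    // A is transposed: needs packing
    std::swap(refs.back(), refs.at(0)); // buffer takes A's slot, original A goes last

    refs.push_back("pack_b_buffer");    // B is not transposed: needs packing
    std::swap(refs.back(), refs.at(1)); // buffer takes B's slot, original B goes last

    for(const auto& r : refs)
        std::cout << r << ' '; // pack_a_buffer pack_b_buffer output A B
    std::cout << '\n';
    return 0;
}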
quant_gemm.cpp:
 #include <migraphx/gpu/quant_gemm.hpp>
+#include <migraphx/gpu/device/pack.hpp>
 #include <migraphx/gpu/context.hpp>

 namespace migraphx {
@@ -61,7 +62,36 @@ argument miopen_quant_gemm::compute(context& ctx,
                                     const shape& output_shape,
                                     const std::vector<argument>& args) const
 {
-    bool is_3inputs = (args.size() == 4);
+    // The packing of B MUST be handled before that of A: the source tensor
+    // for B is always the last argument, while the position of A's source
+    // depends on whether B also needed packing.
+    bool transa = args[0].get_shape().transposed();
+    bool transb = args[1].get_shape().transposed();
+    auto n_dim  = output_shape.lens().size();
+    auto dim_1  = n_dim - 1;
+    auto dim_0  = n_dim - 2;
+    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
+    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
+    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
+
+    std::size_t addi_ref_num = 0;
+    if(!transb)
+    {
+        // the original (unpacked) B is the last argument; rocBLAS consumes
+        // MIGraphX's B as its A operand, so pack it with pack_a
+        const argument& arg_b = args[args.size() - 1];
+        device::pack_a(ctx.get_stream().get(), args[1], arg_b);
+        ++addi_ref_num;
+    }
+
+    if(transa)
+    {
+        // A's source sits just before B's source (when B was packed too),
+        // so read it before counting this packed reference
+        const argument& arg_a = args[args.size() - 1 - addi_ref_num];
+        device::pack_b(ctx.get_stream().get(), args[0], arg_a);
+        ++addi_ref_num;
+    }
+
+    // the extra packing buffers do not count as gemm inputs
+    bool is_3inputs = (args.size() - addi_ref_num == 4);
     int8_t beta = 0;
     if(is_3inputs)
     {
@@ -71,42 +101,14 @@ argument miopen_quant_gemm::compute(context& ctx,
     auto a_lens = args[0].get_shape().lens();
     auto b_lens = args[1].get_shape().lens();
     output_shape.visit_type([&](auto as) {
-        auto n_dim = output_shape.lens().size();
-        auto dim_1 = n_dim - 1;
-        auto dim_0 = n_dim - 2;
         auto alpha_r = to_rocblas_type(as(op.alpha));
         auto beta_r  = to_rocblas_type(as(beta));
-        bool transa = args[0].get_shape().transposed();
-        bool transb = args[1].get_shape().transposed();
-        rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
-        rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
-        rocblas_int ldc = args[2].get_shape().strides()[dim_0];
         auto out_lens = output_shape.lens();
         rocblas_int m = out_lens[dim_0];
         rocblas_int n = out_lens[dim_1];
         rocblas_int k = args[0].get_shape().lens()[dim_1];
         auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
         assert(k % 4 == 0);
-        assert(!transa or (lda % 4 == 0));
-        assert(transb or (ldb % 4 == 0));
-
-        // need to pack B in this scenario
-        if(!transb)
-        {
-            int nb = 4;
-            for(int i_m = 0; i_m < m; i_m++)
-            {
-                for(int i_k = 0; i_k < k; i_k++)
-                {
-                    A_packed[i_k % nb + (i_m + (i_k / nb) * lda) * nb] = A[i_m + i_k * lda];
-                }
-            }
-        }
-
-        // need to pack A in this scenario
-        if(transa)
-        {
-        }
-
         auto num_matrices = std::accumulate(
             out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
...
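As a sanity reference for what this operator computes, below is a tiny self-contained host version of the quant_dot semantics: int8 inputs accumulated in int32, with alpha = 1 and beta = 0. The sizes and fill values are made up for illustration; note k is a multiple of 4, matching the assert above:

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const int m = 2, n = 2, k = 4; // k % 4 == 0, as rocBLAS int8 gemm requires
    std::vector<int8_t> a(m * k, 2), b(k * n, 3);
    std::vector<int32_t> c(m * n, 0);

    for(int i = 0; i < m; ++i)
        for(int j = 0; j < n; ++j)
        {
            int32_t acc = 0; // accumulate in int32 to avoid int8 overflow
            for(int kk = 0; kk < k; ++kk)
                acc += static_cast<int32_t>(a[i * k + kk]) * b[kk * n + j];
            c[i * n + j] = acc; // alpha = 1, beta = 0
        }

    std::printf("c[0] = %d\n", c[0]); // 2 * 3 * 4 = 24
    return 0;
}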