Commit 4ffaf3c7 authored by Shucai Xiao
Browse files

merge changes to solve the slowdown of gpu::gemm

parents 770c7d27 afbfbdc0
...@@ -127,26 +127,48 @@ argument miopen_gemm::compute(context& ctx, ...@@ -127,26 +127,48 @@ argument miopen_gemm::compute(context& ctx,
auto alpha_r = to_rocblas_type(as(op.alpha)); auto alpha_r = to_rocblas_type(as(op.alpha));
auto beta_r = to_rocblas_type(as(op.beta)); auto beta_r = to_rocblas_type(as(op.beta));
auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); }; auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
generic_rocblas_batched_gemm(as, // call the strided implementation only if there are multiple matrices
ctx.get_stream().get_rocblas(), if(batch_num > 1)
transb ? rocblas_operation_transpose : rocblas_operation_none, {
transa ? rocblas_operation_transpose : rocblas_operation_none, generic_rocblas_batched_gemm(
n, as,
m, ctx.get_stream().get_rocblas(),
k, transb ? rocblas_operation_transpose : rocblas_operation_none,
&alpha_r, transa ? rocblas_operation_transpose : rocblas_operation_none,
to_pointer(args[1]), n,
ldb, m,
k * n, k,
to_pointer(args[0]), &alpha_r,
lda, to_pointer(args[1]),
m * k, ldb,
&beta_r, k * n,
is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]), to_pointer(args[0]),
ldc, lda,
m * n, m * k,
batch_num); &beta_r,
to_pointer(args[2]),
ldc,
m * n,
batch_num);
}
else
{
generic_rocblas_gemm(as,
ctx.get_stream().get_rocblas(),
transb ? rocblas_operation_transpose : rocblas_operation_none,
transa ? rocblas_operation_transpose : rocblas_operation_none,
n,
m,
k,
&alpha_r,
to_pointer(args[1]),
ldb,
to_pointer(args[0]),
lda,
&beta_r,
to_pointer(args[2]),
ldc);
}
}); });
return (is_3inputs ? args[3] : args[2]); return (is_3inputs ? args[3] : args[2]);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment