gaoqiong / MIGraphX

Commit e3950e2c, authored May 06, 2019 by Shucai Xiao

    check in the initial GPU implementation of the int8 gemm.

Parent: ae21ecbf

Showing 6 changed files with 158 additions and 31 deletions:
    src/targets/gpu/CMakeLists.txt                           +1   -0
    src/targets/gpu/device/pack.cpp                          +63  -0
    src/targets/gpu/include/migraphx/gpu/device/pack.hpp     +23  -0
    src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp      +10  -1
    src/targets/gpu/lowering.cpp                             +30  -1
    src/targets/gpu/quant_gemm.cpp                           +31  -29
src/targets/gpu/CMakeLists.txt

@@ -32,6 +32,7 @@ add_library(migraphx_device
     device/pad.cpp
     device/gather.cpp
     device/sub.cpp
+    device/pack.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
src/targets/gpu/device/pack.cpp (new file, mode 100644)

#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/pack.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape = result.get_shape();
    auto dim_0        = output_shape.lens().size() - 2;
    std::size_t ldb   = output_shape.strides()[dim_0];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(output_shape.lens().size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(output_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                // pack groups of nb = 4 consecutive int8 values along k so
                // they sit contiguously; this routine is applied to operand B
                // in quant_gemm.cpp, hence the ldb naming
                const size_t nb = 4;
                auto idx        = desc.multi(ii);
                std::size_t i_m = idx[0];
                std::size_t i_k = idx[1];
                out_ptr[i_k % nb + (i_m + (i_k / nb) * ldb) * nb] = in_ptr[i_m + i_k * ldb];
            });
        });
    });
}

void pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape = result.get_shape();
    auto dim_1        = output_shape.lens().size() - 1;
    std::size_t lda   = output_shape.strides()[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(output_shape.lens().size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(output_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                // same packing transform, applied to operand A in quant_gemm.cpp
                const size_t nb = 4;
                auto idx        = desc.multi(ii);
                std::size_t i_n = idx[0];
                std::size_t i_k = idx[1];
                out_ptr[i_k % nb + (i_n + (i_k / nb) * lda) * nb] = in_ptr[i_n + i_k * lda];
            });
        });
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
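Both kernels apply the same index transform in parallel over the output elements. A minimal host-side reference of that layout may make it easier to follow; this is a hypothetical standalone helper, not part of the commit, with `ld` standing in for the lda/ldb strides above:

// Host-side reference for the nb = 4 int8 packing used by the kernels above.
// Assumes ld >= rows and k % 4 == 0, matching the asserts in quant_gemm.cpp.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int8_t>
pack_k4(const std::vector<int8_t>& in, std::size_t rows, std::size_t k, std::size_t ld)
{
    const std::size_t nb = 4;
    std::vector<int8_t> out(in.size());
    for(std::size_t i = 0; i < rows; ++i)     // row index (i_m or i_n in the kernels)
    {
        for(std::size_t ik = 0; ik < k; ++ik) // k index
        {
            // four consecutive k values of the same row end up adjacent in memory
            out[ik % nb + (i + (ik / nb) * ld) * nb] = in[i + ik * ld];
        }
    }
    return out;
}

For example, with ld = 2 the value at in[1 + 6 * 2] (row 1, k = 6) lands at out[6 % 4 + (1 + 1 * 2) * 4] = out[14]: row 1's k-values 4..7 occupy the contiguous group out[12..15].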
src/targets/gpu/include/migraphx/gpu/device/pack.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void pack_a(hipStream_t stream, const argument& result, const argument& arg);

void pack_b(hipStream_t stream, const argument& result, const argument& arg);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp

@@ -17,7 +17,16 @@ struct miopen_quant_gemm
     shape compute_shape(const std::vector<shape>& inputs) const;
     argument
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };
+
+struct hip_pack
+{
+    std::string name() const { return "gpu::gemm_pack"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+};
 } // namespace gpu
src/targets/gpu/lowering.cpp

@@ -97,7 +97,6 @@ struct miopen_apply
     add_generic_op<hip_min>("min");
     add_extend_op<miopen_gemm, op::dot>("dot");
-    add_extend_op<miopen_quant_gemm, op::quant_dot>("quant_dot");
     add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
     add_extend_op<hip_concat, op::concat>("concat");
     add_extend_op<miopen_softmax, op::softmax>("softmax");
@@ -110,6 +109,7 @@ struct miopen_apply
     add_quant_convolution_op();
     add_pooling_op();
     add_batch_norm_inference_op();
+    add_quant_gemm_op();
 }

 void apply()
@@ -263,6 +263,35 @@ struct miopen_apply
                 output);
         });
     }
+
+    void add_quant_gemm_op()
+    {
+        apply_map.emplace("quant_gemm", [=](instruction_ref ins) {
+            auto&& op   = any_cast<op::quant_dot>(ins->get_operator());
+            auto output = insert_allocation(ins, ins->get_shape());
+            std::vector<instruction_ref> refs = ins->inputs();
+            refs.push_back(output);
+
+            // Need up to two extra buffers to hold the packed data
+            auto shape_a = refs.at(0)->get_shape();
+            if(shape_a.transposed())
+            {
+                auto pack_a = insert_allocation(ins, shape_a);
+                refs.push_back(pack_a);
+                std::swap(refs.back(), refs.at(0));
+            }
+
+            auto shape_b = refs.at(1)->get_shape();
+            if(!shape_b.transposed())
+            {
+                auto pack_b = insert_allocation(ins, shape_b);
+                refs.push_back(pack_b);
+                std::swap(refs.back(), refs.at(1));
+            }
+
+            return prog->replace_instruction(ins, miopen_quant_gemm{op}, refs);
+        });
+    }
 };

 void lowering::apply(program& p) const { miopen_apply{&p, ctx}.apply(); }
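Each std::swap above leaves the packed-buffer allocation in the operand slot the gemm reads, while the original operand drifts to the tail of refs; compute() in quant_gemm.cpp relies on that tail ordering. A standalone trace, with hypothetical strings in place of instruction_refs and assuming A transposed and B not, sketches the resulting layout:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    // refs starts as the instruction's inputs plus the output allocation
    std::vector<std::string> refs = {"A", "B", "output"};

    // A is transposed: allocate pack_a and swap it into slot 0
    refs.push_back("pack_a_alloc");
    std::swap(refs.back(), refs.at(0)); // {pack_a_alloc, B, output, A}

    // B is not transposed: allocate pack_b and swap it into slot 1
    refs.push_back("pack_b_alloc");
    std::swap(refs.back(), refs.at(1)); // {pack_a_alloc, pack_b_alloc, output, A, B}

    for(const auto& r : refs)
        std::cout << r << ' '; // prints: pack_a_alloc pack_b_alloc output A B
    std::cout << '\n';
}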
src/targets/gpu/quant_gemm.cpp

 #include <migraphx/gpu/quant_gemm.hpp>
+#include <migraphx/gpu/device/pack.hpp>
 #include <migraphx/gpu/context.hpp>

 namespace migraphx {
...
@@ -61,7 +62,36 @@ argument miopen_quant_gemm::compute(context& ctx,
                                     const shape& output_shape,
                                     const std::vector<argument>& args) const
 {
-    bool is_3inputs = (args.size() == 4);
+    // handling the packing of B MUST happen before handling that of A,
+    // since it determines where the original A argument sits in args
+    bool transa = args[0].get_shape().transposed();
+    bool transb = args[1].get_shape().transposed();
+    auto n_dim  = output_shape.lens().size();
+    auto dim_1  = n_dim - 1;
+    auto dim_0  = n_dim - 2;
+    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
+    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
+    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
+    size_t addi_ref_num = 0;
+
+    if(!transb)
+    {
+        ++addi_ref_num;
+        // the original argument for B is the last one in the input argument vector
+        const argument& arg_b = args[args.size() - 1];
+        // use the pack_a algorithm to pack B
+        device::pack_a(ctx.get_stream().get(), args[1], arg_b);
+    }
+
+    // need to pack A in this scenario; use the pack_b algorithm described in
+    // the comment of the API
+    if(transa)
+    {
+        // read the original A before bumping the extra-reference count
+        const argument& arg_a = args[args.size() - 1 - addi_ref_num];
+        ++addi_ref_num;
+        device::pack_b(ctx.get_stream().get(), args[0], arg_a);
+    }
+
+    bool is_3inputs = (args.size() - addi_ref_num == 4);
     int8_t beta     = 0;
     if(is_3inputs)
     {
...
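With both operands needing packing, the lowering above hands compute() args = {pack_a_alloc, pack_b_alloc, output, A, B}: args[size - 1] is the original B, and once addi_ref_num counts B's appended reference, args[size - 1 - addi_ref_num] is the original A. A minimal standalone check of that bookkeeping, using hypothetical strings for arguments:

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

int main()
{
    // layout produced by add_quant_gemm_op when A is transposed and B is not
    std::vector<std::string> args = {"pack_a_alloc", "pack_b_alloc", "output", "A", "B"};

    std::size_t addi_ref_num = 0;
    bool transa = true;
    bool transb = false;

    if(!transb)
    {
        ++addi_ref_num;
        assert(args[args.size() - 1] == "B"); // original B rides at the very end
    }
    if(transa)
    {
        // original A sits just in front of any appended B reference
        assert(args[args.size() - 1 - addi_ref_num] == "A");
        ++addi_ref_num;
    }
    // two packing buffers were appended, so the "real" argument count is size - 2
    assert(args.size() - addi_ref_num == 3); // dot without bias: A, B, output
}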
@@ -71,42 +101,14 @@ argument miopen_quant_gemm::compute(context& ctx,
     auto a_lens = args[0].get_shape().lens();
     auto b_lens = args[1].get_shape().lens();
     output_shape.visit_type([&](auto as) {
-        auto n_dim   = output_shape.lens().size();
-        auto dim_1   = n_dim - 1;
-        auto dim_0   = n_dim - 2;
         auto alpha_r = to_rocblas_type(as(op.alpha));
         auto beta_r  = to_rocblas_type(as(beta));
-        bool transa     = args[0].get_shape().transposed();
-        bool transb     = args[1].get_shape().transposed();
-        rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
-        rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
-        rocblas_int ldc = args[2].get_shape().strides()[dim_0];
         auto out_lens   = output_shape.lens();
         rocblas_int m   = out_lens[dim_0];
         rocblas_int n   = out_lens[dim_1];
         rocblas_int k   = args[0].get_shape().lens()[dim_1];
         auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
+        assert(k % 4 == 0);
+        assert(!transa or (lda % 4 == 0));
+        assert(transb or (ldb % 4 == 0));
-        // need to pack B in this scenario
-        if(!transb)
-        {
-            int nb = 4;
-            for(int i_m = 0; i_m < m; i_m++)
-            {
-                for(int i_k = 0; i_k < k; i_k++)
-                {
-                    A_packed[i_k % nb + (i_m + (i_k / nb) * lda) * nb] = A[i_m + i_k * lda];
-                }
-            }
-        }
-        // need to pack A in this scenario
-        if(transa)
-        {
-        }
         auto num_matrices = std::accumulate(
             out_lens.rbegin() + 2,
             out_lens.rend(),
             std::size_t{1},
             std::multiplies<std::size_t>());
...
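The accumulate folds every output dimension except the trailing two (the matrix dimensions) into the number of gemms launched for a batched call. A quick standalone check with a hypothetical 4-d shape:

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // out_lens = {2, 3, 64, 64}: batch dims {2, 3} give 6 matrices of 64x64
    std::vector<std::size_t> out_lens = {2, 3, 64, 64};
    auto num_matrices = std::accumulate(
        out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
    std::cout << num_matrices << '\n'; // prints 6
}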