Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
e3950e2c
"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "3a25eb20f192accb1cacc3eba5b44581b3c884ad"
Commit
e3950e2c
authored
May 06, 2019
by
Shucai Xiao
Browse files
check in the initial GPU implementation of the int8 gemm.
parent
ae21ecbf
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
158 additions
and
31 deletions
+158
-31
src/targets/gpu/CMakeLists.txt
src/targets/gpu/CMakeLists.txt
+1
-0
src/targets/gpu/device/pack.cpp
src/targets/gpu/device/pack.cpp
+63
-0
src/targets/gpu/include/migraphx/gpu/device/pack.hpp
src/targets/gpu/include/migraphx/gpu/device/pack.hpp
+23
-0
src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp
src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp
+10
-1
src/targets/gpu/lowering.cpp
src/targets/gpu/lowering.cpp
+30
-1
src/targets/gpu/quant_gemm.cpp
src/targets/gpu/quant_gemm.cpp
+31
-29
No files found.
src/targets/gpu/CMakeLists.txt
View file @
e3950e2c
...
...
@@ -32,6 +32,7 @@ add_library(migraphx_device
device/pad.cpp
device/gather.cpp
device/sub.cpp
device/pack.cpp
)
set_target_properties
(
migraphx_device PROPERTIES EXPORT_NAME device
)
rocm_clang_tidy_check
(
migraphx_device
)
...
...
src/targets/gpu/device/pack.cpp
0 → 100644
View file @
e3950e2c
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/pack.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
/// Launches a kernel on `stream` that writes a re-ordered ("packed") copy of
/// `arg` into `result`, interleaving groups of nb = 4 consecutive k-rows as
/// described by the int8 GEMM packing formula
///   packed[i_k % nb + (i_m + (i_k / nb) * ld) * nb] = src[i_m + i_k * ld].
///
/// @param stream  HIP stream the kernel is enqueued on.
/// @param result  Destination buffer; its shape drives the indexing.
/// @param arg     Source buffer, visited together with `result`.
///
/// The matrix is taken to be the last two dimensions of the shape, with `ld`
/// the stride of the second-to-last dimension. In `src[i_m + i_k * ld]` the
/// index multiplied by `ld` must be the row index (second-to-last dim) and the
/// unit-stride index must be the column index (last dim). The previous code
/// used idx[0]/idx[1], which swapped the two for rank-2 tensors (reading out
/// of bounds for non-square matrices) and picked batch indices for rank > 2.
///
/// NOTE(review): the per-matrix offset assumes matrices are stored back to
/// back with lens[dim_0] * lens[dim_1] elements each (standard layout) —
/// confirm for any padded/strided inputs.
void pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape           = result.get_shape();
    const auto& lens            = output_shape.lens();
    const std::size_t dim_0     = lens.size() - 2;
    const std::size_t dim_1     = lens.size() - 1;
    const std::size_t ld        = output_shape.strides()[dim_0];
    const std::size_t n_cols    = lens[dim_1];
    const std::size_t m_size    = lens[dim_0] * n_cols;
    const std::size_t nelements = output_shape.elements();

    visit_all(result, arg)([&](auto output, auto input) {
        auto* out_ptr = device_cast(output.data());
        auto* in_ptr  = device_cast(input.data());

        gs_launch(stream, nelements)([=](auto ii) {
            const std::size_t nb = 4;
            // Split the linear index into (matrix start, position inside matrix).
            const std::size_t in_mat = ii % m_size;
            const std::size_t offset = ii - in_mat;
            // Column index is the unit-stride one; row index is scaled by ld.
            const std::size_t i_m = in_mat % n_cols;
            const std::size_t i_k = in_mat / n_cols;
            out_ptr[i_k % nb + (i_m + (i_k / nb) * ld) * nb + offset] =
                in_ptr[i_m + i_k * ld + offset];
        });
    });
}
/// Launches a kernel on `stream` that writes a re-ordered ("packed") copy of
/// `arg` into `result`, interleaving groups of nb = 4 consecutive k-columns as
/// described by the int8 GEMM packing formula
///   packed[i_k % nb + (i_n + (i_k / nb) * ld) * nb] = src[i_n + i_k * ld].
///
/// @param stream  HIP stream the kernel is enqueued on.
/// @param result  Destination buffer; its shape drives the indexing.
/// @param arg     Source buffer, visited together with `result`.
///
/// Here `ld` is the stride of the LAST dimension, so the unit-stride index
/// i_n runs along the second-to-last dimension and i_k along the last one.
/// The previous code took idx[0]/idx[1] from a descriptor built over the full
/// shape, which selects batch indices for rank > 2 and never added a
/// per-matrix offset. Indices are now derived arithmetically from the linear
/// element index, so the result does not depend on how the tensor descriptor
/// decodes shapes with non-descending strides.
///
/// NOTE(review): the per-matrix offset assumes matrices are stored back to
/// back with lens[dim_0] * lens[dim_1] elements each and that
/// ld == lens[dim_0] — confirm for padded/strided inputs.
void pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape           = result.get_shape();
    const auto& lens            = output_shape.lens();
    const std::size_t dim_0     = lens.size() - 2;
    const std::size_t dim_1     = lens.size() - 1;
    const std::size_t ld        = output_shape.strides()[dim_1];
    const std::size_t n_fast    = lens[dim_0];
    const std::size_t m_size    = n_fast * lens[dim_1];
    const std::size_t nelements = output_shape.elements();

    visit_all(result, arg)([&](auto output, auto input) {
        auto* out_ptr = device_cast(output.data());
        auto* in_ptr  = device_cast(input.data());

        gs_launch(stream, nelements)([=](auto ii) {
            const std::size_t nb = 4;
            // Split the linear index into (matrix start, position inside matrix).
            const std::size_t in_mat = ii % m_size;
            const std::size_t offset = ii - in_mat;
            // i_n walks the unit-stride axis; i_k is the axis scaled by ld.
            const std::size_t i_n = in_mat % n_fast;
            const std::size_t i_k = in_mat / n_fast;
            out_ptr[i_k % nb + (i_n + (i_k / nb) * ld) * nb + offset] =
                in_ptr[i_n + i_k * ld + offset];
        });
    });
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/include/migraphx/gpu/device/pack.hpp
0 → 100644
View file @
e3950e2c
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
/// Enqueues on `stream` a device kernel that copies the elements of `arg`
/// into `result` in a re-ordered layout that groups 4 consecutive k-indices
/// together (see the definition in device/pack.cpp for the exact index
/// formula).
/// @param stream  HIP stream to launch on.
/// @param result  Destination argument; its shape is used for indexing.
/// @param arg     Source argument.
void
pack_a
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
);
/// Enqueues on `stream` a device kernel that copies the elements of `arg`
/// into `result` in a re-ordered layout that groups 4 consecutive k-indices
/// together. Unlike pack_a, the leading dimension is taken from the stride of
/// the last dimension of `result`'s shape (see device/pack.cpp).
/// @param stream  HIP stream to launch on.
/// @param result  Destination argument; its shape is used for indexing.
/// @param arg     Source argument.
void
pack_b
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
);
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp
View file @
e3950e2c
...
...
@@ -17,7 +17,16 @@ struct miopen_quant_gemm
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
;
argument
compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
)
const
;
// Returns shapes.size() - 1, i.e. the index of the last entry of `shapes`.
// The unsigned result of size() - 1 is implicitly narrowed to int here.
// NOTE(review): this `int` version appears to be the pre-change side of the
// diff hunk; a std::ptrdiff_t version with the same body follows it — confirm
// only one of them is meant to remain.
int
output_alias
(
const
std
::
vector
<
shape
>&
shapes
)
const
{
return
shapes
.
size
()
-
1
;
}
// Reports the index of the last entry of `shapes` (shapes.size() - 1),
// converted to std::ptrdiff_t on return.
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
    const auto last_index = shapes.size() - 1;
    return last_index;
}
};
// GPU operator descriptor for the GEMM packing step.
// Only name() and output_alias() are defined inline; compute_shape() and
// compute() are declared here and defined elsewhere.
struct hip_pack
{
    // Operator name under which this op is identified.
    std::string name() const
    {
        return std::string{"gpu::gemm_pack"};
    }

    // Derives the result shape from the input shapes (defined out of line).
    shape compute_shape(const std::vector<shape>& inputs) const;

    // Executes the operator with the given context and arguments
    // (defined out of line).
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;

    // Reports the index of the last entry of `shapes` (shapes.size() - 1).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        const auto last_index = shapes.size() - 1;
        return last_index;
    }
};
}
// namespace gpu
...
...
src/targets/gpu/lowering.cpp
View file @
e3950e2c
...
...
@@ -97,7 +97,6 @@ struct miopen_apply
add_generic_op
<
hip_min
>
(
"min"
);
add_extend_op
<
miopen_gemm
,
op
::
dot
>
(
"dot"
);
add_extend_op
<
miopen_quant_gemm
,
op
::
quant_dot
>
(
"quant_dot"
);
add_extend_op
<
miopen_contiguous
,
op
::
contiguous
>
(
"contiguous"
);
add_extend_op
<
hip_concat
,
op
::
concat
>
(
"concat"
);
add_extend_op
<
miopen_softmax
,
op
::
softmax
>
(
"softmax"
);
...
...
@@ -110,6 +109,7 @@ struct miopen_apply
add_quant_convolution_op
();
add_pooling_op
();
add_batch_norm_inference_op
();
add_quant_gemm_op
();
}
void
apply
()
...
...
@@ -263,6 +263,35 @@ struct miopen_apply
output
);
});
}
void
add_quant_gemm_op
()
{
apply_map
.
emplace
(
"quant_gemm"
,
[
=
](
instruction_ref
ins
)
{
auto
&&
op
=
any_cast
<
op
::
quant_dot
>
(
ins
->
get_operator
());
auto
output
=
insert_allocation
(
ins
,
ins
->
get_shape
());
std
::
vector
<
instruction_ref
>
refs
=
ins
->
inputs
();
refs
.
push_back
(
output
);
// Need another two buffers for packed data buffer
auto
shape_a
=
refs
.
at
(
0
)
->
get_shape
();
if
(
shape_a
.
transposed
())
{
auto
pack_a
=
insert_allocation
(
ins
,
shape_a
);
refs
.
push_back
(
pack_a
);
std
::
swap
(
refs
.
back
(),
refs
.
at
(
0
));
}
auto
shape_b
=
refs
.
at
(
1
)
->
get_shape
();
if
(
!
shape_b
.
transposed
())
{
auto
pack_b
=
insert_allocation
(
ins
,
shape_b
);
refs
.
push_back
(
pack_b
);
std
::
swap
(
refs
.
back
(),
refs
.
at
(
1
));
}
return
prog
->
replace_instruction
(
ins
,
miopen_quant_gemm
{
op
},
refs
);
});
}
};
// Runs the GPU lowering over program `p`: builds a miopen_apply helper from
// the program pointer and this pass's context, then invokes its apply().
void lowering::apply(program& p) const
{
    miopen_apply applier{&p, ctx};
    applier.apply();
}
...
...
src/targets/gpu/quant_gemm.cpp
View file @
e3950e2c
#include <migraphx/gpu/quant_gemm.hpp>
#include <migraphx/gpu/device/pack.hpp>
#include <migraphx/gpu/context.hpp>
namespace
migraphx
{
...
...
@@ -61,7 +62,36 @@ argument miopen_quant_gemm::compute(context& ctx,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
)
const
{
bool
is_3inputs
=
(
args
.
size
()
==
4
);
// handling the packing of B MUST be before handling that for A
bool
transa
=
args
[
0
].
get_shape
().
transposed
();
bool
transb
=
args
[
1
].
get_shape
().
transposed
();
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
dim_1
=
n_dim
-
1
;
auto
dim_0
=
n_dim
-
2
;
rocblas_int
lda
=
args
[
0
].
get_shape
().
strides
()[
transa
?
dim_1
:
dim_0
];
rocblas_int
ldb
=
args
[
1
].
get_shape
().
strides
()[
transb
?
dim_1
:
dim_0
];
rocblas_int
ldc
=
args
[
2
].
get_shape
().
strides
()[
dim_0
];
size_t
addi_ref_num
=
0
;
if
(
!
transb
)
{
++
addi_ref_num
;
const
argument
&
arg_b
=
args
[
args
.
size
()
-
1
];
// argument for B is the last one in the input argument vector
// use the algorithm to pack A
device
::
pack_a
(
ctx
.
get_stream
().
get
(),
args
[
1
],
arg_b
);
}
// need to pack A in this scenario, use the algorithm to pack B in the
// comment of the API
if
(
transa
)
{
++
addi_ref_num
;
const
argument
&
arg_a
=
args
[
args
.
size
()
-
1
-
addi_ref_num
];
device
::
pack_b
(
ctx
.
get_stream
().
get
(),
args
[
0
],
arg_a
);
}
bool
is_3inputs
=
(
args
.
size
()
-
addi_ref_num
==
4
);
int8_t
beta
=
0
;
if
(
is_3inputs
)
{
...
...
@@ -71,42 +101,14 @@ argument miopen_quant_gemm::compute(context& ctx,
auto
a_lens
=
args
[
0
].
get_shape
().
lens
();
auto
b_lens
=
args
[
1
].
get_shape
().
lens
();
output_shape
.
visit_type
([
&
](
auto
as
)
{
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
dim_1
=
n_dim
-
1
;
auto
dim_0
=
n_dim
-
2
;
auto
alpha_r
=
to_rocblas_type
(
as
(
op
.
alpha
));
auto
beta_r
=
to_rocblas_type
(
as
(
beta
));
bool
transa
=
args
[
0
].
get_shape
().
transposed
();
bool
transb
=
args
[
1
].
get_shape
().
transposed
();
rocblas_int
lda
=
args
[
0
].
get_shape
().
strides
()[
transa
?
dim_1
:
dim_0
];
rocblas_int
ldb
=
args
[
1
].
get_shape
().
strides
()[
transb
?
dim_1
:
dim_0
];
rocblas_int
ldc
=
args
[
2
].
get_shape
().
strides
()[
dim_0
];
auto
out_lens
=
output_shape
.
lens
();
rocblas_int
m
=
out_lens
[
dim_0
];
rocblas_int
n
=
out_lens
[
dim_1
];
rocblas_int
k
=
args
[
0
].
get_shape
().
lens
()[
dim_1
];
auto
to_pointer
=
[
&
](
auto
&&
arg
)
{
return
to_rocblas_type
(
as
.
from
(
arg
.
data
()));
};
assert
(
k
%
4
==
0
);
assert
(
!
transa
or
(
lda
%
4
==
0
));
assert
(
transb
or
(
ldb
%
4
==
0
));
// need to pack B in this scenario
if
(
!
transb
)
{
int
nb
=
4
;
for
(
int
i_m
=
0
;
i_m
<
m
;
i_m
++
)
{
for
(
int
i_k
=
0
;
i_k
<
k
;
i_k
++
)
{
A_packed
[
i_k
%
nb
+
(
i_m
+
(
i_k
/
nb
)
*
lda
)
*
nb
]
=
A
[
i_m
+
i_k
*
lda
];
}
}
}
// need to pack A in this scenario
if
(
transa
)
{
}
auto
num_matrices
=
std
::
accumulate
(
out_lens
.
rbegin
()
+
2
,
out_lens
.
rend
(),
std
::
size_t
{
1
},
std
::
multiplies
<
std
::
size_t
>
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment