Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
8e824ed1
Commit
8e824ed1
authored
May 10, 2019
by
Shucai Xiao
Browse files
fix bugs in gpu quant_dot implementation
parent
433f854a
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
33 additions
and
15 deletions
+33
-15
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
...targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
+7
-5
src/targets/gpu/device/pack.cpp
src/targets/gpu/device/pack.cpp
+2
-2
src/targets/gpu/lowering.cpp
src/targets/gpu/lowering.cpp
+19
-4
src/targets/gpu/quant_gemm.cpp
src/targets/gpu/quant_gemm.cpp
+5
-4
No files found.
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
View file @
8e824ed1
...
@@ -61,11 +61,12 @@ struct hip_tensor_descriptor
...
@@ -61,11 +61,12 @@ struct hip_tensor_descriptor
{
{
std
::
copy
(
s
.
lens
().
begin
(),
s
.
lens
().
end
(),
lens
);
std
::
copy
(
s
.
lens
().
begin
(),
s
.
lens
().
end
(),
lens
);
std
::
copy
(
s
.
strides
().
begin
(),
s
.
strides
().
end
(),
strides
);
std
::
copy
(
s
.
strides
().
begin
(),
s
.
strides
().
end
(),
strides
);
indices
.
resize
(
s
.
stride
s
().
size
());
std
::
vector
<
std
::
size_t
>
vec_idx
(
s
.
len
s
().
size
());
std
::
iota
(
indices
.
begin
(),
indices
.
end
(),
0
);
std
::
iota
(
vec_idx
.
begin
(),
vec_idx
.
end
(),
0
);
std
::
sort
(
indices
.
begin
(),
indices
.
end
(),
[
&
](
size_t
i
1
,
size_t
i2
)
{
std
::
sort
(
vec_idx
.
begin
(),
vec_idx
.
end
(),
[
&
](
size_t
i
,
size_t
j
)
{
return
strides
[
i
1
]
>
strides
[
i2
];
return
strides
[
i
]
>
strides
[
j
];
});
});
std
::
copy
(
vec_idx
.
begin
(),
vec_idx
.
end
(),
indices
);
}
}
__device__
__host__
hip_index
<
NDim
>
multi
(
size_t
idx
)
const
__device__
__host__
hip_index
<
NDim
>
multi
(
size_t
idx
)
const
...
@@ -79,6 +80,7 @@ struct hip_tensor_descriptor
...
@@ -79,6 +80,7 @@ struct hip_tensor_descriptor
}
}
return
result
;
return
result
;
}
}
__device__
__host__
size_t
linear
(
hip_index
<
NDim
>
s
)
const
__device__
__host__
size_t
linear
(
hip_index
<
NDim
>
s
)
const
{
{
size_t
idx
=
0
;
size_t
idx
=
0
;
...
@@ -88,7 +90,7 @@ struct hip_tensor_descriptor
...
@@ -88,7 +90,7 @@ struct hip_tensor_descriptor
}
}
size_t
lens
[
NDim
]
=
{};
size_t
lens
[
NDim
]
=
{};
size_t
strides
[
NDim
]
=
{};
size_t
strides
[
NDim
]
=
{};
std
::
vector
<
size_t
>
indices
{};
size_t
indices
[
NDim
]
=
{};
};
};
}
// namespace device
}
// namespace device
...
...
src/targets/gpu/device/pack.cpp
View file @
8e824ed1
...
@@ -55,8 +55,8 @@ void pack_b(hipStream_t stream, const argument& result, const argument& arg)
...
@@ -55,8 +55,8 @@ void pack_b(hipStream_t stream, const argument& result, const argument& arg)
gs_launch
(
stream
,
nelements
)([
=
](
auto
ii
)
{
gs_launch
(
stream
,
nelements
)([
=
](
auto
ii
)
{
const
size_t
nb
=
4
;
const
size_t
nb
=
4
;
auto
idx
=
desc
.
multi
(
ii
);
auto
idx
=
desc
.
multi
(
ii
);
std
::
size_t
i_n
=
idx
[
1
];
std
::
size_t
i_n
=
idx
[
dim_0
];
std
::
size_t
i_k
=
idx
[
0
];
std
::
size_t
i_k
=
idx
[
dim_1
];
std
::
size_t
offset
=
ii
/
m_size
*
m_size
;
std
::
size_t
offset
=
ii
/
m_size
*
m_size
;
out_ptr
[
i_k
%
nb
+
(
i_n
+
(
i_k
/
nb
)
*
ldb
)
*
nb
+
offset
]
=
out_ptr
[
i_k
%
nb
+
(
i_n
+
(
i_k
/
nb
)
*
ldb
)
*
nb
+
offset
]
=
in_ptr
[
i_n
+
i_k
*
ldb
+
offset
];
in_ptr
[
i_n
+
i_k
*
ldb
+
offset
];
...
...
src/targets/gpu/lowering.cpp
View file @
8e824ed1
...
@@ -178,11 +178,26 @@ struct miopen_apply
...
@@ -178,11 +178,26 @@ struct miopen_apply
auto
&&
op
=
any_cast
<
op
::
quant_dot
>
(
ins
->
get_operator
());
auto
&&
op
=
any_cast
<
op
::
quant_dot
>
(
ins
->
get_operator
());
std
::
vector
<
instruction_ref
>
refs
=
ins
->
inputs
();
std
::
vector
<
instruction_ref
>
refs
=
ins
->
inputs
();
// add additional arguments if need packing
// add additional arguments if need packing. Since lowering is added
if
(
refs
.
at
(
0
)
->
get_shape
().
transposed
())
// after auto_contiguous and before eliminate contiguous, the shapes
// of all inputs are standard, so the input shape cannot be transposed.
// To avoid that, we need to check whether this argument is an output
// of contiguous. If true, we should check the shape of the input
// of the contiguous operator.
auto
prev_ins
=
refs
.
at
(
0
);
if
(
prev_ins
->
name
()
==
"gpu::contiguous"
)
{
{
auto
pack_a
=
insert_allocation
(
refs
.
at
(
0
),
refs
.
at
(
0
)
->
get_shape
());
auto
input
=
prev_ins
->
inputs
().
front
();
refs
.
push_back
(
pack_a
);
if
(
input
->
get_shape
().
transposed
())
{
auto
pack_a
=
insert_allocation
(
input
,
input
->
get_shape
());
// replace one of the inputs of quant_gemm from the output to the
// input of contiguous. Then the contiguous could become dead code
// of prev_ins is its only output
refs
.
at
(
0
)
=
input
;
instruction
::
replace_argument
(
ins
,
prev_ins
,
input
);
refs
.
push_back
(
pack_a
);
}
}
}
if
(
!
refs
.
at
(
1
)
->
get_shape
().
transposed
())
if
(
!
refs
.
at
(
1
)
->
get_shape
().
transposed
())
...
...
src/targets/gpu/quant_gemm.cpp
View file @
8e824ed1
...
@@ -75,6 +75,7 @@ argument miopen_quant_gemm::compute(context& ctx,
...
@@ -75,6 +75,7 @@ argument miopen_quant_gemm::compute(context& ctx,
const
std
::
vector
<
argument
>&
args
)
const
const
std
::
vector
<
argument
>&
args
)
const
{
{
// handling the packing of B MUST be before handling that for A
// handling the packing of B MUST be before handling that for A
auto
arg_res
=
args
.
back
();
bool
transa
=
args
[
0
].
get_shape
().
transposed
();
bool
transa
=
args
[
0
].
get_shape
().
transposed
();
bool
transb
=
args
[
1
].
get_shape
().
transposed
();
bool
transb
=
args
[
1
].
get_shape
().
transposed
();
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
n_dim
=
output_shape
.
lens
().
size
();
...
@@ -82,7 +83,7 @@ argument miopen_quant_gemm::compute(context& ctx,
...
@@ -82,7 +83,7 @@ argument miopen_quant_gemm::compute(context& ctx,
auto
dim_0
=
n_dim
-
2
;
auto
dim_0
=
n_dim
-
2
;
rocblas_int
lda
=
args
[
0
].
get_shape
().
strides
()[
transa
?
dim_1
:
dim_0
];
rocblas_int
lda
=
args
[
0
].
get_shape
().
strides
()[
transa
?
dim_1
:
dim_0
];
rocblas_int
ldb
=
args
[
1
].
get_shape
().
strides
()[
transb
?
dim_1
:
dim_0
];
rocblas_int
ldb
=
args
[
1
].
get_shape
().
strides
()[
transb
?
dim_1
:
dim_0
];
rocblas_int
ldc
=
arg
s
[
2
]
.
get_shape
().
strides
()[
dim_0
];
rocblas_int
ldc
=
arg
_res
.
get_shape
().
strides
()[
dim_0
];
auto
arg_b
=
args
.
at
(
1
);
auto
arg_b
=
args
.
at
(
1
);
std
::
size_t
pack_arg_num
=
0
;
std
::
size_t
pack_arg_num
=
0
;
...
@@ -147,7 +148,7 @@ argument miopen_quant_gemm::compute(context& ctx,
...
@@ -147,7 +148,7 @@ argument miopen_quant_gemm::compute(context& ctx,
to_pointer
(
args
[
2
]),
to_pointer
(
args
[
2
]),
rocblas_datatype_i32_r
,
rocblas_datatype_i32_r
,
ldc
,
ldc
,
(
is_3inputs
?
to_pointer
(
args
[
3
])
:
to_pointer
(
args
[
2
])
),
to_pointer
(
arg_res
),
rocblas_datatype_i32_r
,
rocblas_datatype_i32_r
,
ldc
,
ldc
,
rocblas_datatype_i32_r
,
rocblas_datatype_i32_r
,
...
@@ -180,7 +181,7 @@ argument miopen_quant_gemm::compute(context& ctx,
...
@@ -180,7 +181,7 @@ argument miopen_quant_gemm::compute(context& ctx,
rocblas_datatype_i32_r
,
rocblas_datatype_i32_r
,
ldc
,
ldc
,
m
*
n
,
m
*
n
,
(
is_3inputs
?
to_pointer
(
args
[
3
])
:
to_pointer
(
args
[
2
])
),
to_pointer
(
arg_res
),
rocblas_datatype_i32_r
,
rocblas_datatype_i32_r
,
ldc
,
ldc
,
m
*
n
,
m
*
n
,
...
@@ -194,7 +195,7 @@ argument miopen_quant_gemm::compute(context& ctx,
...
@@ -194,7 +195,7 @@ argument miopen_quant_gemm::compute(context& ctx,
}
}
});
});
return
(
is_3inputs
?
args
[
3
]
:
args
[
2
])
;
return
arg_res
;
}
}
}
// namespace gpu
}
// namespace gpu
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment