Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
23cb7917
Unverified
Commit
23cb7917
authored
Aug 16, 2023
by
Brian Pickrell
Committed by
GitHub
Aug 16, 2023
Browse files
Merge branch 'develop' into blas_tuning
parents
b5fcc0bc
ea32ca70
Changes
458
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
429 additions
and
71 deletions
+429
-71
src/targets/gpu/jit/gathernd.cpp
src/targets/gpu/jit/gathernd.cpp
+2
-2
src/targets/gpu/jit/layernorm.cpp
src/targets/gpu/jit/layernorm.cpp
+2
-2
src/targets/gpu/jit/mlir.cpp
src/targets/gpu/jit/mlir.cpp
+19
-6
src/targets/gpu/jit/pad.cpp
src/targets/gpu/jit/pad.cpp
+2
-2
src/targets/gpu/jit/pointwise.cpp
src/targets/gpu/jit/pointwise.cpp
+7
-8
src/targets/gpu/jit/reduce.cpp
src/targets/gpu/jit/reduce.cpp
+6
-6
src/targets/gpu/jit/roialign.cpp
src/targets/gpu/jit/roialign.cpp
+2
-2
src/targets/gpu/jit/scatternd.cpp
src/targets/gpu/jit/scatternd.cpp
+9
-9
src/targets/gpu/jit/softmax.cpp
src/targets/gpu/jit/softmax.cpp
+2
-2
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+12
-0
src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp
src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp
+164
-0
src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp
src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp
+72
-0
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
+4
-2
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
...rgets/gpu/kernels/include/migraphx/kernels/functional.hpp
+11
-2
src/targets/gpu/kernels/include/migraphx/kernels/gemm_batcher.hpp
...ets/gpu/kernels/include/migraphx/kernels/gemm_batcher.hpp
+92
-0
src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
+0
-4
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+8
-0
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+5
-15
src/targets/gpu/kernels/include/migraphx/kernels/print.hpp
src/targets/gpu/kernels/include/migraphx/kernels/print.hpp
+2
-2
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+8
-7
No files found.
src/targets/gpu/jit/gathernd.cpp
View file @
23cb7917
...
...
@@ -44,7 +44,7 @@ namespace migraphx {
extern "C" {
__global__
void gathernd_kernel(void* in_data, void* in_indices, void* output)
MIGRAPHX_GLOBAL
void gathernd_kernel(void* in_data, void* in_indices, void* output)
{
make_tensors()(in_data, in_indices, output)([](auto&&... xs) {
auto settings = make_gathernd_settings(MIGRAPHX_MAKE_CONSTANT(int64_t{BATCH_DIMS}));
...
...
@@ -82,7 +82,7 @@ struct gathernd_compiler : compiler<gathernd_compiler>
compiler_replace
compile
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
op
)
const
{
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
())
)
;
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
());
}
};
...
...
src/targets/gpu/jit/layernorm.cpp
View file @
23cb7917
...
...
@@ -48,7 +48,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__
void ${kernel}(${params})
MIGRAPHX_GLOBAL
void ${kernel}(${params})
{
transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto... xs) {
${layernorm}<${axis}>(${post}, ${eps}, xs...);
...
...
@@ -122,7 +122,7 @@ struct layernorm_compiler : compiler<layernorm_compiler>
v
[
"kernel"
]
=
v
[
"layernorm"
].
to
<
std
::
string
>
()
+
"_"
+
generate_name_from_ops
(
*
pm
)
+
"_kernel"
;
}
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
v
)
)
;
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
v
);
}
};
...
...
src/targets/gpu/jit/mlir.cpp
View file @
23cb7917
...
...
@@ -36,19 +36,32 @@ struct mlir_compiler : compiler<mlir_compiler>
operation
compile_op
(
context
&
,
const
std
::
vector
<
shape
>&
,
const
value
&
)
const
{
return
{};
}
compiler_replace
compile
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
)
const
compiler_replace
compile
(
const
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
,
const
value
&
solution
)
const
{
auto
*
smod
=
ins
->
module_inputs
().
front
();
assert
(
smod
->
get_parameter_names
().
size
()
==
ins
->
inputs
().
size
()
-
1
);
return
insert
(
compile_mlir
(
ctx
,
*
smod
,
ins
->
inputs
()));
return
insert
(
compile_mlir
(
ctx
,
*
smod
,
ins
->
inputs
()
,
solution
));
}
compiler_replace
insert
(
code_object_op
co
)
const
{
return
[
co
=
std
::
move
(
co
)](
module
&
m
,
instruction_ref
ins
)
{
auto
mlir
=
insert_mlir
(
m
,
ins
,
co
,
ins
->
inputs
());
m
.
replace_instruction
(
ins
,
mlir
);
};
return
{
std
::
move
(
co
),
[](
module
&
m
,
instruction_ref
ins
,
const
operation
&
op
)
{
auto
mlir
=
insert_mlir
(
m
,
ins
,
any_cast
<
code_object_op
>
(
op
),
ins
->
inputs
());
m
.
replace_instruction
(
ins
,
mlir
);
}};
}
optional
<
tuning_config
>
get_tuning_config
(
const
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
,
bool
exhaustive
)
const
{
if
(
not
exhaustive
)
return
nullopt
;
auto
shapes
=
to_shapes
(
ins
->
inputs
());
auto
*
smod
=
ins
->
module_inputs
().
front
();
return
get_tuning_config_mlir
(
ctx
,
*
smod
,
shapes
);
}
};
...
...
src/targets/gpu/jit/pad.cpp
View file @
23cb7917
...
...
@@ -44,7 +44,7 @@ static const char* const pointwise_kernel = R"__migraphx__(
namespace migraphx {
extern "C" {
__global__
void pad_kernel(void* input_p, void* output_p)
MIGRAPHX_GLOBAL
void pad_kernel(void* input_p, void* output_p)
{
auto offsets = index_ints<${offsets}>{};
auto idx = make_index();
...
...
@@ -92,7 +92,7 @@ struct pad_compiler : compiler<pad_compiler>
compiler_replace
compile
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
op
)
const
{
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
())
)
;
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
());
}
};
}
// namespace gpu
...
...
src/targets/gpu/jit/pointwise.cpp
View file @
23cb7917
...
...
@@ -44,7 +44,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__
void ${kernel}(${params})
MIGRAPHX_GLOBAL
void ${kernel}(${params})
{
auto idx = make_index();
pointwise(idx, ${transformers})(${lambda}, ${args});
...
...
@@ -72,7 +72,7 @@ struct pointwise_compiler : compiler<pointwise_compiler>
hip_compile_options
options
;
options
.
inputs
=
inputs
;
options
.
output
=
inputs
.
back
();
options
.
virtual_inputs
=
reduce_dims
(
inputs
);
options
.
virtual_inputs
=
reduce_dims
(
normalize_permutation
(
inputs
)
)
;
options
.
params
=
"-Wno-float-equal"
;
auto
axis
=
find_fast_axis
(
options
.
virtual_inputs
);
auto
vec
=
vectorize
::
elements
(
ctx
,
axis
,
options
.
virtual_inputs
);
...
...
@@ -93,10 +93,10 @@ struct pointwise_compiler : compiler<pointwise_compiler>
{
if
(
contains
({
"layout"
,
"contiguous"
},
op
.
name
()))
{
return
replace
(
compile_op
(
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
{{
"lambda"
,
"[](auto x) { return x; }"
},
{
"kernel"
,
op
.
name
()
+
"_kernel"
}})
)
;
{{
"lambda"
,
"[](auto x) { return x; }"
},
{
"kernel"
,
op
.
name
()
+
"_kernel"
}});
}
else
{
...
...
@@ -105,10 +105,9 @@ struct pointwise_compiler : compiler<pointwise_compiler>
auto
pf
=
generate_pointwise
(
*
pm
,
"inner_pointwise"
);
std
::
string
lambda
=
"MIGRAPHX_LIFT(inner_pointwise)"
;
auto
kernel_name
=
generate_name_from_ops
(
*
pm
)
+
"_kernel"
;
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
{{
"lambda"
,
lambda
},
{
"preamble"
,
pf
},
{
"kernel"
,
kernel_name
}}));
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
{{
"lambda"
,
lambda
},
{
"preamble"
,
pf
},
{
"kernel"
,
kernel_name
}});
}
}
};
...
...
src/targets/gpu/jit/reduce.cpp
View file @
23cb7917
...
...
@@ -45,7 +45,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__
void reduce_kernel(void* input_p, void* output_p)
MIGRAPHX_GLOBAL
void reduce_kernel(void* input_p, void* output_p)
{
transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
...
...
@@ -84,7 +84,7 @@ static shape get_reduced_shape(const shape& s, const std::vector<T>& axes)
std
::
fill
(
lens
.
begin
(),
lens
.
end
(),
1
);
for
(
const
auto
&
axis
:
axes
)
lens
[
axis
]
=
s
.
lens
()[
axis
];
return
s
hape
{
s
.
type
(),
lens
}
;
return
s
.
with_lens
(
lens
)
;
}
template
<
class
T
>
...
...
@@ -93,7 +93,7 @@ static shape get_output_shape(const shape& s, const std::vector<T>& axes)
auto
lens
=
s
.
lens
();
for
(
const
auto
&
axis
:
axes
)
lens
[
axis
]
=
1
;
return
s
hape
{
s
.
type
(),
lens
}
;
return
s
.
with_lens
(
lens
)
;
}
template
<
class
ReduceLens
>
...
...
@@ -189,7 +189,7 @@ struct simple_reduce_compiler : compiler<simple_reduce_compiler>
v
[
"read"
]
=
r
.
read
;
v
[
"write"
]
=
r
.
write
;
v
[
"init"
]
=
r
.
init
;
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
v
)
)
;
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
v
);
}
};
...
...
@@ -228,7 +228,7 @@ struct fused_reduce_compiler : compiler<fused_reduce_compiler>
auto
virtual_inputs
=
inputs
;
virtual_inputs
.
push_back
(
get_reduced_shape
(
inputs
.
front
(),
axes
));
virtual_inputs
.
push_back
(
get_output_shape
(
inputs
.
front
(),
axes
));
virtual_inputs
=
reduce_dims
(
virtual_inputs
);
virtual_inputs
=
reduce_dims
(
normalize_permutation
(
virtual_inputs
)
)
;
auto
reduce_output_shape
=
virtual_inputs
.
back
();
virtual_inputs
.
pop_back
();
auto
reduction_shape
=
virtual_inputs
.
back
();
...
...
@@ -285,7 +285,7 @@ struct fused_reduce_compiler : compiler<fused_reduce_compiler>
v
[
"preamble"
]
=
generate_reduce
(
*
rm
,
"fused_reduce_op"
);
v
[
"lambda"
]
=
"MIGRAPHX_LIFT(fused_reduce_op)"
;
v
[
"kernel"
]
=
generate_name_from_ops
(
*
rm
)
+
"_kernel"
;
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
v
)
)
;
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
v
);
}
};
}
// namespace gpu
...
...
src/targets/gpu/jit/roialign.cpp
View file @
23cb7917
...
...
@@ -41,7 +41,7 @@ namespace migraphx {
extern "C" {
__global__
void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
MIGRAPHX_GLOBAL
void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
{
make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... xs) {
auto settings = make_roalign_settings(MIGRAPHX_MAKE_CONSTANT(float{ROIS_OFFSET}),
...
...
@@ -92,7 +92,7 @@ struct roialign_compiler : compiler<roialign_compiler>
compiler_replace
compile
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
op
)
const
{
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
())
)
;
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
());
}
};
...
...
src/targets/gpu/jit/scatternd.cpp
View file @
23cb7917
...
...
@@ -42,7 +42,7 @@ namespace migraphx {
extern "C" {
__global__
void scatternd_kernel(void* in_indices, void* in_updates, void* output)
MIGRAPHX_GLOBAL
void scatternd_kernel(void* in_indices, void* in_updates, void* output)
{
make_tensors()(in_indices, in_updates, output)([](auto&&... xs) {
scatternd(xs..., ${reduction}{});
...
...
@@ -85,15 +85,15 @@ struct scatternd_compiler : compiler<scatternd_compiler>
{{
"reduction"
,
reduction
}}));
}
compiler_replace
insert
(
const
operation
&
o
p
)
const
compiler_replace
insert
(
const
operation
&
c
o
)
const
{
return
[
=
](
module
&
m
,
instruction_ref
ins
)
{
auto
args
=
ins
->
inputs
();
args
.
back
()
=
m
.
insert_instruction
(
ins
,
make_op
(
"hip::copy"
),
args
.
front
(),
args
.
back
());
args
.
erase
(
args
.
begin
());
return
m
.
replace_instruction
(
ins
,
op
,
args
);
};
return
{
co
,
[](
module
&
m
,
instruction_ref
ins
,
const
operation
&
op
)
{
auto
args
=
ins
->
inputs
();
args
.
back
()
=
m
.
insert_instruction
(
ins
,
make_op
(
"hip::copy"
),
args
.
front
(),
args
.
back
());
args
.
erase
(
args
.
begin
());
return
m
.
replace_instruction
(
ins
,
op
,
args
);
}
};
}
};
...
...
src/targets/gpu/jit/softmax.cpp
View file @
23cb7917
...
...
@@ -45,7 +45,7 @@ static const char* const softmax_kernel = R"__migraphx__(
namespace migraphx {
extern "C" {
__global__
void softmax_kernel(void* input_p, void* output_p)
MIGRAPHX_GLOBAL
void softmax_kernel(void* input_p, void* output_p)
{
transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
softmax<${axis}>(input, output);
...
...
@@ -95,7 +95,7 @@ struct softmax_compiler : compiler<softmax_compiler>
compiler_replace
compile
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
op
)
const
{
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
())
)
;
return
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
op
.
to_value
());
}
};
...
...
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
View file @
23cb7917
...
...
@@ -272,6 +272,18 @@ struct integral_const_array : array<T, sizeof...(Xs)>
MIGRAPHX_DEVICE_CONSTEXPR
integral_const_array
()
:
base_array
({
Xs
...})
{}
};
template
<
class
T
,
class
...
Ts
>
constexpr
auto
make_const_array
(
T
x
,
Ts
...
xs
)
{
return
integral_const_array
<
typename
T
::
value_type
,
x
,
xs
...
>
{};
}
template
<
class
T
,
T
...
Xs
,
class
F
>
constexpr
auto
unpack
(
integral_const_array
<
T
,
Xs
...
>
,
F
f
)
{
return
f
(
_c
<
Xs
>
...);
}
template
<
class
T
,
T
...
Xs
,
class
F
>
constexpr
auto
transform
(
integral_const_array
<
T
,
Xs
...
>
,
F
f
)
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp
0 → 100644
View file @
23cb7917
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_KERNELS_CK_HPP
#define MIGRAPHX_GUARD_KERNELS_CK_HPP
#include <migraphx/kernels/debug.hpp>
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <ck/utility/common_header.hpp>
#include <ck/tensor_description/tensor_descriptor.hpp>
#include <ck/tensor_description/tensor_descriptor_helper.hpp>
#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
namespace
migraphx
{
namespace
detail
{
/// Trait mapping an element type to the type handed to CK.
/// The primary template leaves the type unchanged.
template <class Elem>
struct to_ck_type_impl
{
    using type = Elem;
};

/// migraphx::half is mapped to ck::half_t.
template <>
struct to_ck_type_impl<migraphx::half>
{
    using type = ck::half_t;
};

/// A const-qualified type maps to the const-qualified mapping of the
/// unqualified type.
template <class Elem>
struct to_ck_type_impl<const Elem>
{
    using type = const typename to_ck_type_impl<Elem>::type;
};
/// Reports whether the last stride of a default-constructed Shape is 1.
///
/// Asserts that the shape has at least two strides. When the last stride is
/// 1 the shape must not report is_transposed() and the result is true.
/// Otherwise the second-to-last stride is asserted to be 1 and the result is
/// false.
template <class Shape>
constexpr bool is_row_major()
{
    constexpr auto shape_strides = Shape{}.strides;
    MIGRAPHX_ASSERT(shape_strides.size() >= 2);
    if(not(shape_strides.back() == 1))
    {
        // Not unit stride in the last dimension: the one before it must be.
        MIGRAPHX_ASSERT(shape_strides[shape_strides.size() - 2] == 1);
        return false;
    }
    MIGRAPHX_ASSERT(not Shape{}.is_transposed());
    return true;
}
}
// namespace detail
/// Shorthand for the nested `type` of detail::to_ck_type_impl<Elem>.
template <class Elem>
using to_ck_type = typename detail::to_ck_type_impl<Elem>::type;
template
<
class
T
>
constexpr
auto
to_ck_pointer
(
T
*
x
)
{
return
static_cast
<
to_ck_type
<
T
>*>
(
x
);
}
template
<
class
T
>
constexpr
auto
to_ck_const_pointer
(
const
T
*
x
)
{
return
static_cast
<
const
to_ck_type
<
T
>*>
(
x
);
}
/// Selects the CK gemm layout tag for a shape: RowMajor when
/// detail::is_row_major reports true for get_shape_c<S>, ColumnMajor
/// otherwise.
template <class S>
using to_ck_gemm_layout = conditional_t<detail::is_row_major<get_shape_c<S>>(),
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::ColumnMajor>;
/// Builds a CK naive tensor descriptor from the lens and strides of the
/// shape obtained via get_shape_c<Tensor>.
///
/// NOTE(review): presumably sequence(n, f) invokes f with the index
/// constants 0..n-1 -- confirm against its definition.
template <class Tensor>
constexpr auto to_ck_tensor()
{
    constexpr auto tshape = get_shape_c<Tensor>{};
    return sequence(tshape.lens.size(), [&](auto... dims) {
        return ck::make_naive_tensor_descriptor(ck::make_tuple(tshape.lens[dims]...),
                                                ck::make_tuple(tshape.strides[dims]...));
    });
}
/// Adapts a value-returning callable F to an out-parameter call style:
/// the result of F(args...) is assigned to the first argument instead of
/// being returned. The adaptor inherits from F.
template <class F>
struct ck_function_adaptor : F
{
    /// Forwards all constructor arguments to the base callable F.
    template <class... Args>
    constexpr ck_function_adaptor(Args&&... args) : F(static_cast<Args&&>(args)...)
    {
    }

    /// Calls the wrapped callable with `args` and stores the result in `out`.
    template <class Out, class... Args>
    constexpr void operator()(Out& out, Args&&... args) const
    {
        const F& fn = *this;
        out         = fn(static_cast<Args&&>(args)...);
    }
};
/// Unary functor that accepts a reference and leaves it untouched.
struct ck_nop
{
    template <class Value>
    constexpr void operator()(Value&) const
    {
    }
};
/// Binary functor that assigns the input `x` to the output `y` unchanged.
struct ck_passthrough
{
    template <class Out, class In>
    constexpr void operator()(Out& y, In x) const
    {
        y = x;
    }
};
/// Binary functor that writes the input multiplied by a stored factor.
struct ck_scale
{
    /// Stores `s` as the scale factor.
    constexpr ck_scale(float s) : scale(s) {}

    /// Assigns `x * scale` to `y`; the factor is first converted to the
    /// input type In, so for integral In it is truncated before multiplying.
    template <class Out, class In>
    constexpr void operator()(Out& y, In x) const
    {
        const In factor = static_cast<In>(scale);
        y               = x * factor;
    }

    float scale;
};
/// Binary functor that accumulates the input `x` into the output `y`
/// using operator+=.
struct ck_add
{
    template <class Out, class In>
    constexpr void operator()(Out& y, In x) const
    {
        y += x;
    }
};
// MIGRAPHX_CK_STATIC_ASSERT expands to static_assert when MIGRAPHX_CK_CHECK is
// defined; otherwise it swallows its arguments and expands to nothing.
#ifdef MIGRAPHX_CK_CHECK
#define MIGRAPHX_CK_STATIC_ASSERT static_assert
#else
#define MIGRAPHX_CK_STATIC_ASSERT(...)
#endif
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_CK_HPP
src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp
0 → 100644
View file @
23cb7917
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/algorithm.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/ck.hpp>
#include <migraphx/kernels/gemm_batcher.hpp>
namespace
migraphx
{
// CK orders the B matrix as N,K rather than K,N, so the two entries of
// `dims` are swapped into a new constant array.
template <class Dims>
constexpr auto ck_transposeb_dims(Dims dims)
{
    return unpack(dims, [](auto dim_k, auto dim_n) { return make_const_array(dim_n, dim_k); });
}
/// Shape type obtained by applying ck_transposeb_dims to both the lens and
/// the strides of the shape of Tensor.
template <class Tensor>
using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c<Tensor>{}.lens),
                                          ck_transposeb_dims(get_shape_c<Tensor>{}.strides)));
/// Runs the CK gemm implementation G on a single set of matrices.
///
/// A descriptor is built at compile time from the tensor descriptors of A,
/// the N,K-reordered B, the extra inputs Ds and the output E; compilation
/// fails if the descriptor reports itself invalid. G::Run is then called
/// with the data pointers of the inputs (as const) and of the output `e`.
template <class G, class E, class A, class B, class... Ds>
__device__ void ck_gemm_matrix(E e, A a, B b, Ds... ds)
{
    constexpr auto descriptor = G::make_descriptor(to_ck_tensor<A>(),
                                                   to_ck_tensor<ck_transposeb<B>>(),
                                                   ck::make_tuple(to_ck_tensor<Ds>()...),
                                                   to_ck_tensor<E>());

    static_assert(descriptor.IsValid(), "Invalid ck gemm.");

    G::Run(descriptor,
           to_ck_const_pointer(a.data()),
           to_ck_const_pointer(b.data()),
           ck::make_tuple(to_ck_const_pointer(ds.data())...),
           to_ck_pointer(e.data()));
}
/// Entry point for a CK gemm: hands the tensors `xs` to gemm_batch_args
/// together with the current index and BlocksPerBatch, and runs
/// ck_gemm_matrix<G> on whatever tensors the batcher passes back.
template <class G, index_int BlocksPerBatch, class... Ts>
__device__ void ck_gemm(Ts... xs)
{
    gemm_batch_args(make_index(), _c<BlocksPerBatch>, xs...)(
        [](auto... matrices) { ck_gemm_matrix<G>(matrices...); });
}
}
// namespace migraphx
#endif
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
View file @
23cb7917
...
...
@@ -122,12 +122,14 @@ struct source_location_capture
{
T
x
;
source_location
loc
;
template
<
class
U
,
class
=
decltype
(
T
(
U
{}
))>
// declval is a workaround since default constructor for "U" is not working with rocm-5.6
template
<
class
U
>
static
U
&&
declval
();
template
<
class
U
,
class
=
decltype
(
T
(
declval
<
U
>()))
>
constexpr
source_location_capture
(
U
px
,
source_location
ploc
=
source_location
{})
:
x
(
px
),
loc
(
ploc
)
{
}
constexpr
operator
source_location
()
const
{
return
loc
;
}
constexpr
operator
T
()
const
{
return
x
;
}
...
...
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
View file @
23cb7917
...
...
@@ -32,8 +32,17 @@
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
[](auto&&... private_lisft_xs) MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lisft_xs)>(private_lisft_xs)...))
[](auto&&... private_lifts_xs) MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lifts_xs)>(private_lifts_xs)...))
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT_CLASS(name, ...) \
struct name \
{ \
template <class... PrivateLiftTs> \
constexpr auto operator()(PrivateLiftTs&&... private_lifts_xs) const MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lifts_xs)>(private_lifts_xs)...)) \
}
namespace
migraphx
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/gemm_batcher.hpp
0 → 100644
View file @
23cb7917
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
#define MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/index.hpp>
namespace
migraphx
{
/// Builds a shape from the leading lens and strides of Tensor's shape,
/// taking (rank - 2) entries of each.
///
/// NOTE(review): presumably sequence(n, f) invokes f with the index
/// constants 0..n-1, i.e. every dimension except the last two -- confirm
/// against its definition.
template <class Tensor>
constexpr auto gemm_get_batches()
{
    constexpr auto all_lens    = get_shape_c<Tensor>{}.lens;
    constexpr auto all_strides = get_shape_c<Tensor>{}.strides;

    constexpr auto batch_lens = sequence(all_lens.size() - _c<2>, [&](auto... dims) {
        return make_const_array(_c<all_lens[dims]>...);
    });
    constexpr auto batch_strides = sequence(all_strides.size() - _c<2>, [&](auto... dims) {
        return make_const_array(_c<all_strides[dims]>...);
    });

    return make_shape(batch_lens, batch_strides);
}
/// Builds a two-dimensional shape from the last two lens and the last two
/// strides of Tensor's shape.
template <class Tensor>
constexpr auto gemm_get_matrix()
{
    constexpr auto all_lens    = get_shape_c<Tensor>{}.lens;
    constexpr auto all_strides = get_shape_c<Tensor>{}.strides;

    // Positions of the second-to-last and last dimensions.
    constexpr auto row_dim = all_lens.size() - _c<2>;
    constexpr auto col_dim = all_lens.size() - _c<1>;

    constexpr auto matrix_lens = make_const_array(_c<all_lens[row_dim]>, _c<all_lens[col_dim]>);
    constexpr auto matrix_strides =
        make_const_array(_c<all_strides[row_dim]>, _c<all_strides[col_dim]>);

    return make_shape(matrix_lens, matrix_strides);
}
/// Returns a tensor view over one matrix of `t`.
///
/// The view starts at t.data() advanced by the batch shape's index for `i`
/// and uses the matrix shape from gemm_get_matrix. Asserts that this offset
/// plus the matrix's element space stays within t's element space.
template <class Tensor, class T>
constexpr auto gemm_batch_slice(Tensor t, T i)
{
    constexpr auto batch_shape  = gemm_get_batches<Tensor>();
    constexpr auto matrix_shape = gemm_get_matrix<Tensor>();

    const auto offset = batch_shape.index(i);
    MIGRAPHX_ASSERT((offset + matrix_shape.element_space()) <= t.get_shape().element_space());
    return make_tensor_view(t.data() + offset, matrix_shape);
}
/// Returns a callable that applies `f` to the tensors, one matrix slice at
/// a time when the tensors have more than two dimensions.
///
/// All tensors must have the same rank (checked at compile time). For rank
/// greater than 2 every tensor must also have the same number of batch
/// elements; idx.group_stride is then run over `bpb * batch elements`, and
/// for each group index `g` the function `f` receives the slices for batch
/// `g / bpb`. For rank 2 or less, `f` is called once with the tensors as
/// given.
template <class BlocksPerBatch, class T, class... Ts>
constexpr auto gemm_batch_args(index idx, BlocksPerBatch bpb, T x, Ts... xs)
{
    return [=](auto f) {
        // Every tensor is required to have the same number of dimensions.
        static_assert(
            (true and ... and (get_shape_c<T>{}.lens.size() == get_shape_c<Ts>{}.lens.size())));
        if constexpr(get_shape_c<T>{}.lens.size() > 2)
        {
            // The first tensor's batch shape is representative, since every
            // tensor must have the same number of batch elements.
            constexpr auto batch_shape = gemm_get_batches<T>();
            static_assert(
                (true and ... and (batch_shape.elements() == gemm_get_batches<Ts>().elements())));
            idx.group_stride(bpb * batch_shape.elements(), [&](auto group_id) {
                const auto batch_id = group_id / bpb;
                f(gemm_batch_slice(x, batch_id), gemm_batch_slice(xs, batch_id)...);
            });
        }
        else
        {
            f(x, xs...);
        }
    };
}
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
View file @
23cb7917
...
...
@@ -28,10 +28,6 @@
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/math_functions.h>
#include <hip/hip_math_constants.h>
#elif defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS)
#include <hip/hip_common.h>
#include <hip/hip_math_constants.h>
#endif
#endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
View file @
23cb7917
...
...
@@ -130,6 +130,8 @@ struct index
return
blockDim
.
x
;
}
#endif
constexpr
auto
ngroup
()
const
{
return
nglobal
()
/
max_nlocal
();
}
template
<
class
N
,
class
Stride
>
static
constexpr
auto
max_stride_iterations
(
N
n
,
Stride
stride
)
{
...
...
@@ -231,6 +233,12 @@ struct index
{
for_stride
<
true
>
(
local
,
n
,
nlocal
(),
f
);
}
template
<
class
F
,
class
N
>
__device__
void
group_stride
(
N
n
,
F
f
)
const
{
for_stride
<
false
>
(
group
,
n
,
ngroup
(),
f
);
}
};
#ifdef MIGRAPHX_NLOCAL
...
...
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
View file @
23cb7917
...
...
@@ -138,7 +138,7 @@ MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, floor, ::hfloor)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
isnan
,
::
__hisnan
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
log
,
::
hlog
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
rsqrt
,
::
hrsqrt
)
//
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sin, ::hsin)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
sin
,
::
hsin
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
sqrt
,
::
hsqrt
)
// Use float to compute half overload
...
...
@@ -161,8 +161,7 @@ MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
// Map math functions to hip half2 functions
// The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
// packed into a 32-bit number. See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
// Most but not all of these math ops have operators of the same names. Ones not yet implemented
// at this time are: exp2, exp10, log2, log10, isinf
// Most but not all of these math ops have operators of the same names.
MIGRAPHX_DEVICE_MATH_HALF2
(
abs
,
::
__habs2
)
MIGRAPHX_DEVICE_MATH_HALF2
(
ceil
,
::
h2ceil
)
MIGRAPHX_DEVICE_MATH_HALF2
(
cos
,
::
h2cos
)
...
...
@@ -176,7 +175,7 @@ MIGRAPHX_DEVICE_MATH_HALF2(log, ::h2log)
MIGRAPHX_DEVICE_MATH_HALF2
(
log10
,
::
h2log10
)
MIGRAPHX_DEVICE_MATH_HALF2
(
log2
,
::
h2log2
)
MIGRAPHX_DEVICE_MATH_HALF2
(
rsqrt
,
::
h2rsqrt
)
//
MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
MIGRAPHX_DEVICE_MATH_HALF2
(
sin
,
::
h2sin
)
MIGRAPHX_DEVICE_MATH_HALF2
(
sqrt
,
::
h2sqrt
)
template
<
class
T
,
class
U
>
...
...
@@ -189,9 +188,8 @@ MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, max, ::max)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
float
,
min
,
::
min
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
double
,
max
,
::
max
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
double
,
min
,
::
min
)
// Add overloads for half that calls the float version
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
migraphx
::
half
,
max
,
::
fmaxf
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
migraphx
::
half
,
min
,
::
fminf
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
migraphx
::
half
,
max
,
::
__hmax
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
migraphx
::
half
,
min
,
::
__hmin
)
template
<
class
T
,
MIGRAPHX_REQUIRES
(
not
is_any_vec
<
T
>())
>
constexpr
auto
max
(
const
T
&
a
,
const
T
&
b
)
...
...
@@ -217,14 +215,6 @@ constexpr auto min(const T& a, const U& b)
return
min
<
common_type_t
<
T
,
U
>>
(
a
,
b
);
}
// Sin for half is broken on hip, so use cos instead
template
<
class
T
,
MIGRAPHX_REQUIRES
(
is_same
<
vec_type
<
T
>,
half
>
{})
>
constexpr
T
sin
(
T
x
)
{
constexpr
const
T
shift
=
HIP_PIO2_F
;
return
migraphx
::
cos
(
shift
-
x
);
}
MIGRAPHX_DEVICE_MATH_VEC
(
abs
)
MIGRAPHX_DEVICE_MATH_VEC
(
acos
)
MIGRAPHX_DEVICE_MATH_VEC
(
acosh
)
...
...
src/targets/gpu/kernels/include/migraphx/kernels/print.hpp
View file @
23cb7917
...
...
@@ -244,13 +244,13 @@ __device__ void print_once(Ts... xs)
template
<
class
...
Ts
>
__device__
void
println
(
Ts
...
xs
)
{
print_each
(
&
cout
ln
,
xs
...);
print_each
(
&
cout
,
xs
...
,
'\n'
);
}
template
<
class
...
Ts
>
__device__
void
println_once
(
Ts
...
xs
)
{
print_each_once
(
&
cout
ln
,
xs
...);
print_each_once
(
&
cout
,
xs
...
,
'\n'
);
}
}
// namespace migraphx
...
...
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
View file @
23cb7917
...
...
@@ -79,20 +79,21 @@ __device__ void dpp_reduce(T& in, Op op)
#endif
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE(op, prefix
)
\
#define MIGRAPHX_DPP_REDUCE(op, prefix
, sign)
\
__device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); } \
__device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); } \
__device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); } \
__device__ inline void dpp_reduce(int32_t& x, op) \
{ \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##
_u32);
\
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##
sign##32);
\
} \
__device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }
MIGRAPHX_DPP_REDUCE
(
op
::
sum
,
v_add
)
MIGRAPHX_DPP_REDUCE
(
op
::
max
,
v_max
)
MIGRAPHX_DPP_REDUCE
(
op
::
min
,
v_min
)
MIGRAPHX_DPP_REDUCE
(
op
::
product
,
v_mul
)
// Note: when max and min are in int32_t, signed version of instruction needs to be used.
MIGRAPHX_DPP_REDUCE
(
op
::
sum
,
v_add
,
_u
)
MIGRAPHX_DPP_REDUCE
(
op
::
product
,
v_mul
,
_u
)
MIGRAPHX_DPP_REDUCE
(
op
::
max
,
v_max
,
_i
)
MIGRAPHX_DPP_REDUCE
(
op
::
min
,
v_min
,
_i
)
template
<
class
Op
,
class
T
,
class
Index
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
Index
n
,
F
f
)
...
...
@@ -570,7 +571,7 @@ template <class Algo, class Reduced, class Output, class F>
__device__
void
fused_reduce
(
Output
output
,
F
f
)
{
Algo
::
template
run
<
Reduced
>([
&
](
auto
out_idx
,
auto
r
)
{
auto
result
=
f
(
r
);
auto
result
=
f
(
r
,
out_idx
);
if
constexpr
(
reduce
::
is_inner_storage
<
decltype
(
result
)
>
{})
{
r
.
inner
([
&
](
auto
&
y
,
auto
x
)
{
y
=
x
;
})(
output
,
result
);
...
...
Prev
1
…
12
13
14
15
16
17
18
19
20
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment