gaoqiong / MIGraphX / Commits / f06f6aa3

Commit f06f6aa3, authored Jul 09, 2019 by Shucai Xiao

merge changes from develop branch

Parents: 80a35596, ebfe9735
Changes: 82
Showing 20 changed files with 1165 additions and 485 deletions (+1165 −485)
src/shape.cpp                                                         +18  −0
src/simplify_reshapes.cpp                                             +179 −56
src/targets/cpu/lowering.cpp                                          +68  −53
src/targets/gpu/CMakeLists.txt                                        +7   −0
src/targets/gpu/argmax.cpp                                            +23  −0
src/targets/gpu/argmin.cpp                                            +23  −0
src/targets/gpu/device/argmax.cpp                                     +23  −0
src/targets/gpu/device/argmin.cpp                                     +23  −0
src/targets/gpu/device/concat.cpp                                     +12  −12
src/targets/gpu/device/erf.cpp                                        +18  −0
src/targets/gpu/device/gather.cpp                                     +18  −23
src/targets/gpu/device/include/migraphx/gpu/device/array.hpp          +68  −0
src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp         +42  −10
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp           +95  −271
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp         +248 −0
src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp          +105 −0
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp         +13  −56
src/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp    +59  −0
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp          +47  −4
src/targets/gpu/device/include/migraphx/gpu/device/vector.hpp         +76  −0
src/shape.cpp

@@ -138,6 +18,24 @@ std::size_t shape::index(std::size_t i) const
        return result;
    }
}

std::vector<std::size_t> shape::multi(std::size_t i) const
{
    assert(this->standard());
    std::vector<std::size_t> indices(lens().size());
    std::transform(strides().begin(),
                   strides().end(),
                   lens().begin(),
                   indices.begin(),
                   [&](std::size_t stride, std::size_t len) {
                       assert(len > 0 and stride > 0);
                       return (i / stride) % len;
                   });
    return indices;
}

bool shape::packed() const { return this->elements() == this->element_space(); }

bool shape::transposed() const
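For orientation, shape::multi above inverts the usual stride dot-product: for a standard (packed, row-major) shape it recovers the per-dimension coordinates of a linear element index. Below is a minimal host-side sketch of the same arithmetic, independent of MIGraphX; the lens/strides values are made up for illustration.

#include <cassert>
#include <cstddef>
#include <vector>

// Recover the multi-dimensional coordinates of linear index i for a
// standard (row-major, packed) layout described by lens/strides.
std::vector<std::size_t> multi_index(std::size_t i,
                                     const std::vector<std::size_t>& lens,
                                     const std::vector<std::size_t>& strides)
{
    std::vector<std::size_t> indices(lens.size());
    for(std::size_t k = 0; k < lens.size(); k++)
        indices[k] = (i / strides[k]) % lens[k];
    return indices;
}

int main()
{
    // lens {2, 3, 4} with row-major strides {12, 4, 1}: element 17 sits at (1, 1, 1).
    std::vector<std::size_t> lens{2, 3, 4};
    std::vector<std::size_t> strides{12, 4, 1};
    auto idx = multi_index(17, lens, strides);
    assert(idx[0] == 1 and idx[1] == 1 and idx[2] == 1);
}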
src/simplify_reshapes.cpp

@@ -2,14 +2,17 @@
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/op/as_shape.hpp>
#include <migraphx/op/transpose.hpp>
#include <migraphx/op/concat.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/matcher.hpp>
#include <unordered_set>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

const auto& reshaper_names()
{
    // clang-format off
    static const std::unordered_set<std::string> names = {
...
@@ -19,17 +22,10 @@ bool is_reshaper(instruction_ref ins)
        "unsqueeze"
    };
    // clang-format on
    return names;
}

bool is_reshaper(instruction_ref ins) { return contains(reshaper_names(), ins->name()); }

instruction_ref find_transpose_input(instruction_ref ins)
{
...
@@ -42,62 +38,189 @@ instruction_ref find_transpose_input(instruction_ref ins)
    return ins;
}

auto get_transpose_dims(instruction_ref ins)
{
    return any_cast<const op::transpose&>(ins->get_operator()).dims;
}

std::vector<int64_t> reorder_dims(std::vector<int64_t> dims, std::vector<int64_t> permutation)
{
    std::vector<int64_t> result(dims.size());
    assert(dims.size() == permutation.size());
    for(std::size_t i = 0; i < dims.size(); i++)
    {
        result[i] = dims[permutation[i]];
    }
    return result;
}

bool is_no_transpose(const std::vector<int64_t>& dims)
{
    if(dims.empty())
        return true;
    if(dims.front() != 0)
        return false;
    return std::adjacent_find(
               dims.begin(), dims.end(), [](auto x, auto y) { return (y - x) != 1; }) == dims.end();
}

template <class Vector, class Op>
std::vector<int64_t> sort_permutation(const Vector& data, Op op)
{
    std::vector<std::int64_t> result(data.size());
    std::iota(result.begin(), result.end(), 0);
    std::sort(result.begin(), result.end(), [&](auto x, auto y) { return op(data[x], data[y]); });
    return result;
}

std::vector<int64_t> invert_permutation(const std::vector<int64_t>& permutation)
{
    return sort_permutation(permutation, std::less<>{});
}

std::vector<int64_t> find_permutation(const shape& s)
{
    return sort_permutation(s.strides(), std::greater<>{});
}

struct find_reshaper
{
    auto matcher() const
    {
        return match::name(reshaper_names())(
            match::any_of[match::outputs()](match::name(reshaper_names())));
    }

    void apply(program& p, const match::matcher_result& mr) const
    {
        auto ins = mr.result;
        // Gather reshapes
        std::vector<instruction_ref> reshapes{ins};
        while(is_reshaper(reshapes.back()))
        {
            assert(!reshapes.back()->inputs().empty());
            assert(p.has_instruction(reshapes.back()->inputs().front()));
            auto input = reshapes.back()->inputs().front();
            reshapes.push_back(input);
        }

        std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
        for(auto start : iterator_for(reshapes))
        {
            auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
                return i->get_shape() == (*start)->get_shape() and i != (*start);
            });
            if(last != reshapes.rend())
            {
                r = std::make_pair(*start, *last);
                break;
            }
        }
        if(r.first != r.second)
        {
            p.replace_instruction(r.first, r.second);
        }
    }
};

struct find_nop_reshapes
{
    auto matcher() const
    {
        auto reshapes = reshaper_names();
        reshapes.insert("transpose");
        reshapes.insert("slice");
        return match::name(reshapes)(match::same_shape(match::arg(0)));
    }

    void apply(program& p, const match::matcher_result& mr) const
    {
        auto ins = mr.result;
        p.replace_instruction(ins, ins->inputs().front());
    }
};

struct find_transpose
{
    auto matcher() const
    {
        return match::name("transpose")(match::none_of(
            match::skip_output(match::name("contiguous"))(match::name("transpose"))));
    }

    void apply(program& p, const match::matcher_result& mr) const
    {
        auto ins = mr.result;
        auto x   = ins;
        auto t   = ins;
        std::vector<std::int64_t> dims(ins->get_shape().lens().size());
        std::iota(dims.begin(), dims.end(), 0);
        do
        {
            dims = reorder_dims(get_transpose_dims(t), dims);
            x    = t;
            t    = find_transpose_input(x);
        } while(x != t and t->name() == "transpose");
        if(t == ins or t->name() != "transpose")
            return;
        if(is_no_transpose(dims))
        {
            p.replace_instruction(ins, t->inputs().front());
        }
        else
        {
            p.replace_instruction(ins, op::transpose{{dims}}, t->inputs().front());
        }
    }
};

struct find_concat_transpose
{
    auto matcher() const
    {
        return match::name("concat")(match::same_input_shapes(),
                                     match::all_of[match::inputs()](match::transpose_shape()));
    }

    void apply(program& p, const match::matcher_result& mr) const
    {
        auto ins = mr.result;
        auto s   = ins->inputs().front()->get_shape();
        assert(s.transposed());
        auto op           = any_cast<op::concat>(ins->get_operator());
        auto permutation  = find_permutation(s);
        auto ipermutation = invert_permutation(permutation);
        op.axis           = ipermutation[op.axis];

        std::vector<instruction_ref> inputs;
        std::transform(
            ins->inputs().begin(), ins->inputs().end(), std::back_inserter(inputs), [&](auto i) {
                if(i->name() == "transpose" and i->inputs().front()->get_shape().standard())
                    return i->inputs().front();
                return p.insert_instruction(ins, op::transpose{permutation}, i);
            });
        auto concat = p.insert_instruction(ins, op, inputs);
        auto t      = p.insert_instruction(ins, op::transpose{ipermutation}, concat);
        assert(ins->get_shape().lens() == t->get_shape().lens());
        p.replace_instruction(ins, t);
    }
};

void simplify_reshapes::apply(program& p) const
{
    auto end = std::prev(p.end());
    for(auto ins : iterator_for(p))
    {
        if(ins == end and ins->name() == "contiguous")
            continue;
        // Skip possible dead instructions
        if(ins->outputs().empty() and ins != end)
            continue;
        match::find_matches(
            p, ins, find_nop_reshapes{}, find_reshaper{}, find_transpose{}, find_concat_transpose{});
    }
}
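The transpose-folding code above is permutation algebra: reorder_dims composes one transpose's dims with the permutation accumulated so far, and is_no_transpose detects when the chain has cancelled out. A small standalone sketch of that composition, under the convention that dims[i] names the input axis placed at output axis i (the values are chosen only for illustration):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// result[i] = dims[permutation[i]], as reorder_dims does above.
std::vector<int64_t> compose(const std::vector<int64_t>& dims,
                             const std::vector<int64_t>& permutation)
{
    std::vector<int64_t> result(dims.size());
    for(std::size_t i = 0; i < dims.size(); i++)
        result[i] = dims[permutation[i]];
    return result;
}

// A permutation is a no-op when it reads 0, 1, 2, ...
bool is_identity(const std::vector<int64_t>& dims)
{
    if(dims.empty())
        return true;
    if(dims.front() != 0)
        return false;
    return std::adjacent_find(dims.begin(), dims.end(), [](auto x, auto y) {
               return (y - x) != 1;
           }) == dims.end();
}

int main()
{
    // transpose{2, 0, 1} followed by transpose{1, 2, 0} cancels out,
    // so a pass like the one above can drop both and keep the original input.
    std::vector<int64_t> first{2, 0, 1};
    std::vector<int64_t> second{1, 2, 0};
    assert(is_identity(compose(first, second)));
}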
src/targets/cpu/lowering.cpp

@@ -2,7 +2,21 @@
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/op/batch_norm.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
#include <migraphx/op/logsoftmax.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/softmax.hpp>
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/par_dfor.hpp>
...
@@ -650,18 +664,11 @@ struct cpu_softmax
    std::string name() const { return "cpu::softmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto batch_lens     = output_shape.lens();
        std::size_t n_dims  = batch_lens[op.axis];
        batch_lens[op.axis] = 1;
        shape batch_shape{shape::int32_type, batch_lens};
...
@@ -669,26 +676,33 @@ struct cpu_softmax
            using value_type = typename decltype(input)::value_type;
            std::vector<value_type> batch_max(batch_shape.elements(),
                                              std::numeric_limits<value_type>::lowest());
            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
            par_for(batch_shape.elements(), [&](auto i) {
                auto idx = batch_shape.multi(i);
                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis] = j;
                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
                }

                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis]      = j;
                    std::size_t index = output_shape.index(idx);
                    output[index]     = std::exp(input[index] - batch_max[i]);
                }

                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis] = j;
                    batch_sum[i] += output(idx.begin(), idx.end());
                }

                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis] = j;
                    output(idx.begin(), idx.end()) /= batch_sum[i];
                }
            });
...
@@ -708,49 +722,50 @@ struct cpu_logsoftmax
    std::string name() const { return "cpu::logsoftmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto batch_lens     = output_shape.lens();
        std::size_t n_dims  = batch_lens[op.axis];
        batch_lens[op.axis] = 1;
        shape batch_shape{shape::int32_type, batch_lens};

        // use a parallel implementation to acheive better performance
        // one thread for one batch
        visit_all(result, args[0])([&](auto output, auto input) {
            using value_type = typename decltype(input)::value_type;
            std::vector<value_type> batch_max(batch_shape.elements(),
                                              std::numeric_limits<value_type>::lowest());
            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
            par_for(batch_shape.elements(), [&](auto i) {
                auto idx = batch_shape.multi(i);
                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis] = j;
                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
                }

                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis]      = j;
                    std::size_t index = output_shape.index(idx);
                    output[index]     = input[index] - batch_max[i];
                }

                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis] = j;
                    batch_sum[i] += std::exp(output(idx.begin(), idx.end()));
                }

                batch_sum[i] = std::log(batch_sum[i]);

                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[op.axis] = j;
                    output(idx.begin(), idx.end()) -= batch_sum[i];
                }
            });
        });
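The reworked cpu::softmax and cpu::logsoftmax parallelise over the "batch" shape obtained by setting the softmax axis length to 1, and each worker then walks that axis alone (max, exp, sum, normalise). Here is a compact standalone sketch of that per-batch walk for the simplest 2-D case; the data values are made up and this is not the MIGraphX implementation itself.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Softmax along the last axis of a rows-by-cols row-major buffer,
// processing one "batch" (row) at a time as the lowering above does.
void softmax_rows(std::vector<double>& data, std::size_t rows, std::size_t cols)
{
    for(std::size_t r = 0; r < rows; ++r)
    {
        double m = data[r * cols];
        for(std::size_t j = 0; j < cols; ++j)
            m = std::max(m, data[r * cols + j]);
        double s = 0;
        for(std::size_t j = 0; j < cols; ++j)
        {
            data[r * cols + j] = std::exp(data[r * cols + j] - m);
            s += data[r * cols + j];
        }
        for(std::size_t j = 0; j < cols; ++j)
            data[r * cols + j] /= s;
    }
}

int main()
{
    std::vector<double> x{1.0, 2.0, 3.0, 0.0, 0.0, 0.0};
    softmax_rows(x, 2, 3);
    // Each row now sums to 1.
    assert(std::abs(x[0] + x[1] + x[2] - 1.0) < 1e-9);
    assert(std::abs(x[3] + x[4] + x[5] - 1.0) < 1e-9);
}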
src/targets/gpu/CMakeLists.txt

@@ -12,9 +12,12 @@ endif()
add_library(migraphx_device
    device/add.cpp
    device/argmax.cpp
    device/argmin.cpp
    device/max.cpp
    device/min.cpp
    device/exp.cpp
    device/erf.cpp
    device/log.cpp
    device/sin.cpp
    device/cos.cpp
...
@@ -36,6 +39,7 @@ add_library(migraphx_device
    device/sub.cpp
    device/pack.cpp
    device/clip.cpp
    device/reduce_sum.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
...
@@ -44,6 +48,8 @@ target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURR
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)

add_library(migraphx_gpu
    argmax.cpp
    argmin.cpp
    eliminate_workspace.cpp
    fuse_ops.cpp
    hip.cpp
...
@@ -73,6 +79,7 @@ add_library(migraphx_gpu
    schedule_model.cpp
    adjust_allocation.cpp
    clip.cpp
    reduce_sum.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
src/targets/gpu/argmax.cpp (0 → 100644)

#include <migraphx/gpu/argmax.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2).standard();
    return op.compute_shape({inputs.at(0)});
}

argument hip_argmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    device::argmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
    return args.back();
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/argmin.cpp (0 → 100644)

#include <migraphx/gpu/argmin.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2).standard();
    return op.compute_shape({inputs.at(0)});
}

argument hip_argmin::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    device::argmin(ctx.get_stream().get(), args.back(), args.front(), op.axis);
    return args.back();
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/device/argmax.cpp (0 → 100644)

#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmax.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
    arg_op(argmax_op{}, stream, result, arg, axis);
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/device/argmin.cpp (0 → 100644)

#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/argmin.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/arg_op.hpp>
#include <migraphx/gpu/hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
    arg_op(argmin_op{}, stream, result, arg, axis);
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
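Both wrappers above funnel into a shared arg_op helper from arg_op.hpp (not shown on this page), differing only in the comparison object they pass (argmax_op vs argmin_op). The sketch below is a host-side illustration of that pattern; the comparator interface is hypothetical and chosen only for the example.

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical comparators in the spirit of argmax_op/argmin_op:
// better(a, b) answers whether a should win over b.
struct argmax_like
{
    bool better(double a, double b) const { return a > b; }
};
struct argmin_like
{
    bool better(double a, double b) const { return a < b; }
};

// One generic routine serves both reductions, mirroring how argmax()
// and argmin() above call the same helper with a different functor.
template <class Compare>
std::size_t arg_select(const std::vector<double>& v, Compare cmp)
{
    std::size_t best = 0;
    for(std::size_t i = 1; i < v.size(); ++i)
        if(cmp.better(v[i], v[best]))
            best = i;
    return best;
}

int main()
{
    std::vector<double> v{0.5, -2.0, 3.5, 1.0};
    assert(arg_select(v, argmax_like{}) == 2);
    assert(arg_select(v, argmin_like{}) == 1);
}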
src/targets/gpu/device/concat.cpp

@@ -10,22 +10,22 @@ namespace gpu {
namespace device {

argument concat(hipStream_t stream,
                const migraphx::shape&,
                std::vector<migraphx::argument> args,
                std::vector<std::size_t> offsets)
{
    auto ninputs = args.size() - 1;
    for(std::size_t j = 0; j < ninputs; j++)
    {
        auto&& arg            = args[j];
        std::size_t nelements = arg.get_shape().elements();
        auto offset           = offsets[j];
        shape arg_shape{arg.get_shape().type(), arg.get_shape().lens()};
        hip_visit_all(args.back(), arg, arg_shape)([&](auto output, auto input, auto input_shape) {
            gs_launch(stream, nelements)([=](auto i) {
                auto input_idx = input_shape.multi(i);
                auto idx       = output.get_shape().index(input_idx);
                output.data()[idx + offset] = input[input_idx];
            });
        });
    }
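The rewritten concat kernel copies each input at a precomputed element offset into the output, where the offsets correspond to where each input starts along the concat axis. Below is a small host-side sketch of how such offsets can be derived for a concat along axis 1; the shapes are made up, and the real offsets are produced by the gpu concat op, which is not shown on this page.

#include <cassert>
#include <cstddef>
#include <vector>

// For a concat along `axis`, each input starts at the running total of the
// previous inputs' extents along that axis, scaled by the output's stride.
std::vector<std::size_t> concat_offsets(const std::vector<std::vector<std::size_t>>& in_lens,
                                        const std::vector<std::size_t>& out_strides,
                                        std::size_t axis)
{
    std::vector<std::size_t> offsets;
    std::size_t along = 0;
    for(const auto& lens : in_lens)
    {
        offsets.push_back(along * out_strides[axis]);
        along += lens[axis];
    }
    return offsets;
}

int main()
{
    // Two inputs {2,3} and {2,5} concatenated on axis 1 give output {2,8}
    // with row-major strides {8,1}; the second input starts 3 elements in.
    auto offs = concat_offsets({{2, 3}, {2, 5}}, {8, 1}, 1);
    assert(offs[0] == 0 and offs[1] == 3);
}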
src/targets/gpu/device/erf.cpp (0 → 100644)

#include <migraphx/gpu/device/erf.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void erf(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::erf(to_hip_type(x)); });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/device/gather.cpp

@@ -11,35 +11,30 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int axis)
{
    auto axis_index = (axis < 0) ? (axis + arg1.get_shape().lens().size()) : axis;

    auto& input_shape = arg1.get_shape();
    auto lens         = input_shape.lens();
    lens[axis_index]  = arg2.get_shape().elements();
    shape out_comp_shape{result.get_shape().type(), lens};
    std::size_t nelements = result.get_shape().elements();

    visit_all(result, arg1)([&](auto output, auto input_v) {
        hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
            arg2.visit([&](auto indices) {
                const auto* indices_ptr = device_cast(indices.data());
                auto* output_ptr        = device_cast(output.data());
                gs_launch(stream, nelements)([=](auto i) {
                    auto idx        = out_comp.multi(i);
                    idx[axis_index] = indices_ptr[idx[axis_index]];
                    output_ptr[i]   = input[idx];
                });
            });
        });
    });

    return result;
}

} // namespace device
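The new gather kernel builds a companion shape whose gather axis has the length of the index tensor, then rewrites that one coordinate through the index array before reading the input. Here is a host-side sketch of the same index rewrite for a 2-D example; the values are made up for illustration.

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // input is 3x4 row-major (strides {4, 1}); gather 2 columns (axis = 1)
    // given by indices {3, 0}.
    std::vector<int> input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
    std::vector<std::size_t> in_strides{4, 1};
    std::vector<std::size_t> indices{3, 0};
    std::size_t axis = 1;

    // Output shape: the gather axis length becomes indices.size() -> 3x2.
    std::vector<std::size_t> out_lens{3, 2};
    std::vector<int> output(out_lens[0] * out_lens[1]);

    for(std::size_t i = 0; i < output.size(); ++i)
    {
        // Coordinates of output element i, then rewrite the axis coordinate
        // through the index array, as the kernel above does per thread.
        std::size_t idx0 = i / out_lens[1];
        std::size_t idx1 = i % out_lens[1];
        std::size_t src1 = indices[idx1];
        output[i]        = input[idx0 * in_strides[0] + src1 * in_strides[axis]];
    }

    // Row 0 of the output picks columns 3 and 0 of the input.
    assert(output[0] == 3 and output[1] == 0);
    assert(output[2] == 7 and output[3] == 4);
}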
src/targets/gpu/device/include/migraphx/gpu/device/array.hpp (0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP

#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_array
{
    T d[N];

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR std::integral_constant<std::size_t, N> size() const { return {}; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    MIGRAPHX_DEVICE_CONSTEXPR T dot(const hip_array& x) const
    {
        T result = 0;
        for(std::size_t i = 0; i < N; i++)
            result += x[i] * d[i];
        return result;
    }

    MIGRAPHX_DEVICE_CONSTEXPR T product() const
    {
        T result = 1;
        for(std::size_t i = 0; i < N; i++)
            result *= d[i];
        return result;
    }

    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator*(const hip_array& x, const hip_array& y)
    {
        hip_array result;
        for(std::size_t i = 0; i < N; i++)
            result[i] = x[i] * y[i];
        return result;
    }

    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator+(const hip_array& x, const hip_array& y)
    {
        hip_array result{};
        for(std::size_t i = 0; i < N; i++)
            result[i] = x[i] + y[i];
        return result;
    }
};

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp

@@ -11,9 +11,33 @@ namespace device {

struct index
{
    std::size_t global = 0;
    std::size_t local  = 0;
    std::size_t group  = 0;

    __device__ std::size_t nglobal() const { return blockDim.x * gridDim.x; } // NOLINT

    __device__ std::size_t nlocal() const { return blockDim.x; } // NOLINT

    template <class F>
    __device__ void global_stride(std::size_t n, F f) const
    {
        const auto stride = nglobal();
        for(std::size_t i = global; i < n; i += stride)
        {
            f(i);
        }
    }

    template <class F>
    __device__ void local_stride(std::size_t n, F f) const
    {
        const auto stride = nlocal();
        for(std::size_t i = local; i < n; i += stride)
        {
            f(i);
        }
    }
};

template <class F>
...
@@ -35,18 +59,26 @@ inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
    };
}

template <class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index idx) -> decltype(f(i, idx))
{
    return f(i, idx);
}

template <class F>
__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index) -> decltype(f(i))
{
    return f(i);
}

inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
{
    std::size_t groups  = (n + local - 1) / local;
    std::size_t nglobal = std::min<std::size_t>(256, groups) * local;

    return [=](auto f) {
        launch(stream, nglobal, local)(
            [=](auto idx) { idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); }); });
    };
}
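gs_launch above turns a flat element count into a grid-stride loop, and gs_invoke lets the user callback take either (i) or (i, idx) without two launch helpers. The sketch below is a host-side illustration of that dispatch trick; no HIP is involved and the index type is a stand-in.

#include <cassert>
#include <cstddef>
#include <vector>

struct fake_index
{
    std::size_t global = 0;
};

// Overload resolution picks whichever call is well-formed, which is the
// trick gs_invoke plays with decltype in its trailing return type.
template <class F>
auto gs_invoke(F&& f, std::size_t i, fake_index idx) -> decltype(f(i, idx))
{
    return f(i, idx);
}

template <class F>
auto gs_invoke(F&& f, std::size_t i, fake_index) -> decltype(f(i))
{
    return f(i);
}

int main()
{
    std::size_t n = 10;
    std::vector<int> out(n, 0);

    // Grid-stride style loop: "nglobal" workers each stride over the range.
    std::size_t nglobal = 4;
    for(std::size_t start = 0; start < nglobal; ++start)
    {
        fake_index idx{start};
        for(std::size_t i = idx.global; i < n; i += nglobal)
            gs_invoke([&](std::size_t j) { out[j] = 1; }, i, idx);
    }

    for(auto v : out)
        assert(v == 1);
}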
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp

(This diff is collapsed on this page.)
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp (0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP

#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

struct sum
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x + y;
    }
};

struct id
{
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
    {
        return x;
    }
};

struct max
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x > y ? x : y;
    }
};

struct min
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x < y ? x : y;
    }
};

struct lowest
{
    template <class T>
    operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::lowest());
    }
};

struct highest
{
    template <class T>
    operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::max());
    }
};

#ifdef MIGRAPHX_NO_DPP
template <std::size_t N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
{
    using type = decltype(f(idx.local));
    MIGRAPHX_DEVICE_SHARED type buffer[N];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    buffer[idx.local] = x;
    __syncthreads();

    for(std::size_t s = 1; s < idx.nlocal(); s *= 2)
    {
        const std::size_t index = 2 * s * idx.local;
        if(index + s < idx.nlocal())
        {
            buffer[index] = op(buffer[index], buffer[index + s]);
        }
        __syncthreads();
    }
    return buffer[0];
}
#else
constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; }

constexpr unsigned int dpp_row_bcast(unsigned int x)
{
    unsigned int y = 0;
    switch(x)
    {
    case 15: y = 0x142; break;
    case 31: y = 0x143; break;
    default: throw std::runtime_error("Unknown bcast");
    }
    return y;
}

template <unsigned int DppCtrl,
          unsigned int RowMask  = 0xf,
          unsigned int BankMask = 0xf,
          bool BoundCtrl        = false,
          class T>
__device__ T dpp_mov(T& x)
{
    static const std::size_t n = sizeof(T) < 4 ? 1 : sizeof(T) / 4;
    union type
    {
        uint32_t reg[n];
        T data;
    };
    type output{};
    type input{};
    // cppcheck-suppress unreadVariable
    input.data = x;
    for(std::size_t i = 0; i < n; i++)
    {
        output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
    }
    return output.data;
}

template <class T, class Op>
__device__ void dpp_reduce(T& in, Op op)
{
    T out{};
    out = dpp_mov<dpp_row_shr(1)>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(2)>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
    in  = op(in, out);
}

__device__ inline void dpp_reduce(float& x, sum)
{
#ifdef MIGRAPHX_USE_CLANG_TIDY
    (void)x;
#else
    __asm__ volatile("s_nop 4\n"
                     "v_add_f32 %0 %0 %0 row_shr:1\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:2\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
                     "s_nop 1\n"
                     : "=v"(x)
                     : "0"(x));
#endif
}

template <std::size_t N, class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
{
    using type = decltype(f(idx.local));
    MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    dpp_reduce(x, op);

    const auto ldsidx = idx.local / 64;
    if((idx.local % 64) == 63)
    {
        buffer[ldsidx] = x;
    }
    __syncthreads();

    type y = init;
    for(std::size_t i = 0; i < idx.nlocal() / 64; i++)
    {
        y = op(y, buffer[i]);
    }
    return y;
}
#endif

constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
    size_t block_size = 64;
    while(block_size < max_block_size and block_size < n)
        block_size *= 2;
    return block_size;
}

template <class Op, class T, class Input, class Output>
void reduce(hipStream_t stream,
            const argument& result,
            const argument& arg,
            Op op,
            T init,
            Input read_input,
            Output read_output)
{
    auto&& output_shape = result.get_shape();
    auto&& input_shape  = arg.get_shape();
    std::vector<std::size_t> reduce_lens;
    std::transform(output_shape.lens().begin(),
                   output_shape.lens().end(),
                   input_shape.lens().begin(),
                   std::back_inserter(reduce_lens),
                   [](auto x, auto y) -> std::size_t {
                       if(x == y)
                           return 1;
                       else
                           return y;
                   });

    shape reduce_slice{output_shape.type(), reduce_lens};
    hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) {
        auto nelements = result.get_shape().elements();
        auto relements = reduce_slice.elements();

        const std::size_t max_block_size = 256;
        const std::size_t block_size     = compute_block_size(relements, max_block_size);
        gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
            const auto out_idx = i / block_size;
            auto base_idx      = output.get_shape().multi(out_idx);
            auto r = block_reduce<max_block_size>(idx, op, init, relements, [&](auto j) __device__ {
                auto reduce_idx = reduce_shape.multi(j);
                return read_input(input[reduce_idx + base_idx]);
            });
            if(idx.local == 0)
                output.data()[out_idx] = read_output(r);
        });
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
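block_reduce above is a standard shared-memory tree reduction (the MIGRAPHX_NO_DPP path), with a wavefront-level DPP fast path when DPP is available. Here is a host-side sketch of the tree-reduction half, using an ordinary array in place of LDS and a loop in place of the thread block; the block size and data are made up.

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

// Tree reduction over a "block" of partial values: at each step with stride s,
// lane 2*s*t combines with its neighbour s away, halving the active lanes,
// mirroring the __syncthreads()-separated rounds in block_reduce.
double tree_reduce(std::vector<double> buffer)
{
    const std::size_t nlocal = buffer.size();
    for(std::size_t s = 1; s < nlocal; s *= 2)
    {
        for(std::size_t t = 0; t < nlocal; ++t)
        {
            const std::size_t index = 2 * s * t;
            if(index + s < nlocal)
                buffer[index] += buffer[index + s];
        }
        // On the GPU a __syncthreads() sits here.
    }
    return buffer[0];
}

int main()
{
    std::vector<double> partials(8);
    std::iota(partials.begin(), partials.end(), 1.0); // 1..8
    assert(tree_reduce(partials) == 36.0);
}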
src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp (0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP

#include <migraphx/gpu/device/array.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <std::size_t N>
struct hip_shape
{
    using hip_index = hip_array<std::size_t, N>;

    hip_array<std::size_t, N> lens    = {};
    hip_array<std::size_t, N> strides = {};
    bool standard                     = false;

    __device__ __host__ hip_shape() = default;

    hip_shape(const shape& s) : standard(s.standard())
    {
        assert(s.lens().size() == N);
        assert(s.strides().size() == N);
        std::copy(s.lens().begin(), s.lens().end(), lens.begin());
        std::copy(s.strides().begin(), s.strides().end(), strides.begin());
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t elements() const { return lens.product(); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(hip_index x) const { return x.dot(strides); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::initializer_list<std::size_t> x) const
    {
        std::size_t idx = 0;
        for(std::size_t i = 0; i < x.size(); i++)
            idx += *(x.begin() + i) * strides[i];
        return idx;
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::size_t i) const
    {
        if(this->standard)
            return i;
        else
        {
            const std::size_t rank = this->lens.size();
            std::size_t s          = 1;
            std::size_t result     = 0;
            for(std::size_t j = 0; j < this->lens.size(); j++)
            {
                const std::size_t k      = rank - j - 1;
                const std::size_t stride = this->strides[k];
                const std::size_t len    = this->lens[k];
                const std::size_t slen   = s * len;
                const std::size_t idx    = (i % slen) / s;
                result += stride * idx;
                s = slen;
            }
            return result;
        }
    }

    MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(std::size_t idx) const
    {
        hip_index result;
        std::size_t tidx = idx;
        for(std::size_t is = 0; is < result.size(); is++)
        {
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
        }
        return result;
    }

    MIGRAPHX_DEVICE_CONSTEXPR hip_index carry(hip_index result) const
    {
        std::ptrdiff_t rem = 0;
        for(std::ptrdiff_t i = result.size() - 1; i >= 0; i--)
        {
            auto z = result[i] + rem;
            rem    = z - std::ptrdiff_t(lens[i]) + 1;
            if(rem > 0)
                z -= rem;
            else
                rem = 0;
            result[i] = z;
        }
        return result;
    }
};

template <std::size_t N>
hip_shape<N> make_hip_shape(const shape& x)
{
    return x;
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
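hip_shape::index(std::size_t) above maps an element ordinal to a memory offset for non-standard (for example transposed) layouts by peeling dimensions from the innermost outwards. A host-side sketch of the same arithmetic, checked against a transposed 2x3 layout; the lens/strides values are illustrative only.

#include <cassert>
#include <cstddef>
#include <vector>

// Map element ordinal i (in the shape's logical row-major order) to a
// memory offset, given per-dimension lens and strides.
std::size_t offset_of(std::size_t i,
                      const std::vector<std::size_t>& lens,
                      const std::vector<std::size_t>& strides)
{
    const std::size_t rank = lens.size();
    std::size_t s          = 1;
    std::size_t result     = 0;
    for(std::size_t j = 0; j < rank; j++)
    {
        const std::size_t k    = rank - j - 1;
        const std::size_t slen = s * lens[k];
        result += strides[k] * ((i % slen) / s);
        s = slen;
    }
    return result;
}

int main()
{
    // A 2x3 view over column-major storage: lens {2, 3}, strides {1, 2}.
    // Logical element (1, 2) is ordinal 5 and lives at offset 1*1 + 2*2 = 5;
    // logical element (0, 1) is ordinal 1 and lives at offset 2.
    std::vector<std::size_t> lens{2, 3}, strides{1, 2};
    assert(offset_of(5, lens, strides) == 5);
    assert(offset_of(1, lens, strides) == 2);
}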
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp

#ifndef MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP

#include <hip/hip_runtime.h>
#include <migraphx/functional.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/visit.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <std::size_t NDim>
using hip_tensor_index = hip_array<std::size_t, NDim>;

template <std::size_t NDim>
struct hip_tensor_descriptor
{
    __device__ __host__ hip_tensor_descriptor() = default;
...
@@ -63,12 +22,11 @@ struct hip_tensor_descriptor
        std::copy(s.strides().begin(), s.strides().end(), strides);
    }

    __device__ __host__ hip_tensor_index<NDim> multi(std::size_t idx) const
    {
        hip_tensor_index<NDim> result{};
        std::size_t tidx = idx;
        for(std::size_t is = 0; is < NDim; is++)
        {
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
...
@@ -76,16 +34,15 @@ struct hip_tensor_descriptor
        return result;
    }

    __device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
    {
        std::size_t idx = 0;
        for(std::size_t i = 0; i < NDim; i++)
            idx += s[i] * strides[i];
        return idx;
    }

    std::size_t lens[NDim]    = {};
    std::size_t strides[NDim] = {};
};

} // namespace device
src/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp (0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP

#include <migraphx/gpu/device/shape.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_tensor_view
{
    using value_type = T;
    using hip_index  = typename hip_shape<N>::hip_index;

    __device__ __host__ hip_tensor_view() = default;
    __host__ hip_tensor_view(tensor_view<T> x) : d(x.data()), s(x.get_shape()) {}
    __host__ hip_tensor_view(T* x, const shape& ss) : d(x), s(ss) {}

    MIGRAPHX_DEVICE_CONSTEXPR const hip_shape<N>& get_shape() const { return s; }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return s.elements(); }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return d; }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const
    {
        return d[s.index(i)];
    }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return d + size(); }

    private:
    value_type* d = nullptr;
    hip_shape<N> s{};
};

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(const shape& s, T* x)
{
    return {x, s};
}

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(tensor_view<T> x)
{
    return {x};
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp

@@ -8,14 +8,45 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP

#include <hip/hip_runtime.h>
#include <migraphx/half.hpp>
#include <migraphx/config.hpp>
#include <migraphx/tensor_view.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT

template <class T, std::size_t N>
using vec = T __attribute__((ext_vector_type(N)));

template <std::size_t N, class T>
__device__ __host__ T* as_pointer(vec<T, N>* x)
{
    return reinterpret_cast<T*>(x);
}

template <std::size_t N, class T>
__device__ __host__ vec<T, N>* as_vec(T* x)
{
    return reinterpret_cast<vec<T, N>*>(x);
}

template <std::size_t N, class T>
tensor_view<vec<T, N>> as_vec(tensor_view<T> x)
{
    return {x.get_shape(), as_vec<N>(x.data())};
}

template <std::size_t N, class... Ts>
auto pack_vec(Ts... xs)
{
    return [=](auto f, std::size_t n) { return f(as_vec<N>(xs)[n]...); };
}

using gpu_half = __fp16;

namespace detail {
...
@@ -25,6 +56,12 @@ struct device_type
    using type = T;
};

template <class T, std::size_t N>
struct device_type<vec<T, N>>
{
    using type = vec<typename device_type<T>::type, N>;
};

template <>
struct device_type<half>
{
...
@@ -38,7 +75,7 @@ struct host_type
};

template <>
struct host_type<gpu_half>
{
    using type = half;
};
...
@@ -54,7 +91,7 @@ using device_type = typename detail::device_type<T>::type;
template <class T>
host_type<T> host_cast(T x)
{
    return reinterpret_cast<const host_type<T>&>(x);
}

template <class T>
...
@@ -76,13 +113,19 @@ device_type<T>* device_cast(T* x)
}

template <class T>
tensor_view<device_type<T>> device_cast(tensor_view<T> x)
{
    return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
}

template <class T>
__device__ __host__ T to_hip_type(T x)
{
    return x;
}

// Hip doens't support __fp16
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }

} // namespace device
} // namespace gpu
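The vec alias and as_vec helpers above reinterpret a scalar buffer as hardware vector types so that nary kernels can touch several lanes per work item. The sketch below only models the grouping idea on the host with plain index arithmetic; it deliberately avoids the reinterpret_cast, and the group size is an assumption for illustration.

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // The device code views a scalar pointer as vec<T, 4>* so each thread
    // handles 4 lanes; here "one work item per group of 4 contiguous
    // elements" is modelled with index arithmetic only.
    std::vector<float> data(8, 1.0f);
    const std::size_t lanes = 4;

    for(std::size_t group = 0; group < data.size() / lanes; ++group)
        for(std::size_t l = 0; l < lanes; ++l)
            data[group * lanes + l] += 1.0f;

    for(float x : data)
        assert(x == 2.0f);
}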
src/targets/gpu/device/include/migraphx/gpu/device/vector.hpp (0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP

#include <migraphx/gpu/device/types.hpp>
#include <vector>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_vector
{
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default;
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector(std::size_t s) : len(s) {}

    template <class Iterator>
    __device__ __host__ hip_vector(Iterator start, Iterator last)
    {
        auto it = std::copy(start, last, d);
        len     = std::distance(d, it);
    }

    __device__ __host__ hip_vector(std::initializer_list<T> x)
    {
        std::copy(x.begin(), x.end(), d);
        len = x.size();
    }

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }

    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[size() - 1]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return len; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x)
    {
        d[len] = static_cast<U&&>(x);
        len++;
    }

    private:
    T d[N]          = {};
    std::size_t len = 0;
};

template <std::size_t N, class T>
hip_vector<T, N> to_hip_vector(const std::vector<T>& x)
{
    hip_vector<T, N> result(x.size());
    std::copy(x.begin(), x.end(), result.begin());
    return result;
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
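hip_vector is a fixed-capacity, trivially copyable stand-in for std::vector, so small lists such as index arrays can be captured by value into device lambdas. A host-side sketch of the same idea; the capacity is a template parameter and, as in the header above, staying within it is the caller's responsibility.

#include <cassert>
#include <cstddef>
#include <vector>

// Fixed-capacity vector: plain array plus length, no heap allocation,
// so it can be copied by value into a kernel capture.
template <class T, std::size_t N>
struct small_vector
{
    T d[N]          = {};
    std::size_t len = 0;

    void push_back(const T& x) { d[len++] = x; }
    std::size_t size() const { return len; }
    T& operator[](std::size_t i) { return d[i]; }
    const T& operator[](std::size_t i) const { return d[i]; }
};

template <std::size_t N, class T>
small_vector<T, N> from_std(const std::vector<T>& x)
{
    small_vector<T, N> result;
    for(const auto& v : x)
        result.push_back(v);
    return result;
}

int main()
{
    auto v = from_std<4>(std::vector<int>{7, 8, 9});
    assert(v.size() == 3 and v[0] == 7 and v[2] == 9);
}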