gaoqiong / MIGraphX

Commit 992f57ba
Authored Apr 11, 2022 by Shucai Xiao

backup changes

Parent: 4f07b8f1

Showing 4 changed files with 39 additions and 39 deletions
src/auto_contiguous.cpp                 +18 -18
src/include/migraphx/op/reduce_op.hpp    +1  -1
src/simplify_reshapes.cpp               +11 -11
src/targets/gpu/device/softmax.cpp       +9  -9
src/auto_contiguous.cpp
@@ -47,25 +47,25 @@ void auto_contiguous::apply(module& p) const
         }
     }
-    // if ops used as output param are alias 0, add a contiguous for the output
-    // so return outputs with standard shape
-    if(last->name() == "@return")
-    {
-        auto inputs = last->inputs();
-        for(auto ins : inputs)
-        {
-            if(ins->name() == "contiguous")
-                continue;
-            auto ins_alias = ins->get_operator().output_alias({});
-            if(ins_alias == 0 and ins->get_shape().element_space() !=
-                                      ins->inputs().front()->get_shape().element_space())
-            {
-                auto cont_ins = p.insert_instruction(last, make_op("contiguous"), ins);
-                p.replace_instruction(ins, cont_ins);
-            }
-        }
-    }
+    // // if ops used as output param are alias 0, add a contiguous for the output
+    // // so return outputs with standard shape
+    // if(last->name() == "@return")
+    // {
+    //     auto inputs = last->inputs();
+    //     for(auto ins : inputs)
+    //     {
+    //         if(ins->name() == "contiguous")
+    //             continue;
+    //         auto ins_alias = ins->get_operator().output_alias({});
+    //         if(ins_alias == 0 and ins->get_shape().element_space() !=
+    //                                   ins->inputs().front()->get_shape().element_space())
+    //         {
+    //             auto cont_ins = p.insert_instruction(last, make_op("contiguous"), ins);
+    //             p.replace_instruction(ins, cont_ins);
+    //         }
+    //     }
+    // }
 }
 } // namespace MIGRAPHX_INLINE_NS
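For context on the block disabled above: element_space() is the number of buffer elements a strided view can address, which can differ from the logical element count; the check compares the element space of the returned instruction with that of its input and, when they differ, the (now commented-out) pass copied through a "contiguous" op so the output gets a standard layout. Below is a minimal standalone sketch of that distinction; the elements and element_space helpers are written here for illustration only and are not the MIGraphX shape API.

// Standalone sketch: logical element count vs. addressable buffer size for a
// strided (lens, strides) view. Helpers are illustrative, not MIGraphX code.
#include <cstddef>
#include <iostream>
#include <vector>

std::size_t elements(const std::vector<std::size_t>& lens)
{
    std::size_t n = 1;
    for(auto l : lens)
        n *= l;
    return n;
}

std::size_t element_space(const std::vector<std::size_t>& lens,
                          const std::vector<std::size_t>& strides)
{
    // highest addressable linear index + 1
    std::size_t last = 0;
    for(std::size_t i = 0; i < lens.size(); ++i)
        last += (lens[i] - 1) * strides[i];
    return last + 1;
}

int main()
{
    // a {2, 3} view broadcast along axis 0 has strides {0, 1}
    std::vector<std::size_t> lens{2, 3};
    std::vector<std::size_t> strides{0, 1};
    std::cout << elements(lens) << " logical elements, "      // prints 6
              << element_space(lens, strides) << " stored\n"; // prints 3
}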
src/include/migraphx/op/reduce_op.hpp
@@ -66,7 +66,7 @@ struct reduce_op : op_name<Derived>
     {
         value normalize;
         normalize["axes"] = value::array{normalize_attribute::include_min};
-        return {{"normalize_axes", normalize}, {"std_shape", true}};
+        return {{"normalize_axes", normalize}};
     }
     std::vector<int64_t> tune_axes(std::size_t n_dim) const
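For context, the normalize_axes attribute kept above (built with normalize_attribute::include_min) is what lets the reduce operators accept negative axis values and have them mapped into the valid range before use; the change only drops the extra std_shape attribute. A minimal standalone sketch of what that axis normalization amounts to follows; the helper below is illustrative only and is not MIGraphX's normalizer.

// Illustrative sketch of axis normalization for a reduce over n_dim dimensions:
// a negative axis counts from the end, so the accepted range is [-n_dim, n_dim - 1]
// and the normalized result lies in [0, n_dim - 1]. Not the MIGraphX implementation.
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<std::int64_t> normalize_axes(std::vector<std::int64_t> axes, std::int64_t n_dim)
{
    for(auto& axis : axes)
    {
        if(axis < -n_dim or axis >= n_dim)
            throw std::out_of_range("reduce axis out of range");
        if(axis < 0)
            axis += n_dim; // e.g. -1 becomes n_dim - 1
    }
    return axes;
}

// usage: normalize_axes({-1, 0}, 4) returns {3, 0}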
src/simplify_reshapes.cpp
@@ -120,17 +120,17 @@ struct find_nop_reshapes
     void apply(module& p, const match::matcher_result& mr) const
     {
         auto ins = mr.result;
-        // output of reshape and contiguous is standard, so no need to add another contiguous
-        // if the output is used an a ret value
-        if(ins->name() == "contiguous" and ins->name() != "contiguous" and ins->name() != "reshape")
-        {
-            auto& outputs = ins->outputs();
-            if(std::any_of(
-                   outputs.begin(), outputs.end(), [&](auto o) { return o->name() == "@return"; }))
-            {
-                return;
-            }
-        }
+        // // output of reshape and contiguous is standard, so no need to add another contiguous
+        // // if the output is used an a ret value
+        // if(ins->name() == "contiguous" and ins->name() != "contiguous" and ins->name() != "reshape")
+        // {
+        //     auto& outputs = ins->outputs();
+        //     if(std::any_of(
+        //         outputs.begin(), outputs.end(), [&](auto o) { return o->name() == "@return"; }))
+        //     {
+        //         return;
+        //     }
+        // }
         p.replace_instruction(ins, ins->inputs().front());
     }
 };
src/targets/gpu/device/softmax.cpp
@@ -36,7 +36,7 @@ struct half2_max
 // in_data is in shared memory
 template <class Op>
 __device__ __half2
-block_reduce(__half2* buffer, index_int batch_item_num, index_int tid, index_int block_size, Op op)
+block_reduce_half2(__half2* buffer, index_int batch_item_num, index_int tid, index_int block_size, Op op)
 {
     __syncthreads();
     for(index_int s = block_size; s > 0; s >>= 1)
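For context, the block_reduce_half2 / block_reduce_half helpers renamed in this file follow the usual shared-memory tree-reduction pattern: every round halves the stride and combines pairs of elements until one value per block remains. A simplified float sketch of that pattern is shown below; the names are local to the sketch, it assumes batch_item_num <= block_size with a power-of-two block size, and it is not the MIGraphX implementation.

// Simplified sketch of a shared-memory tree reduction (illustrative only).
// 'buffer' points to shared memory holding batch_item_num valid entries;
// each iteration folds buffer[tid + s] into buffer[tid] and halves s.
struct float_max
{
    __device__ float operator()(float a, float b) const { return a > b ? a : b; }
};

template <class Op>
__device__ float block_reduce_sketch(
    float* buffer, unsigned int batch_item_num, unsigned int tid, unsigned int block_size, Op op)
{
    __syncthreads();
    for(unsigned int s = block_size / 2; s > 0; s >>= 1)
    {
        // only combine entries that exist for this batch item
        if(tid < s and tid + s < batch_item_num)
            buffer[tid] = op(buffer[tid], buffer[tid + s]);
        __syncthreads();
    }
    return buffer[0]; // every thread reads the reduced value
}

// usage inside a kernel:
// auto m = block_reduce_sketch(buf, n, threadIdx.x, blockDim.x, float_max{});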
@@ -55,7 +55,7 @@ block_reduce(__half2* buffer, index_int batch_item_num, index_int tid, index_int
 }
 __global__ void
-softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
+softmax_kernel_half2(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
 {
     __half2* input  = reinterpret_cast<__half2*>(data_in);
     __half2* output = reinterpret_cast<__half2*>(data_out);
@@ -73,7 +73,7 @@ softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, vo
     }
     auto batch_max =
-        block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_max{});
+        block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_max{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
@@ -82,7 +82,7 @@ softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, vo
     }
     auto batch_sum =
-        block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
+        block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
@@ -93,7 +93,7 @@ softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, vo
 // in_data is in shared memory
 template <class Op>
 __device__ __half
-block_reduce2(__half* data, index_int batch_item_num, index_int tid, index_int block_size, Op op)
+block_reduce_half(__half* data, index_int batch_item_num, index_int tid, index_int block_size, Op op)
 {
     __syncthreads();
     for(index_int s = block_size / 2; s > 0; s >>= 1)
@@ -109,7 +109,7 @@ block_reduce2(__half* data, index_int batch_item_num, index_int tid, index_int b
 }
 __global__ void
-softmax_kernel2(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
+softmax_kernel_half(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
 {
     __half* input  = reinterpret_cast<__half*>(data_in);
     __half* output = reinterpret_cast<__half*>(data_out);
@@ -125,14 +125,14 @@ softmax_kernel2(void* data_in, index_int batch_item_num, index_int block_size, v
         in_data_reduce[i] = d;
     }
-    auto batch_max = block_reduce2(in_data_reduce, batch_item_num, threadIdx.x, block_size, max{});
+    auto batch_max = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, max{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
         in_data[i]        = __float2half(::exp(__half2float(in_data[i]) - __half2float(batch_max)));
         in_data_reduce[i] = in_data[i];
     }
-    auto batch_sum = block_reduce2(in_data_reduce, batch_item_num, threadIdx.x, block_size, sum{});
+    auto batch_sum = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, sum{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
         output[i + start] = __float2half(__half2float(in_data[i]) / __half2float(batch_sum));
@@ -161,7 +161,7 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
         int block_num    = batch_shape.elements();
         int shared_size  = batch_item_num * 2 * result.get_shape().type_size();
         half2_block_size = half2_block_size / 4;
-        softmax_kernel<<<block_num, half2_block_size, shared_size, stream>>>(
+        softmax_kernel_half2<<<block_num, half2_block_size, shared_size, stream>>>(
             arg.data(), batch_item_num, half2_block_size, result.data());
     }
     else
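For context, the call site above uses the triple-chevron launch syntax kernel<<<grid_dim, block_dim, shared_bytes, stream>>>(args...): the third parameter sizes the dynamic shared memory the kernel sees through extern __shared__, and the fourth selects the HIP stream. A minimal self-contained sketch of that launch pattern follows; the kernel and names are invented for illustration and are not the MIGraphX softmax.

// Illustrative HIP launch-configuration sketch (not MIGraphX code).
#include <hip/hip_runtime.h>

__global__ void scale_kernel(const float* in, float* out, int n, float factor)
{
    // sized by the third <<<...>>> launch parameter, in bytes
    extern __shared__ float buffer[];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
    {
        buffer[threadIdx.x] = in[i] * factor;
        out[i]              = buffer[threadIdx.x];
    }
}

void launch_scale(hipStream_t stream, const float* in, float* out, int n)
{
    int block_size  = 256;
    int block_num   = (n + block_size - 1) / block_size;
    int shared_size = block_size * sizeof(float); // dynamic shared memory in bytes
    scale_kernel<<<block_num, block_size, shared_size, stream>>>(in, out, n, 2.0f);
}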