Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
5a9bb616
"host/host_tensor/include/host_conv.hpp" did not exist on "71d6b19d18e267bb6b8e04711bc37e241aaed55e"
Commit
5a9bb616
authored
Jun 25, 2019
by
Shucai Xiao
Browse files
add std namespace for size_t
parents
f8fa90bd
22500e6c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
76 additions
and
65 deletions
+76
-65
src/targets/cpu/lowering.cpp
src/targets/cpu/lowering.cpp
+20
-20
src/targets/gpu/device/logsoftmax.cpp
src/targets/gpu/device/logsoftmax.cpp
+12
-11
src/targets/gpu/device/softmax.cpp
src/targets/gpu/device/softmax.cpp
+16
-15
src/targets/gpu/include/migraphx/gpu/device/reduce_opers.hpp
src/targets/gpu/include/migraphx/gpu/device/reduce_opers.hpp
+28
-19
No files found.
src/targets/cpu/lowering.cpp
View file @
5a9bb616
...
@@ -533,7 +533,7 @@ struct cpu_softmax
...
@@ -533,7 +533,7 @@ struct cpu_softmax
{
{
argument
result
{
output_shape
};
argument
result
{
output_shape
};
auto
batch_lens
=
output_shape
.
lens
();
auto
batch_lens
=
output_shape
.
lens
();
size_t
n_dims
=
batch_lens
[
op
.
axis
];
std
::
size_t
n_dims
=
batch_lens
[
op
.
axis
];
batch_lens
[
op
.
axis
]
=
1
;
batch_lens
[
op
.
axis
]
=
1
;
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
...
@@ -544,26 +544,26 @@ struct cpu_softmax
...
@@ -544,26 +544,26 @@ struct cpu_softmax
std
::
vector
<
value_type
>
batch_sum
(
batch_shape
.
elements
(),
value_type
(
0
));
std
::
vector
<
value_type
>
batch_sum
(
batch_shape
.
elements
(),
value_type
(
0
));
par_for
(
batch_shape
.
elements
(),
[
&
](
auto
i
)
{
par_for
(
batch_shape
.
elements
(),
[
&
](
auto
i
)
{
auto
idx
=
batch_shape
.
multi
(
i
);
auto
idx
=
batch_shape
.
multi
(
i
);
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
batch_max
[
i
]
=
std
::
max
(
batch_max
[
i
],
input
(
idx
.
begin
(),
idx
.
end
()));
batch_max
[
i
]
=
std
::
max
(
batch_max
[
i
],
input
(
idx
.
begin
(),
idx
.
end
()));
}
}
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
size_t
index
=
output_shape
.
index
(
idx
);
std
::
size_t
index
=
output_shape
.
index
(
idx
);
output
[
index
]
=
std
::
exp
(
input
[
index
]
-
batch_max
[
i
]);
output
[
index
]
=
std
::
exp
(
input
[
index
]
-
batch_max
[
i
]);
}
}
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
batch_sum
[
i
]
+=
output
(
idx
.
begin
(),
idx
.
end
());
batch_sum
[
i
]
+=
output
(
idx
.
begin
(),
idx
.
end
());
}
}
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
output
(
idx
.
begin
(),
idx
.
end
())
/=
batch_sum
[
i
];
output
(
idx
.
begin
(),
idx
.
end
())
/=
batch_sum
[
i
];
...
@@ -591,7 +591,7 @@ struct cpu_logsoftmax
...
@@ -591,7 +591,7 @@ struct cpu_logsoftmax
{
{
argument
result
{
output_shape
};
argument
result
{
output_shape
};
auto
batch_lens
=
output_shape
.
lens
();
auto
batch_lens
=
output_shape
.
lens
();
size_t
n_dims
=
batch_lens
[
op
.
axis
];
std
::
size_t
n_dims
=
batch_lens
[
op
.
axis
];
batch_lens
[
op
.
axis
]
=
1
;
batch_lens
[
op
.
axis
]
=
1
;
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
...
@@ -605,20 +605,20 @@ struct cpu_logsoftmax
...
@@ -605,20 +605,20 @@ struct cpu_logsoftmax
par_for
(
batch_shape
.
elements
(),
[
&
](
auto
i
)
{
par_for
(
batch_shape
.
elements
(),
[
&
](
auto
i
)
{
auto
idx
=
batch_shape
.
multi
(
i
);
auto
idx
=
batch_shape
.
multi
(
i
);
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
batch_max
[
i
]
=
std
::
max
(
batch_max
[
i
],
input
(
idx
.
begin
(),
idx
.
end
()));
batch_max
[
i
]
=
std
::
max
(
batch_max
[
i
],
input
(
idx
.
begin
(),
idx
.
end
()));
}
}
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
size_t
index
=
output_shape
.
index
(
idx
);
std
::
size_t
index
=
output_shape
.
index
(
idx
);
output
[
index
]
=
input
[
index
]
-
batch_max
[
i
];
output
[
index
]
=
input
[
index
]
-
batch_max
[
i
];
}
}
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
batch_sum
[
i
]
+=
std
::
exp
(
output
(
idx
.
begin
(),
idx
.
end
()));
batch_sum
[
i
]
+=
std
::
exp
(
output
(
idx
.
begin
(),
idx
.
end
()));
...
@@ -626,7 +626,7 @@ struct cpu_logsoftmax
...
@@ -626,7 +626,7 @@ struct cpu_logsoftmax
batch_sum
[
i
]
=
std
::
log
(
batch_sum
[
i
]);
batch_sum
[
i
]
=
std
::
log
(
batch_sum
[
i
]);
for
(
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
for
(
std
::
size_t
j
=
0
;
j
<
n_dims
;
++
j
)
{
{
idx
[
op
.
axis
]
=
j
;
idx
[
op
.
axis
]
=
j
;
output
(
idx
.
begin
(),
idx
.
end
())
-=
batch_sum
[
i
];
output
(
idx
.
begin
(),
idx
.
end
())
-=
batch_sum
[
i
];
...
@@ -655,7 +655,7 @@ struct cpu_argmax
...
@@ -655,7 +655,7 @@ struct cpu_argmax
{
{
argument
result
{
output_shape
};
argument
result
{
output_shape
};
auto
batch_lens
=
args
.
front
().
get_shape
().
lens
();
auto
batch_lens
=
args
.
front
().
get_shape
().
lens
();
size_t
batch_item_num
=
batch_lens
[
op
.
axis
];
std
::
size_t
batch_item_num
=
batch_lens
[
op
.
axis
];
batch_lens
[
op
.
axis
]
=
1
;
batch_lens
[
op
.
axis
]
=
1
;
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
...
@@ -665,7 +665,7 @@ struct cpu_argmax
...
@@ -665,7 +665,7 @@ struct cpu_argmax
auto
data_idx
=
batch_shape
.
multi
(
i
);
auto
data_idx
=
batch_shape
.
multi
(
i
);
auto
max_val
=
input
[
i
];
auto
max_val
=
input
[
i
];
int64_t
max_index
=
0
;
int64_t
max_index
=
0
;
for
(
size_t
j
=
1
;
j
<
batch_item_num
;
++
j
)
for
(
std
::
size_t
j
=
1
;
j
<
batch_item_num
;
++
j
)
{
{
data_idx
[
op
.
axis
]
=
j
;
data_idx
[
op
.
axis
]
=
j
;
if
(
max_val
<
input
(
data_idx
.
begin
(),
data_idx
.
end
()))
if
(
max_val
<
input
(
data_idx
.
begin
(),
data_idx
.
end
()))
...
@@ -701,7 +701,7 @@ struct cpu_argmin
...
@@ -701,7 +701,7 @@ struct cpu_argmin
{
{
argument
result
{
output_shape
};
argument
result
{
output_shape
};
auto
batch_lens
=
args
.
front
().
get_shape
().
lens
();
auto
batch_lens
=
args
.
front
().
get_shape
().
lens
();
size_t
batch_item_num
=
batch_lens
[
op
.
axis
];
std
::
size_t
batch_item_num
=
batch_lens
[
op
.
axis
];
batch_lens
[
op
.
axis
]
=
1
;
batch_lens
[
op
.
axis
]
=
1
;
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
shape
batch_shape
{
shape
::
int32_type
,
batch_lens
};
...
@@ -711,7 +711,7 @@ struct cpu_argmin
...
@@ -711,7 +711,7 @@ struct cpu_argmin
auto
data_idx
=
batch_shape
.
multi
(
i
);
auto
data_idx
=
batch_shape
.
multi
(
i
);
auto
min_val
=
input
[
i
];
auto
min_val
=
input
[
i
];
int64_t
min_index
=
0
;
int64_t
min_index
=
0
;
for
(
size_t
j
=
1
;
j
<
batch_item_num
;
++
j
)
for
(
std
::
size_t
j
=
1
;
j
<
batch_item_num
;
++
j
)
{
{
data_idx
[
op
.
axis
]
=
j
;
data_idx
[
op
.
axis
]
=
j
;
if
(
min_val
>
input
(
data_idx
.
begin
(),
data_idx
.
end
()))
if
(
min_val
>
input
(
data_idx
.
begin
(),
data_idx
.
end
()))
...
...
src/targets/gpu/device/logsoftmax.cpp
View file @
5a9bb616
...
@@ -23,26 +23,27 @@ void logsoftmax(hipStream_t stream, const argument& result, const argument& arg,
...
@@ -23,26 +23,27 @@ void logsoftmax(hipStream_t stream, const argument& result, const argument& arg,
hip_visit_all
(
result
,
arg
,
batch_shape
)([
&
](
auto
output
,
auto
input
,
auto
batch
)
{
hip_visit_all
(
result
,
arg
,
batch_shape
)([
&
](
auto
output
,
auto
input
,
auto
batch
)
{
// use one block for items in one batch.
// use one block for items in one batch.
const
size_t
max_block_size
=
1024
;
const
std
::
size_t
max_block_size
=
1024
;
size_t
block_size
=
1
;
std
::
size_t
block_size
=
1
;
while
(
block_size
<
max_block_size
and
block_size
<
batch_item_num
)
while
(
block_size
<
max_block_size
and
block_size
<
batch_item_num
)
{
{
block_size
*=
2
;
block_size
*=
2
;
}
}
launch
(
stream
,
batch_shape
.
elements
()
*
block_size
,
block_size
)([
=
](
auto
idx
)
__device__
{
launch
(
stream
,
batch_shape
.
elements
()
*
block_size
,
block_size
)([
=
](
auto
idx
)
__device__
{
size_t
thr_idx
=
idx
.
local
;
std
::
size_t
thr_idx
=
idx
.
local
;
size_t
blk_idx
=
idx
.
group
;
std
::
size_t
blk_idx
=
idx
.
group
;
using
type
=
device_type
<
std
::
remove_cv_t
<
typename
decltype
(
output
)
::
value_type
>>
;
using
type
=
device_type
<
std
::
remove_cv_t
<
typename
decltype
(
output
)
::
value_type
>>
;
MIGRAPHX_DEVICE_SHARED
type
lds_data
[
max_block_size
+
1
];
MIGRAPHX_DEVICE_SHARED
type
lds_data
[
max_block_size
+
1
];
auto
batch_idx
=
batch
.
multi
(
blk_idx
);
auto
batch_idx
=
batch
.
multi
(
blk_idx
);
auto
data_idx
=
batch_idx
;
auto
data_idx
=
batch_idx
;
// load data to lds and compute the batch max
// load data to lds and compute the batch max
size_t
remaining_item_num
=
batch_item_num
;
std
::
size_t
remaining_item_num
=
batch_item_num
;
size_t
round_item_num
=
(
batch_item_num
+
block_size
-
1
)
/
block_size
*
block_size
;
std
::
size_t
round_item_num
=
lds_data
[
max_block_size
]
=
input
[
0
];
(
batch_item_num
+
block_size
-
1
)
/
block_size
*
block_size
;
for
(
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
lds_data
[
max_block_size
]
=
input
[
0
];
for
(
std
::
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
{
{
if
(
i
<
batch_item_num
)
if
(
i
<
batch_item_num
)
{
{
...
@@ -62,7 +63,7 @@ void logsoftmax(hipStream_t stream, const argument& result, const argument& arg,
...
@@ -62,7 +63,7 @@ void logsoftmax(hipStream_t stream, const argument& result, const argument& arg,
lds_data
[
max_block_size
]
=
0
;
lds_data
[
max_block_size
]
=
0
;
remaining_item_num
=
batch_item_num
;
remaining_item_num
=
batch_item_num
;
for
(
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
{
{
if
(
i
<
batch_item_num
)
if
(
i
<
batch_item_num
)
{
{
...
@@ -81,7 +82,7 @@ void logsoftmax(hipStream_t stream, const argument& result, const argument& arg,
...
@@ -81,7 +82,7 @@ void logsoftmax(hipStream_t stream, const argument& result, const argument& arg,
auto
log_batch_sum
=
::
log
(
to_hip_type
(
lds_data
[
max_block_size
]))
+
batch_max
;
auto
log_batch_sum
=
::
log
(
to_hip_type
(
lds_data
[
max_block_size
]))
+
batch_max
;
for
(
size_t
i
=
thr_idx
;
i
<
batch_item_num
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
batch_item_num
;
i
+=
block_size
)
{
{
data_idx
[
axis
]
=
i
;
data_idx
[
axis
]
=
i
;
output
[
data_idx
]
=
input
[
data_idx
]
-
log_batch_sum
;
output
[
data_idx
]
=
input
[
data_idx
]
-
log_batch_sum
;
...
...
src/targets/gpu/device/softmax.cpp
View file @
5a9bb616
...
@@ -15,34 +15,35 @@ namespace device {
...
@@ -15,34 +15,35 @@ namespace device {
void
softmax
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
,
int
axis
)
void
softmax
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
,
int
axis
)
{
{
auto
lens
=
result
.
get_shape
().
lens
();
auto
lens
=
result
.
get_shape
().
lens
();
auto
batch_lens
=
lens
;
auto
batch_lens
=
lens
;
size_t
batch_item_num
=
lens
[
axis
];
std
::
size_t
batch_item_num
=
lens
[
axis
];
batch_lens
[
axis
]
=
1
;
batch_lens
[
axis
]
=
1
;
migraphx
::
shape
batch_shape
{
result
.
get_shape
().
type
(),
batch_lens
};
migraphx
::
shape
batch_shape
{
result
.
get_shape
().
type
(),
batch_lens
};
hip_visit_all
(
result
,
arg
,
batch_shape
)([
&
](
auto
output
,
auto
input
,
auto
batch
)
{
hip_visit_all
(
result
,
arg
,
batch_shape
)([
&
](
auto
output
,
auto
input
,
auto
batch
)
{
// use one block for items in one batch.
// use one block for items in one batch.
const
size_t
max_block_size
=
1024
;
const
std
::
size_t
max_block_size
=
1024
;
size_t
block_size
=
1
;
std
::
size_t
block_size
=
1
;
while
(
block_size
<
max_block_size
and
block_size
<
batch_item_num
)
while
(
block_size
<
max_block_size
and
block_size
<
batch_item_num
)
{
{
block_size
*=
2
;
block_size
*=
2
;
}
}
launch
(
stream
,
batch_shape
.
elements
()
*
block_size
,
block_size
)([
=
](
auto
idx
)
__device__
{
launch
(
stream
,
batch_shape
.
elements
()
*
block_size
,
block_size
)([
=
](
auto
idx
)
__device__
{
size_t
thr_idx
=
idx
.
local
;
std
::
size_t
thr_idx
=
idx
.
local
;
size_t
blk_idx
=
idx
.
group
;
std
::
size_t
blk_idx
=
idx
.
group
;
using
type
=
device_type
<
std
::
remove_cv_t
<
typename
decltype
(
output
)
::
value_type
>>
;
using
type
=
device_type
<
std
::
remove_cv_t
<
typename
decltype
(
output
)
::
value_type
>>
;
MIGRAPHX_DEVICE_SHARED
type
lds_data
[
max_block_size
+
1
];
MIGRAPHX_DEVICE_SHARED
type
lds_data
[
max_block_size
+
1
];
auto
batch_idx
=
batch
.
multi
(
blk_idx
);
auto
batch_idx
=
batch
.
multi
(
blk_idx
);
auto
data_idx
=
batch_idx
;
auto
data_idx
=
batch_idx
;
// load data to lds and compute the batch max
// load data to lds and compute the batch max
size_t
remaining_item_num
=
batch_item_num
;
std
::
size_t
remaining_item_num
=
batch_item_num
;
size_t
round_item_num
=
(
batch_item_num
+
block_size
-
1
)
/
block_size
*
block_size
;
std
::
size_t
round_item_num
=
lds_data
[
max_block_size
]
=
input
[
0
];
(
batch_item_num
+
block_size
-
1
)
/
block_size
*
block_size
;
for
(
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
lds_data
[
max_block_size
]
=
input
[
0
];
for
(
std
::
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
{
{
if
(
i
<
batch_item_num
)
if
(
i
<
batch_item_num
)
{
{
...
@@ -63,7 +64,7 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
...
@@ -63,7 +64,7 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
lds_data
[
max_block_size
]
=
0
;
lds_data
[
max_block_size
]
=
0
;
remaining_item_num
=
batch_item_num
;
remaining_item_num
=
batch_item_num
;
for
(
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
round_item_num
;
i
+=
block_size
)
{
{
if
(
i
<
batch_item_num
)
if
(
i
<
batch_item_num
)
{
{
...
@@ -81,7 +82,7 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
...
@@ -81,7 +82,7 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
}
}
auto
batch_sum
=
lds_data
[
max_block_size
];
auto
batch_sum
=
lds_data
[
max_block_size
];
for
(
size_t
i
=
thr_idx
;
i
<
batch_item_num
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
batch_item_num
;
i
+=
block_size
)
{
{
data_idx
[
axis
]
=
i
;
data_idx
[
axis
]
=
i
;
auto
val
=
input
[
data_idx
]
-
batch_max
;
auto
val
=
input
[
data_idx
]
-
batch_max
;
...
...
src/targets/gpu/include/migraphx/gpu/device/reduce_opers.hpp
View file @
5a9bb616
...
@@ -11,14 +11,17 @@ namespace gpu {
...
@@ -11,14 +11,17 @@ namespace gpu {
namespace
device
{
namespace
device
{
template
<
class
T
>
template
<
class
T
>
inline
__device__
void
inline
__device__
void
reduce_max
(
T
*
data_ptr
,
reduce_max
(
T
*
data_ptr
,
size_t
block_size
,
size_t
thr_idx
,
size_t
item_num
,
size_t
max_index
)
std
::
size_t
block_size
,
std
::
size_t
thr_idx
,
std
::
size_t
item_num
,
std
::
size_t
max_index
)
{
{
while
(
true
)
while
(
true
)
{
{
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
size
=
item_num
/
2
;
auto
size
=
item_num
/
2
;
for
(
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
{
{
data_ptr
[
i
]
=
::
max
(
to_hip_type
(
data_ptr
[
i
]),
to_hip_type
(
data_ptr
[
i
+
stride
]));
data_ptr
[
i
]
=
::
max
(
to_hip_type
(
data_ptr
[
i
]),
to_hip_type
(
data_ptr
[
i
+
stride
]));
}
}
...
@@ -39,14 +42,17 @@ reduce_max(T* data_ptr, size_t block_size, size_t thr_idx, size_t item_num, size
...
@@ -39,14 +42,17 @@ reduce_max(T* data_ptr, size_t block_size, size_t thr_idx, size_t item_num, size
}
}
template
<
class
T
>
template
<
class
T
>
inline
__device__
void
inline
__device__
void
reduce_min
(
T
*
data_ptr
,
reduce_min
(
T
*
data_ptr
,
size_t
block_size
,
size_t
thr_idx
,
size_t
item_num
,
size_t
min_index
)
std
::
size_t
block_size
,
std
::
size_t
thr_idx
,
std
::
size_t
item_num
,
std
::
size_t
min_index
)
{
{
while
(
true
)
while
(
true
)
{
{
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
size
=
item_num
/
2
;
auto
size
=
item_num
/
2
;
for
(
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
{
{
data_ptr
[
i
]
=
::
min
(
to_hip_type
(
data_ptr
[
i
]),
to_hip_type
(
data_ptr
[
i
+
stride
]));
data_ptr
[
i
]
=
::
min
(
to_hip_type
(
data_ptr
[
i
]),
to_hip_type
(
data_ptr
[
i
+
stride
]));
}
}
...
@@ -69,16 +75,16 @@ reduce_min(T* data_ptr, size_t block_size, size_t thr_idx, size_t item_num, size
...
@@ -69,16 +75,16 @@ reduce_min(T* data_ptr, size_t block_size, size_t thr_idx, size_t item_num, size
template
<
class
T
>
template
<
class
T
>
inline
__device__
void
reduce_argmax
(
T
*
data_ptr
,
inline
__device__
void
reduce_argmax
(
T
*
data_ptr
,
int64_t
*
index_ptr
,
int64_t
*
index_ptr
,
size_t
block_size
,
std
::
size_t
block_size
,
size_t
thr_idx
,
std
::
size_t
thr_idx
,
size_t
item_num
,
std
::
size_t
item_num
,
size_t
max_index
)
std
::
size_t
max_index
)
{
{
while
(
true
)
while
(
true
)
{
{
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
size
=
item_num
/
2
;
auto
size
=
item_num
/
2
;
for
(
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
{
{
if
(
data_ptr
[
i
]
<
data_ptr
[
i
+
stride
])
if
(
data_ptr
[
i
]
<
data_ptr
[
i
+
stride
])
{
{
...
@@ -108,16 +114,16 @@ inline __device__ void reduce_argmax(T* data_ptr,
...
@@ -108,16 +114,16 @@ inline __device__ void reduce_argmax(T* data_ptr,
template
<
class
T
>
template
<
class
T
>
inline
__device__
void
reduce_argmin
(
T
*
data_ptr
,
inline
__device__
void
reduce_argmin
(
T
*
data_ptr
,
int64_t
*
index_ptr
,
int64_t
*
index_ptr
,
size_t
block_size
,
std
::
size_t
block_size
,
size_t
thr_idx
,
std
::
size_t
thr_idx
,
size_t
item_num
,
std
::
size_t
item_num
,
size_t
min_index
)
std
::
size_t
min_index
)
{
{
while
(
true
)
while
(
true
)
{
{
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
size
=
item_num
/
2
;
auto
size
=
item_num
/
2
;
for
(
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
{
{
if
(
data_ptr
[
i
]
>
data_ptr
[
i
+
stride
])
if
(
data_ptr
[
i
]
>
data_ptr
[
i
+
stride
])
{
{
...
@@ -145,14 +151,17 @@ inline __device__ void reduce_argmin(T* data_ptr,
...
@@ -145,14 +151,17 @@ inline __device__ void reduce_argmin(T* data_ptr,
}
}
template
<
class
T
>
template
<
class
T
>
inline
__device__
void
inline
__device__
void
reduce_sum
(
T
*
data_ptr
,
reduce_sum
(
T
*
data_ptr
,
size_t
block_size
,
size_t
thr_idx
,
size_t
item_num
,
size_t
sum_index
)
std
::
size_t
block_size
,
std
::
size_t
thr_idx
,
std
::
size_t
item_num
,
std
::
size_t
sum_index
)
{
{
while
(
true
)
while
(
true
)
{
{
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
stride
=
(
item_num
+
1
)
/
2
;
auto
size
=
item_num
/
2
;
auto
size
=
item_num
/
2
;
for
(
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
for
(
std
::
size_t
i
=
thr_idx
;
i
<
size
;
i
+=
block_size
)
{
{
data_ptr
[
i
]
+=
data_ptr
[
i
+
stride
];
data_ptr
[
i
]
+=
data_ptr
[
i
+
stride
];
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment