gaoqiong / MIGraphX / Commits

Commit 94e3a2e4, authored Feb 12, 2022 by Shucai Xiao
Parent: 26bd92d8

    change size_t to int

Showing 20 changed files with 140 additions and 140 deletions (+140 −140). The commit touches 256 files in total; this is the first of 13 pages of diffs.
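The commit message gives no rationale, but the diff itself hints at one: pointwise.hpp's find_packed_len returns -1 as a "not found" sentinel, and its return type was previously std::size_t, where -1 silently wraps to SIZE_MAX. A minimal standalone sketch of that pitfall (illustration only, not MIGraphX code):

    #include <cstddef>
    #include <iostream>

    // A -1 sentinel wraps to SIZE_MAX through an unsigned return type,
    // so callers comparing against signed values misbehave.
    std::size_t find_unsigned() { return -1; }
    int find_signed() { return -1; }

    int main()
    {
        std::cout << find_unsigned() << '\n'; // 18446744073709551615 on 64-bit
        std::cout << find_signed() << '\n';   // -1
    }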
Files on this page:

    src/targets/cpu/include/migraphx/cpu/dnnl.hpp                       +5   -5
    src/targets/cpu/include/migraphx/cpu/parallel.hpp                   +14  -14
    src/targets/cpu/include/migraphx/cpu/pointwise.hpp                  +29  -29
    src/targets/cpu/lowering.cpp                                        +18  -18
    src/targets/cpu/pooling.cpp                                         +8   -8
    src/targets/gpu/analyze_streams.cpp                                 +8   -8
    src/targets/gpu/batch_norm_inference.cpp                            +2   -2
    src/targets/gpu/compile_hip.cpp                                     +7   -7
    src/targets/gpu/compile_hip_code_object.cpp                         +2   -2
    src/targets/gpu/compile_ops.cpp                                     +1   -1
    src/targets/gpu/concat.cpp                                          +1   -1
    src/targets/gpu/convolution.cpp                                     +4   -4
    src/targets/gpu/deconvolution.cpp                                   +3   -3
    src/targets/gpu/device/concat.cpp                                   +2   -2
    src/targets/gpu/device/gather.cpp                                   +1   -1
    src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp         +10  -10
    src/targets/gpu/device/include/migraphx/gpu/device/reduce_ops.hpp   +1   -1
    src/targets/gpu/device/int8_gemm_pack.cpp                           +14  -14
    src/targets/gpu/device/layernorm.cpp                                +7   -7
    src/targets/gpu/device/multinomial.cpp                              +3   -3
src/targets/cpu/include/migraphx/cpu/dnnl.hpp

@@ -38,7 +38,7 @@ dnnl_context& get_dnnl_context();
 dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t);
-dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n);
+dnnl::memory::format_tag to_dnnl_memory_format_tag(int n);
 template <class R>
 inline dnnl::memory::dims to_dnnl_dims(R&& r)

@@ -105,14 +105,14 @@ struct dnnl_op : auto_register_op<Derived>
         return {{"group", g}};
     }
-    std::size_t get_extra_post_op_args() const
+    int get_extra_post_op_args() const
     {
         return std::count_if(post_ops.begin(), post_ops.end(), [](const auto& po) {
             return contains(po.algo, "binary");
         });
     }
-    static std::size_t get_binary_post_op_arg(std::size_t pos)
+    static int get_binary_post_op_arg(int pos)
     {
         return MIGRAPHX_DNNL_PREFIX(ARG_ATTR_MULTIPLE_POST_OP)(pos) | // NOLINT
                MIGRAPHX_DNNL_PREFIX(ARG_SRC_1);                       // NOLINT

@@ -154,7 +154,7 @@ struct dnnl_op : auto_register_op<Derived>
                        strides.end(),
                        lens.begin(),
                        lens.begin(),
-                       [](auto stride, auto len) -> std::size_t {
+                       [](auto stride, auto len) -> int {
                            if(stride == 0)
                                return 1;
                            else

@@ -182,7 +182,7 @@ struct dnnl_op : auto_register_op<Derived>
         }
     }
     shape adjust_shape(const shape& s, int) const { return base_adjust_shape(s); }
-    std::vector<int> create_arg_map(std::size_t input_size) const
+    std::vector<int> create_arg_map(int input_size) const
    {
         const auto& self = static_cast<const Derived&>(*this);
         auto npost_ops   = get_extra_post_op_args();
src/targets/cpu/include/migraphx/cpu/parallel.hpp

@@ -16,14 +16,14 @@ namespace cpu {
 #ifdef MIGRAPHX_DISABLE_OMP
-inline std::size_t max_threads() { return std::thread::hardware_concurrency(); }
+inline int max_threads() { return std::thread::hardware_concurrency(); }
 template <class F>
-void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+void parallel_for_impl(int n, int threadsize, F f)
 {
     if(threadsize <= 1)
     {
-        f(std::size_t{0}, n);
+        f(int{0}, n);
     }
     else
     {

@@ -32,9 +32,9 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 #if(!defined(__GNUC__) || __GNUC__ != 5)
         const
 #endif
-            std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+            int grainsize = std::ceil(static_cast<double>(n) / threads.size());
-        std::size_t work = 0;
+        int work = 0;
         std::generate(threads.begin(), threads.end(), [=, &work] {
             auto result = joinable_thread([=]() mutable { f(work, std::min(n, work + grainsize)); });

@@ -47,36 +47,36 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 }
 #else
-inline std::size_t max_threads() { return omp_get_max_threads(); }
+inline int max_threads() { return omp_get_max_threads(); }
 template <class F>
-void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+void parallel_for_impl(int n, int threadsize, F f)
 {
     if(threadsize <= 1)
     {
-        f(std::size_t{0}, n);
+        f(int{0}, n);
     }
     else
     {
-        std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
+        int grainsize = std::ceil(static_cast<double>(n) / threadsize);
 #pragma omp parallel for num_threads(threadsize) schedule(static, 1) private(grainsize, n)
-        for(std::size_t tid = 0; tid < threadsize; tid++)
+        for(int tid = 0; tid < threadsize; tid++)
         {
-            std::size_t work = tid * grainsize;
+            int work = tid * grainsize;
             f(work, std::min(n, work + grainsize));
         }
     }
 }
 #endif
 template <class F>
-void parallel_for(std::size_t n, std::size_t min_grain, F f)
+void parallel_for(int n, int min_grain, F f)
 {
-    const auto threadsize = std::min<std::size_t>(max_threads(), n / min_grain);
+    const auto threadsize = std::min<int>(max_threads(), n / min_grain);
     parallel_for_impl(n, threadsize, f);
 }
 template <class F>
-void parallel_for(std::size_t n, F f)
+void parallel_for(int n, F f)
 {
     const int min_grain = 8;
     parallel_for(n, min_grain, f);
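For reference, the partitioning arithmetic in parallel_for_impl splits n items into ceil(n / threadsize)-sized chunks and clamps the last one with std::min. A standalone sketch of just that arithmetic (the thread launch is omitted):

    #include <algorithm>
    #include <cmath>
    #include <iostream>

    int main()
    {
        int n          = 10;
        int threadsize = 4;
        int grainsize  = std::ceil(static_cast<double>(n) / threadsize);
        for(int tid = 0; tid < threadsize; tid++)
        {
            int work = tid * grainsize;
            // Each worker gets the half-open range [work, min(n, work + grainsize))
            std::cout << "thread " << tid << ": [" << work << ", "
                      << std::min(n, work + grainsize) << ")\n";
        }
        // Prints [0, 3) [3, 6) [6, 9) [9, 10); the final chunk is clamped.
    }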
src/targets/cpu/include/migraphx/cpu/pointwise.hpp

@@ -16,26 +16,26 @@ struct multi_index
 {
     constexpr multi_index() = default;
-    multi_index(const shape& s, std::size_t i) : n(s.lens().size())
+    multi_index(const shape& s, int i) : n(s.lens().size())
     {
         assert(n < max_size);
         std::copy(s.lens().begin(), s.lens().end(), dims);
         s.multi_copy(i, index, index + max_size);
     }
-    constexpr std::size_t size() const { return n; }
+    constexpr int size() const { return n; }
-    constexpr std::size_t* begin() { return index; }
-    constexpr const std::size_t* begin() const { return index; }
+    constexpr int* begin() { return index; }
+    constexpr const int* begin() const { return index; }
-    constexpr std::size_t* end() { return index + size(); }
-    constexpr const std::size_t* end() const { return index + size(); }
+    constexpr int* end() { return index + size(); }
+    constexpr const int* end() const { return index + size(); }
-    std::size_t offset(const shape& s) const { return s.index(begin(), end()); }
+    int offset(const shape& s) const { return s.index(begin(), end()); }
     constexpr void carry()
     {
-        std::size_t overflow = 0;
+        int overflow = 0;
         for(std::ptrdiff_t i = size() - 1; i > 0; i--)
         {
             auto z = index[i] + overflow;

@@ -57,13 +57,13 @@ struct multi_index
         index[0] += overflow;
     }
-    constexpr void increment(std::size_t i)
+    constexpr void increment(int i)
     {
         index[size() - 1] += i;
         carry();
     }
-    constexpr multi_index& operator+=(std::size_t i)
+    constexpr multi_index& operator+=(int i)
     {
         increment(i);
         return *this;

@@ -82,10 +82,10 @@ struct multi_index
     }
     private:
-    static const std::size_t max_size = 5;
-    std::size_t index[max_size] = {};
-    std::size_t dims[max_size]  = {};
-    std::size_t n               = 0;
+    static const int max_size = 5;
+    int index[max_size] = {};
+    int dims[max_size]  = {};
+    int n               = 0;
 };
 struct reduce_dims_base

@@ -97,7 +97,7 @@ struct reduce_dims_base
         reduce_shapes = reduce_dims(inputs);
     }
-    argument get_arg(const std::vector<argument>& args, std::size_t i) const
+    argument get_arg(const std::vector<argument>& args, int i) const
     {
         if(reduce_shapes.empty())
             return args[i];

@@ -111,7 +111,7 @@ struct reduce_dims_base
     }
 };
-template <class T, std::size_t N>
+template <class T, int N>
 struct vec
 {
     using array_type = std::array<T, N>;

@@ -126,19 +126,19 @@ struct vec
 };
 template <class T>
-constexpr std::integral_constant<std::size_t, 0> vec_size(const T&)
+constexpr std::integral_constant<int, 0> vec_size(const T&)
 {
     return {};
 }
-template <class T, std::size_t N>
-constexpr std::integral_constant<std::size_t, N> vec_size(const vec<T, N>&)
+template <class T, int N>
+constexpr std::integral_constant<int, N> vec_size(const vec<T, N>&)
 {
     return {};
 }
 template <class T>
-constexpr std::size_t vec_size()
+constexpr int vec_size()
 {
     return decltype(vec_size(std::declval<T>())){};
 }

@@ -148,7 +148,7 @@ void vec_apply(F f, V& v, Vs... vs)
 {
     assert(all_of({vec_size<Vs>()...}, [&](auto n) { return n == vec_size<V>(); }));
     assert(vec_size<V>() == v.array.size());
-    for(std::size_t i = 0; i < vec_size<V>(); i++)
+    for(int i = 0; i < vec_size<V>(); i++)
         f(v.array[i], vs.vector[i]...);
 }

@@ -158,9 +158,9 @@ void vec_apply(F f, V& v, Vs&... vs)
     f(v, vs...);
 }
-inline std::size_t find_packed_len(const shape& s)
+inline int find_packed_len(const shape& s)
 {
-    for(std::size_t i = 0; i < s.lens().size(); i++)
+    for(int i = 0; i < s.lens().size(); i++)
     {
         if(s.lens()[i] > 1 and s.strides()[i] == 1)
         {

@@ -170,7 +170,7 @@ inline std::size_t find_packed_len(const shape& s)
     return -1;
 }
-template <std::size_t N>
+template <int N>
 shape vectorize(const shape& s)
 {
     assert(s.standard() or s.broadcasted());

@@ -188,7 +188,7 @@ shape vectorize(const shape& s)
     return {s.type(), lens};
 }
-template <std::size_t N, class T>
+template <int N, class T>
 tensor_view<vec<T, N>> vectorize(tensor_view<T> tv)
 {
     return {vectorize<N>(tv.get_shape()), reinterpret_cast<vec<T, N>*>(tv.data())};

@@ -209,7 +209,7 @@ struct is_vector_tensor_view : and_<is_vector_type<typename Ts::value_type>{}...
 {
 };
-template <std::size_t N, class... Xs>
+template <int N, class... Xs>
 bool is_vectorizable(const Xs&... xs)
 {
     return all_of({xs...}, [](const auto& s) {

@@ -223,7 +223,7 @@ bool is_vectorizable(const Xs&... xs)
         s.strides().begin(),
         0,
         std::plus<>{},
-        [&](auto len, auto stride) -> std::size_t {
+        [&](auto len, auto stride) -> int {
             if(stride > 0 and len == 1)
                 return 0;
             return stride;

@@ -272,7 +272,7 @@ bool is_standard_offset(const X& x, const Xs&... xs)
 template <class... Ts>
 auto pointwise_apply(Ts... ts)
 {
-    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+    return [=](context& ctx, const shape& base_shape, int min_grain, auto f) mutable {
         if(is_standard_offset(ts.get_shape()...))
         {
             ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {

@@ -300,7 +300,7 @@ auto pointwise_apply(Ts... ts)
 template <class... Ts>
 auto pointwise(Ts... ts)
 {
-    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+    return [=](context& ctx, const shape& base_shape, int min_grain, auto f) mutable {
         auto_vectorize(base_shape, ts...)([&](auto bs, auto... xs) {
             pointwise_apply(xs...)(ctx, bs, min_grain, f);
         });
     };
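The carry() loop above is cut off on this page after `auto z = index[i] + overflow;`. A plausible completion, shown purely as a sketch (the div/mod step is an assumption, not visible in this excerpt), demonstrates the odometer-style increment that multi_index implements:

    #include <iostream>

    int main()
    {
        int dims[3]  = {2, 3, 4};
        int index[3] = {0, 2, 3}; // one step before {1, 0, 0}
        index[2] += 1;            // increment(1) bumps the innermost digit
        int overflow = 0;
        for(int i = 2; i > 0; i--)
        {
            int z    = index[i] + overflow; // as in the diff
            overflow = z / dims[i];         // assumed carry-out
            index[i] = z % dims[i];         // assumed wrap
        }
        index[0] += overflow;
        std::cout << index[0] << index[1] << index[2] << '\n'; // prints 100
    }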
src/targets/cpu/lowering.cpp

@@ -77,35 +77,35 @@ struct cpu_im2col
         auto input_shape   = args[0].get_shape();
         auto weights_shape = args[1].get_shape();
         visit_all(result, args[0])([&](auto col, auto input) {
-            const std::size_t& height   = input_shape.lens()[2];
-            const std::size_t& width    = input_shape.lens()[3];
-            const std::size_t& channels = weights_shape.lens()[1];
-            const std::size_t& kernel_h = weights_shape.lens()[2];
-            const std::size_t& kernel_w = weights_shape.lens()[3];
-            const std::size_t& pad_h    = op.padding[0];
-            const std::size_t& pad_w    = op.padding[1];
-            const std::size_t& stride_h = op.stride[0];
-            const std::size_t& stride_w = op.stride[1];
+            const int& height   = input_shape.lens()[2];
+            const int& width    = input_shape.lens()[3];
+            const int& channels = weights_shape.lens()[1];
+            const int& kernel_h = weights_shape.lens()[2];
+            const int& kernel_w = weights_shape.lens()[3];
+            const int& pad_h    = op.padding[0];
+            const int& pad_w    = op.padding[1];
+            const int& stride_h = op.stride[0];
+            const int& stride_w = op.stride[1];
             long kdiv2_h = long(kernel_h) / 2;
             long kdiv2_w = long(kernel_w) / 2;
             // calculate output sizes
-            const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
-            const std::size_t col_width  = (width - kernel_w + 2 * pad_w) / stride_w + 1;
+            const int col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
+            const int col_width  = (width - kernel_w + 2 * pad_w) / stride_w + 1;
             // account for padding for the starting position of the input pixels
             long iinput = kdiv2_h - long(pad_h);
             // loop over output pixels (ioutput, joutput)
-            for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
+            for(int ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
             {
                 long jinput = kdiv2_w - long(pad_w);
-                for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
+                for(int joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
                 {
                     // compute linear index for output
-                    std::size_t ldx = ioutput * col_width + joutput;
-                    std::size_t p   = 0;
+                    int ldx = ioutput * col_width + joutput;
+                    int p   = 0;
                     dfor(channels, kernel_h,
-                         kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
+                         kernel_w)([&](int c, int koffset, int loffset) {
                         auto idx = iinput + long(koffset) - kdiv2_h;
                         auto jdx = jinput + long(loffset) - kdiv2_w;
                         col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))

@@ -177,7 +177,7 @@ struct cpu_pad
         visit_all(result, args[0])([&](auto output, auto input) {
             shape_for_each(input.get_shape(), [&](const auto& idx) {
-                std::vector<std::size_t> new_idx(idx.size());
+                std::vector<int> new_idx(idx.size());
                 std::transform(
                     idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) {
                         return i + j;

@@ -307,7 +307,7 @@ struct cpu_apply
                        outputs_alias.begin(),
                        [](const auto& i) { return instruction::get_output_alias(i); });
-        std::size_t index = 0;
+        int index = 0;
         for(auto ins : outputs_alias)
         {
             prog_output_names[ins] = modl->name() + ":#output_" + std::to_string(index++);
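A quick numeric check of the output-size formula in cpu_im2col, col = (in - kernel + 2 * pad) / stride + 1:

    #include <iostream>

    int main()
    {
        int height = 5, kernel_h = 3, pad_h = 1, stride_h = 2;
        int col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
        std::cout << col_height << '\n'; // (5 - 3 + 2) / 2 + 1 = 3 output rows
    }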
src/targets/cpu/pooling.cpp

@@ -26,7 +26,7 @@ struct max_pool
         return (m);
     }
-    static double final(double x, std::size_t) { return (x); }
+    static double final(double x, int) { return (x); }
 };
 struct avg_pool

@@ -41,7 +41,7 @@ struct avg_pool
     static double apply(double x, double y) { return x + y; }
-    static double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
+    static double final(double x, int y) { return (y == 0) ? 0.0 : (x / y); }
 };
 template <class Op>

@@ -77,14 +77,14 @@ struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
             using type = typename decltype(output)::value_type;
             auto in_s    = input.get_shape();
             auto in_lens = in_s.lens();
-            std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
+            std::vector<int> vec_len(in_lens.begin() + 2, in_lens.end());
             par_for(output_shape.elements(), [&](auto i) {
                 auto idx_o = output_shape.multi(i);
                 auto n_dim = idx_o.size();
-                std::vector<std::size_t> win_start;
-                std::vector<std::size_t> win_size;
-                for(std::size_t dim = 2; dim < n_dim; ++dim)
+                std::vector<int> win_start;
+                std::vector<int> win_size;
+                for(int dim = 2; dim < n_dim; ++dim)
                 {
                     auto d_2  = dim - 2;
                     int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -

@@ -131,8 +131,8 @@ struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::po
     {
         auto algo = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
         auto kdims = op.kdims();
-        std::vector<size_t> padding_l(op.padding.begin(), op.padding.begin() + kdims);
-        std::vector<size_t> padding_r(op.padding.begin() + kdims, op.padding.end());
+        std::vector<int> padding_l(op.padding.begin(), op.padding.begin() + kdims);
+        std::vector<int> padding_r(op.padding.begin() + kdims, op.padding.end());
         return {dnnl::prop_kind::forward_inference,
                 algo,
                 m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
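The max_pool/avg_pool pair follows an accumulate-then-finalize protocol: apply() folds window elements, final() converts the accumulator into the result, with avg_pool guarding against an empty window. A standalone sketch of the pattern (final renamed to final_avg here to keep the example self-contained):

    #include <iostream>

    double apply(double x, double y) { return x + y; }
    double final_avg(double x, int y) { return (y == 0) ? 0.0 : (x / y); }

    int main()
    {
        double window[] = {1.0, 2.0, 3.0, 4.0};
        double acc = 0.0;
        int count  = 0;
        for(double v : window)
        {
            acc = apply(acc, v); // fold every element in the pooling window
            count++;
        }
        std::cout << final_avg(acc, count) << '\n'; // 2.5; empty window -> 0.0
    }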
src/targets/gpu/analyze_streams.cpp

@@ -11,14 +11,14 @@ namespace gpu {
 struct hip_stream_model
 {
-    std::size_t max_stream = 0;
-    std::unordered_map<migraphx::instruction_ref, std::size_t> ins2stream{};
-    std::size_t get_nstream() const { return max_stream + 1; }
-    std::size_t get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); }
-    std::size_t get_event_id(migraphx::instruction_ref ins) const
+    int max_stream = 0;
+    std::unordered_map<migraphx::instruction_ref, int> ins2stream{};
+    int get_nstream() const { return max_stream + 1; }
+    int get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); }
+    int get_event_id(migraphx::instruction_ref ins) const
     {
         auto v = ins->get_operator().to_value();
-        return v["event"].to<std::size_t>();
+        return v["event"].to<int>();
     }
     bool has_stream(migraphx::instruction_ref ins) const { return ins2stream.count(ins) > 0; }
     bool is_record(migraphx::instruction_ref ins) const

@@ -31,13 +31,13 @@ struct hip_stream_model
 stream_model make_stream_model(const module& p)
 {
     hip_stream_model m;
-    std::size_t stream = 0;
+    int stream = 0;
     for(auto ins : iterator_for(p))
     {
         if(ins->name() == "gpu::set_stream")
         {
             auto v = ins->get_operator().to_value();
-            stream = v["stream"].to<std::size_t>();
+            stream = v["stream"].to<int>();
             m.max_stream = std::max(stream, m.max_stream);
         }
         if(ins->get_operator().is_context_free())
src/targets/gpu/batch_norm_inference.cpp

@@ -18,8 +18,8 @@ inline shape reshape_to_2d(const shape& input)
     if(dims.size() >= 4)
         return input;
-    std::vector<size_t> new_dims(dims.begin(), dims.end());
-    std::size_t num = 4 - dims.size();
+    std::vector<int> new_dims(dims.begin(), dims.end());
+    int num = 4 - dims.size();
     new_dims.insert(new_dims.end(), num, 1);
     return {input.type(), new_dims};
 }
src/targets/gpu/compile_hip.cpp

@@ -67,7 +67,7 @@ struct hiprtc_program
         string_array() {}
         string_array(const string_array&) = delete;
-        std::size_t size() const { return strings.size(); }
+        int size() const { return strings.size(); }
         const char** data() { return c_strs.data(); }

@@ -125,7 +125,7 @@ struct hiprtc_program
     std::string log()
     {
-        std::size_t n = 0;
+        int n = 0;
         MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
         if(n < 2)
             return {};

@@ -137,7 +137,7 @@ struct hiprtc_program
     std::vector<char> get_code_obj()
     {
-        std::size_t n = 0;
+        int n = 0;
         MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
         std::vector<char> buffer(n);
         MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data()));

@@ -231,17 +231,17 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     return {compiler.compile(srcs)};
 }
-std::string enum_params(std::size_t count, std::string param)
+std::string enum_params(int count, std::string param)
 {
     std::vector<std::string> items(count);
     transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
     return join_strings(items, ",");
 }
-std::size_t compute_global(std::size_t n, std::size_t local)
+int compute_global(int n, int local)
 {
-    std::size_t groups  = (n + local - 1) / local;
-    std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
+    int groups  = (n + local - 1) / local;
+    int nglobal = std::min<int>(256, groups) * local;
     return nglobal;
 }
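compute_global is self-contained; copied into a tiny driver, it rounds n up to whole workgroups of size local and caps the grid at 256 groups:

    #include <algorithm>
    #include <iostream>

    int compute_global(int n, int local)
    {
        int groups  = (n + local - 1) / local; // ceil(n / local)
        int nglobal = std::min<int>(256, groups) * local;
        return nglobal;
    }

    int main()
    {
        std::cout << compute_global(1000, 64) << '\n';    // 16 groups -> 1024
        std::cout << compute_global(1000000, 64) << '\n'; // capped at 256 -> 16384
    }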
src/targets/gpu/compile_hip_code_object.cpp

@@ -35,7 +35,7 @@ struct make_tensor<${n}>
 };
 )__migraphx__";
-std::string generate_make_tensor(std::size_t n, const shape& s)
+std::string generate_make_tensor(int n, const shape& s)
 {
     return interpolate_string(make_tensor_template,
                               {{"n", std::to_string(n)},

@@ -47,7 +47,7 @@ std::string generate_make_tensor(std::size_t n, const shape& s)
 std::string generate_args_hpp(const std::vector<shape>& inputs)
 {
     std::string inner;
-    for(std::size_t i = 0; i < inputs.size(); i++)
+    for(int i = 0; i < inputs.size(); i++)
     {
         inner += generate_make_tensor(i, inputs[i]);
     }
src/targets/gpu/compile_ops.cpp

@@ -73,7 +73,7 @@ struct compiled_result
 };
 template <class F>
-void par_compile(std::size_t n, F f)
+void par_compile(int n, F f)
 {
     if(n == 0)
         return;
src/targets/gpu/concat.cpp

@@ -16,7 +16,7 @@ argument hip_concat::compute(context& ctx,
                              const shape& output_shape,
                              const std::vector<argument>& args) const
 {
-    std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
+    std::vector<int> offsets = op.compute_offsets(output_shape, args);
     return device::concat(ctx.get_stream().get(), output_shape, args, offsets);
 }
src/targets/gpu/convolution.cpp

@@ -21,7 +21,7 @@ inline shape reshape_if_1d(const shape& input)
     if(dims.size() == 3)
     {
-        std::vector<size_t> new_dims = dims;
+        std::vector<int> new_dims = dims;
         new_dims.insert(new_dims.begin() + 2, 1);
         new_shape = shape{input.type(), new_dims};
     }

@@ -71,7 +71,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
                                                       cd.get(),
                                                       y_desc.get(),
                                                       &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
+    workspace_shape = shape{shape::int8_type, {static_cast<int>(workspace_size)}};
     auto x = to_gpu(generate_argument(inputs[0]));
     auto w = to_gpu(generate_argument(inputs[1]));

@@ -98,7 +98,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
         MIGRAPHX_THROW("MIOpen Convolution: find convolution failed");
     algo = perf.fwd_algo;
-    size_t solution_count;
+    std::size_t solution_count;
     status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
                                                       w_desc.get(),

@@ -124,7 +124,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
     solution_id = solutions.front().solution_id;
-    return shape{shape::int8_type, {perf.memory}};
+    return shape{shape::int8_type, {static_cast<int>(perf.memory)}};
 }
 void miopen_convolution::finalize(context& ctx,
src/targets/gpu/deconvolution.cpp

@@ -21,7 +21,7 @@ inline shape reshape_if_1d(const shape& input)
     if(dims.size() == 3)
     {
-        std::vector<size_t> new_dims = dims;
+        std::vector<int> new_dims = dims;
         new_dims.insert(new_dims.begin() + 2, 1);
         new_shape = shape{input.type(), new_dims};
     }

@@ -72,7 +72,7 @@ shape miopen_deconvolution::compile(context& ctx,
                                                       cd.get(),
                                                       y_desc.get(),
                                                       &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
+    workspace_shape = shape{shape::int8_type, {static_cast<int>(workspace_size)}};
     auto x = to_gpu(generate_argument(inputs[0]));
     auto w = to_gpu(generate_argument(inputs[1]));

@@ -99,7 +99,7 @@ shape miopen_deconvolution::compile(context& ctx,
         MIGRAPHX_THROW("Find deconvolution failed");
     handle = ctx.get_stream().get_miopen();
     algo   = perf.fwd_algo;
-    return shape{shape::int8_type, {perf.memory}};
+    return shape{shape::int8_type, {static_cast<int>(perf.memory)}};
 }
 void miopen_deconvolution::finalize(context& ctx,
src/targets/gpu/device/concat.cpp

@@ -11,10 +11,10 @@ namespace device {
 argument concat(hipStream_t stream,
                 const migraphx::shape&,
                 std::vector<migraphx::argument> args,
-                std::vector<std::size_t> offsets)
+                std::vector<int> offsets)
 {
     auto ninputs = args.size() - 1;
-    for(std::size_t j = 0; j < ninputs; j++)
+    for(int j = 0; j < ninputs; j++)
     {
         auto&& arg  = args[j];
         auto offset = offsets[j];
src/targets/gpu/device/gather.cpp

@@ -17,7 +17,7 @@ argument gather(hipStream_t stream, argument result, argument arg1, argument arg
     auto axis_dim_size = lens[axis];
     lens[axis]         = arg2.get_shape().elements();
     shape out_comp_shape{result.get_shape().type(), lens};
-    std::size_t nelements = result.get_shape().elements();
+    int nelements = result.get_shape().elements();
     visit_all(result, arg1)([&](auto output, auto input_v) {
         hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp

@@ -96,14 +96,14 @@ void nary_broadcast_vec_impl(
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+        for(int i = idx.local; i < bdim_vec_len; i += nlocal)
         {
             buffer[i] = binput.data()[i];
         }
         __syncthreads();
         auto* bp = as_pointer(buffer);
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i * vec_size);
             auto b    = bp[bidx];

@@ -141,13 +141,13 @@ void nary_broadcast_impl(hipStream_t stream, F f, argument result, argument barg
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_len; i += nlocal)
+        for(int i = idx.local; i < bdim_len; i += nlocal)
         {
             buffer[i] = binput.data()[i];
         }
         __syncthreads();
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i);
             auto b    = buffer[bidx];

@@ -187,18 +187,18 @@ void nary_double_broadcast_vec_impl(
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+        for(int i = idx.local; i < bdim_vec_len; i += nlocal)
         {
             buffer[i] = binput1.data()[i];
         }
-        for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+        for(int i = idx.local; i < bdim_vec_len; i += nlocal)
         {
             buffer[i + bdim_vec_len] = binput2.data()[i];
         }
         __syncthreads();
         auto* bp = as_pointer(buffer);
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i * vec_size);
             auto b1   = bp[bidx];

@@ -242,17 +242,17 @@ void nary_double_broadcast_impl(
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_len; i += nlocal)
+        for(int i = idx.local; i < bdim_len; i += nlocal)
         {
             buffer[i] = binput1.data()[i];
         }
-        for(size_t i = idx.local; i < bdim_len; i += nlocal)
+        for(int i = idx.local; i < bdim_len; i += nlocal)
         {
             buffer[i + bdim_len] = binput2.data()[i];
         }
         __syncthreads();
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i);
             auto b1   = buffer[bidx];
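Every kernel in nary.hpp walks its elements with the same grid-stride idiom: start at the global thread id and step by the grid size, so any nelements is covered regardless of launch size. A CPU-side sketch of the coverage pattern (no HIP required):

    #include <iostream>

    int main()
    {
        const int nglobal = 4, nelements = 10;
        for(int global_id = 0; global_id < nglobal; global_id++)
        {
            std::cout << "thread " << global_id << ":";
            // Same loop shape as `for(int i = idx.global; i < nelements; i += nglobal)`
            for(int i = global_id; i < nelements; i += nglobal)
                std::cout << ' ' << i;
            std::cout << '\n';
        }
    }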
src/targets/gpu/device/include/migraphx/gpu/device/reduce_ops.hpp

@@ -37,7 +37,7 @@ struct id
 struct mean
 {
-    size_t item_num = 1;
+    int item_num = 1;
     template <class T>
     MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
     {
src/targets/gpu/device/int8_gemm_pack.cpp

@@ -16,20 +16,20 @@ void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument
     auto out_lens = comp_shape.lens();
     auto dim_0    = out_lens.size() - 2;
     auto dim_1    = out_lens.size() - 1;
-    std::size_t lda    = comp_shape.strides()[dim_0];
-    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    int lda    = comp_shape.strides()[dim_0];
+    int m_size = out_lens[dim_0] * out_lens[dim_1];
     visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = comp_shape.elements();
+        int nelements = comp_shape.elements();
         auto* out_ptr = device_cast(output.data());
         auto* in_ptr  = device_cast(input.data());
         visit_tensor_size(out_lens.size(), [&](auto out_dim) {
             hip_tensor_descriptor<out_dim> desc(comp_shape);
             gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-                const size_t nb = 4;
+                const int nb = 4;
                 auto idx = desc.multi(ii);
-                std::size_t i_m    = idx[dim_1];
-                std::size_t i_k    = idx[dim_0];
-                std::size_t offset = ii / m_size * m_size;
+                int i_m    = idx[dim_1];
+                int i_k    = idx[dim_0];
+                int offset = ii / m_size * m_size;
                 out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
                     in_ptr[i_m + i_k * lda + offset];
             });

@@ -43,24 +43,24 @@ void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument
     auto out_lens = trans_shape.lens();
     auto dim_0    = trans_shape.lens().size() - 2;
     auto dim_1    = trans_shape.lens().size() - 1;
-    std::size_t ldb = trans_shape.strides()[dim_1];
+    int ldb = trans_shape.strides()[dim_1];
     auto wrap_lens = out_lens;
     std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
     shape comp_shape{trans_shape.type(), wrap_lens};
-    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    int m_size = out_lens[dim_0] * out_lens[dim_1];
     visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = comp_shape.elements();
+        int nelements = comp_shape.elements();
         auto* out_ptr = device_cast(output.data());
         auto* in_ptr  = device_cast(input.data());
         visit_tensor_size(out_lens.size(), [&](auto out_dim) {
             hip_tensor_descriptor<out_dim> desc(comp_shape);
             gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-                const size_t nb = 4;
+                const int nb = 4;
                 auto idx = desc.multi(ii);
-                std::size_t i_n    = idx[dim_1];
-                std::size_t i_k    = idx[dim_0];
-                std::size_t offset = ii / m_size * m_size;
+                int i_n    = idx[dim_1];
+                int i_k    = idx[dim_0];
+                int offset = ii / m_size * m_size;
                 out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
                     in_ptr[i_n + i_k * ldb + offset];
             });
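The packing expression in int8_gemm_pack_a groups every nb = 4 consecutive k-rows so they land contiguously per column. Tracing the index math for a tiny 4x2 tile (lda = 2 columns, illustration only):

    #include <iostream>

    int main()
    {
        const int nb = 4, lda = 2;
        for(int i_k = 0; i_k < 4; i_k++)
            for(int i_m = 0; i_m < lda; i_m++)
            {
                int src = i_m + i_k * lda;
                int dst = i_k % nb + (i_m + (i_k / nb) * lda) * nb;
                std::cout << "in[" << src << "] -> out[" << dst << "]\n";
            }
        // Column m = 0 packs to out[0..3], column m = 1 to out[4..7].
    }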
src/targets/gpu/device/layernorm.cpp

@@ -81,7 +81,7 @@ __device__ auto auto_block_reduce(index idx, Op op, T init, index_int n, F f)
 template <index_int MaxBlockSize, class Input, class Output>
 __device__ void layernorm(index_int i,
                           index idx,
-                          std::size_t block_size_div,
+                          int block_size_div,
                           index_int relements,
                           Input input,
                           Output output)

@@ -129,9 +129,9 @@ void layernorm_vec_impl(hipStream_t stream,
 {
     hip_vec_visit_all<N>(result, args...)([&](auto output, auto... inputs) {
         const auto relements_v = relements / N;
-        const std::size_t max_block_size = 256;
-        const std::size_t block_size     = compute_block_size(relements_v, max_block_size);
-        const std::size_t block_size_div = encode_divisor(block_size);
+        const int max_block_size = 256;
+        const int block_size     = compute_block_size(relements_v, max_block_size);
+        const int block_size_div = encode_divisor(block_size);
         assert(relements_v <= block_size);
         gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {

@@ -158,9 +158,9 @@ void layernorm_impl(hipStream_t stream,
                     const Arguments&... args)
 {
     hip_visit_all(result, args...)([&](auto output, auto... inputs) {
-        const std::size_t max_block_size = 256;
-        const std::size_t block_size     = compute_block_size(relements, max_block_size);
-        const std::size_t block_size_div = encode_divisor(block_size);
+        const int max_block_size = 256;
+        const int block_size     = compute_block_size(relements, max_block_size);
+        const int block_size_div = encode_divisor(block_size);
         assert(relements <= block_size);
         gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
src/targets/gpu/device/multinomial.cpp

@@ -40,9 +40,9 @@ void multinomial(hipStream_t stream,
                  const argument& arg0,
                  const argument& arg1)
 {
-    size_t batch_size  = arg0.get_shape().lens().front();
-    size_t class_size  = arg0.get_shape().lens().back();
-    size_t sample_size = result.get_shape().lens().back();
+    int batch_size  = arg0.get_shape().lens().front();
+    int class_size  = arg0.get_shape().lens().back();
+    int sample_size = result.get_shape().lens().back();
     hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
         result.visit([&](auto out) {