Commit 94e3a2e4 authored by Shucai Xiao

change size_t to int

parent 26bd92d8
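The one-line message doesn't state a motivation, but the usual driver for a sweep like this is that mixing `std::size_t` with signed arithmetic triggers sign-compare warnings and wraparound bugs. A minimal sketch of the pitfall (illustrative only, not code from this commit):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<int> v{1, 2, 3};
    int want = 5;
    // Signed/unsigned comparison: with -Wsign-compare this line warns,
    // because 'want' is converted to std::size_t before comparing.
    if(v.size() < want)
        std::cout << "need more elements\n";
    // Unsigned subtraction wraps instead of going negative: 3 - 5 becomes
    // 18446744073709551614 on a 64-bit build, with no warning at all.
    std::size_t missing = v.size() - 5;
    std::cout << missing << '\n';
    return 0;
}
```

The trade-off shows up later in the diff: values that are genuinely `std::size_t`, such as the MIOpen workspace sizes, now need explicit `static_cast<int>` narrowing.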
@@ -38,7 +38,7 @@ dnnl_context& get_dnnl_context();
 dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t);
-dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n);
+dnnl::memory::format_tag to_dnnl_memory_format_tag(int n);
 template <class R>
 inline dnnl::memory::dims to_dnnl_dims(R&& r)
@@ -105,14 +105,14 @@ struct dnnl_op : auto_register_op<Derived>
 return {{"group", g}};
 }
-std::size_t get_extra_post_op_args() const
+int get_extra_post_op_args() const
 {
 return std::count_if(post_ops.begin(), post_ops.end(), [](const auto& po) {
 return contains(po.algo, "binary");
 });
 }
-static std::size_t get_binary_post_op_arg(std::size_t pos)
+static int get_binary_post_op_arg(int pos)
 {
 return MIGRAPHX_DNNL_PREFIX(ARG_ATTR_MULTIPLE_POST_OP)(pos) | // NOLINT
 MIGRAPHX_DNNL_PREFIX(ARG_SRC_1); // NOLINT
@@ -154,7 +154,7 @@ struct dnnl_op : auto_register_op<Derived>
 strides.end(),
 lens.begin(),
 lens.begin(),
-[](auto stride, auto len) -> std::size_t {
+[](auto stride, auto len) -> int {
 if(stride == 0)
 return 1;
 else
@@ -182,7 +182,7 @@ struct dnnl_op : auto_register_op<Derived>
 }
 }
 shape adjust_shape(const shape& s, int) const { return base_adjust_shape(s); }
-std::vector<int> create_arg_map(std::size_t input_size) const
+std::vector<int> create_arg_map(int input_size) const
 {
 const auto& self = static_cast<const Derived&>(*this);
 auto npost_ops = get_extra_post_op_args();
......
@@ -16,14 +16,14 @@ namespace cpu {
 #ifdef MIGRAPHX_DISABLE_OMP
-inline std::size_t max_threads() { return std::thread::hardware_concurrency(); }
+inline int max_threads() { return std::thread::hardware_concurrency(); }
 template <class F>
-void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+void parallel_for_impl(int n, int threadsize, F f)
 {
 if(threadsize <= 1)
 {
-f(std::size_t{0}, n);
+f(int{0}, n);
 }
 else
 {
@@ -32,9 +32,9 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 #if(!defined(__GNUC__) || __GNUC__ != 5)
 const
 #endif
-std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+int grainsize = std::ceil(static_cast<double>(n) / threads.size());
-std::size_t work = 0;
+int work = 0;
 std::generate(threads.begin(), threads.end(), [=, &work] {
 auto result =
 joinable_thread([=]() mutable { f(work, std::min(n, work + grainsize)); });
@@ -47,36 +47,36 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 }
 #else
-inline std::size_t max_threads() { return omp_get_max_threads(); }
+inline int max_threads() { return omp_get_max_threads(); }
 template <class F>
-void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+void parallel_for_impl(int n, int threadsize, F f)
 {
 if(threadsize <= 1)
 {
-f(std::size_t{0}, n);
+f(int{0}, n);
 }
 else
 {
-std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
+int grainsize = std::ceil(static_cast<double>(n) / threadsize);
 #pragma omp parallel for num_threads(threadsize) schedule(static, 1) private(grainsize, n)
-for(std::size_t tid = 0; tid < threadsize; tid++)
+for(int tid = 0; tid < threadsize; tid++)
 {
-std::size_t work = tid * grainsize;
+int work = tid * grainsize;
 f(work, std::min(n, work + grainsize));
 }
 }
 }
 #endif
 template <class F>
-void parallel_for(std::size_t n, std::size_t min_grain, F f)
+void parallel_for(int n, int min_grain, F f)
 {
-const auto threadsize = std::min<std::size_t>(max_threads(), n / min_grain);
+const auto threadsize = std::min<int>(max_threads(), n / min_grain);
 parallel_for_impl(n, threadsize, f);
 }
 template <class F>
-void parallel_for(std::size_t n, F f)
+void parallel_for(int n, F f)
 {
 const int min_grain = 8;
 parallel_for(n, min_grain, f);
......
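Both the thread and OpenMP variants of `parallel_for_impl` above use the same decomposition: split `n` items into chunks of `grainsize = ceil(n / threadsize)` and give worker `tid` the half-open range `[tid * grainsize, min(n, tid * grainsize + grainsize))`. A standalone sketch of that decomposition (thread machinery omitted; names are illustrative):

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>

// Print the [start, end) range each of 'threadsize' workers would receive
// for 'n' items, mirroring the grainsize computation above.
void show_chunks(int n, int threadsize)
{
    int grainsize = static_cast<int>(std::ceil(static_cast<double>(n) / threadsize));
    for(int tid = 0; tid < threadsize; tid++)
    {
        int start = tid * grainsize;
        int end   = std::min(n, start + grainsize);
        std::cout << "worker " << tid << ": [" << start << ", " << end << ")\n";
    }
}

int main()
{
    show_chunks(10, 4); // [0,3) [3,6) [6,9) [9,10) -- last chunk is clipped to n
    return 0;
}
```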
@@ -16,26 +16,26 @@ struct multi_index
 {
 constexpr multi_index() = default;
-multi_index(const shape& s, std::size_t i) : n(s.lens().size())
+multi_index(const shape& s, int i) : n(s.lens().size())
 {
 assert(n < max_size);
 std::copy(s.lens().begin(), s.lens().end(), dims);
 s.multi_copy(i, index, index + max_size);
 }
-constexpr std::size_t size() const { return n; }
+constexpr int size() const { return n; }
-constexpr std::size_t* begin() { return index; }
-constexpr const std::size_t* begin() const { return index; }
+constexpr int* begin() { return index; }
+constexpr const int* begin() const { return index; }
-constexpr std::size_t* end() { return index + size(); }
-constexpr const std::size_t* end() const { return index + size(); }
+constexpr int* end() { return index + size(); }
+constexpr const int* end() const { return index + size(); }
-std::size_t offset(const shape& s) const { return s.index(begin(), end()); }
+int offset(const shape& s) const { return s.index(begin(), end()); }
 constexpr void carry()
 {
-std::size_t overflow = 0;
+int overflow = 0;
 for(std::ptrdiff_t i = size() - 1; i > 0; i--)
 {
 auto z = index[i] + overflow;
@@ -57,13 +57,13 @@ struct multi_index
 index[0] += overflow;
 }
-constexpr void increment(std::size_t i)
+constexpr void increment(int i)
 {
 index[size() - 1] += i;
 carry();
 }
-constexpr multi_index& operator+=(std::size_t i)
+constexpr multi_index& operator+=(int i)
 {
 increment(i);
 return *this;
@@ -82,10 +82,10 @@ struct multi_index
 }
 private:
-static const std::size_t max_size = 5;
-std::size_t index[max_size] = {};
-std::size_t dims[max_size] = {};
-std::size_t n = 0;
+static const int max_size = 5;
+int index[max_size] = {};
+int dims[max_size] = {};
+int n = 0;
 };
struct reduce_dims_base
@@ -97,7 +97,7 @@ struct reduce_dims_base
 reduce_shapes = reduce_dims(inputs);
 }
-argument get_arg(const std::vector<argument>& args, std::size_t i) const
+argument get_arg(const std::vector<argument>& args, int i) const
 {
 if(reduce_shapes.empty())
 return args[i];
@@ -111,7 +111,7 @@
 }
 };
-template <class T, std::size_t N>
+template <class T, int N>
 struct vec
 {
 using array_type = std::array<T, N>;
@@ -126,19 +126,19 @@ struct vec
 };
 template <class T>
-constexpr std::integral_constant<std::size_t, 0> vec_size(const T&)
+constexpr std::integral_constant<int, 0> vec_size(const T&)
 {
 return {};
 }
-template <class T, std::size_t N>
-constexpr std::integral_constant<std::size_t, N> vec_size(const vec<T, N>&)
+template <class T, int N>
+constexpr std::integral_constant<int, N> vec_size(const vec<T, N>&)
 {
 return {};
 }
 template <class T>
-constexpr std::size_t vec_size()
+constexpr int vec_size()
 {
 return decltype(vec_size(std::declval<T>())){};
 }
@@ -148,7 +148,7 @@ void vec_apply(F f, V& v, Vs... vs)
 {
 assert(all_of({vec_size<Vs>()...}, [&](auto n) { return n == vec_size<V>(); }));
 assert(vec_size<V>() == v.array.size());
-for(std::size_t i = 0; i < vec_size<V>(); i++)
+for(int i = 0; i < vec_size<V>(); i++)
 f(v.array[i], vs.vector[i]...);
 }
@@ -158,9 +158,9 @@ void vec_apply(F f, V& v, Vs&... vs)
 f(v, vs...);
 }
-inline std::size_t find_packed_len(const shape& s)
+inline int find_packed_len(const shape& s)
 {
-for(std::size_t i = 0; i < s.lens().size(); i++)
+for(int i = 0; i < s.lens().size(); i++)
 {
 if(s.lens()[i] > 1 and s.strides()[i] == 1)
 {
@@ -170,7 +170,7 @@ inline std::size_t find_packed_len(const shape& s)
 return -1;
 }
-template <std::size_t N>
+template <int N>
 shape vectorize(const shape& s)
 {
 assert(s.standard() or s.broadcasted());
@@ -188,7 +188,7 @@ shape vectorize(const shape& s)
 return {s.type(), lens};
 }
-template <std::size_t N, class T>
+template <int N, class T>
 tensor_view<vec<T, N>> vectorize(tensor_view<T> tv)
 {
 return {vectorize<N>(tv.get_shape()), reinterpret_cast<vec<T, N>*>(tv.data())};
@@ -209,7 +209,7 @@ struct is_vector_tensor_view : and_<is_vector_type<typename Ts::value_type>{}...
 {
 };
-template <std::size_t N, class... Xs>
+template <int N, class... Xs>
 bool is_vectorizable(const Xs&... xs)
 {
 return all_of({xs...}, [](const auto& s) {
@@ -223,7 +223,7 @@ bool is_vectorizable(const Xs&... xs)
 s.strides().begin(),
 0,
 std::plus<>{},
-[&](auto len, auto stride) -> std::size_t {
+[&](auto len, auto stride) -> int {
 if(stride > 0 and len == 1)
 return 0;
 return stride;
@@ -272,7 +272,7 @@ bool is_standard_offset(const X& x, const Xs&... xs)
 template <class... Ts>
 auto pointwise_apply(Ts... ts)
 {
-return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+return [=](context& ctx, const shape& base_shape, int min_grain, auto f) mutable {
 if(is_standard_offset(ts.get_shape()...))
 {
 ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
@@ -300,7 +300,7 @@ auto pointwise_apply(Ts... ts)
 template <class... Ts>
 auto pointwise(Ts... ts)
 {
-return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+return [=](context& ctx, const shape& base_shape, int min_grain, auto f) mutable {
 auto_vectorize(base_shape, ts...)(
 [&](auto bs, auto... xs) { pointwise_apply(xs...)(ctx, bs, min_grain, f); });
 };
......
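`multi_index` above is a fixed-capacity multi-dimensional counter: `increment` bumps the innermost coordinate and `carry` renormalizes it digit by digit against the per-dimension bases in `dims`, like manual addition in a mixed radix. The carry loop's body is partly elided by the hunk; a minimal sketch using the standard mod/div formulation (illustrative, not the exact MIGraphX code):

```cpp
#include <cstdio>

// Propagate carries through a mixed-radix index: index[i] counts modulo dims[i].
void carry(int* index, const int* dims, int n)
{
    int overflow = 0;
    for(int i = n - 1; i > 0; i--)
    {
        int z = index[i] + overflow;
        index[i] = z % dims[i]; // keep the in-range digit
        overflow = z / dims[i]; // push the rest to the next-outer dimension
    }
    index[0] += overflow; // outermost dimension absorbs what is left
}

int main()
{
    int dims[3]  = {2, 3, 4};
    int index[3] = {0, 2, 3}; // both inner digits at their maximum
    index[2] += 1;            // step past the end of the innermost dim
    carry(index, dims, 3);
    std::printf("%d %d %d\n", index[0], index[1], index[2]); // prints: 1 0 0
    return 0;
}
```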
@@ -77,35 +77,35 @@ struct cpu_im2col
 auto input_shape = args[0].get_shape();
 auto weights_shape = args[1].get_shape();
 visit_all(result, args[0])([&](auto col, auto input) {
-const std::size_t& height = input_shape.lens()[2];
-const std::size_t& width = input_shape.lens()[3];
-const std::size_t& channels = weights_shape.lens()[1];
-const std::size_t& kernel_h = weights_shape.lens()[2];
-const std::size_t& kernel_w = weights_shape.lens()[3];
-const std::size_t& pad_h = op.padding[0];
-const std::size_t& pad_w = op.padding[1];
-const std::size_t& stride_h = op.stride[0];
-const std::size_t& stride_w = op.stride[1];
+const int& height = input_shape.lens()[2];
+const int& width = input_shape.lens()[3];
+const int& channels = weights_shape.lens()[1];
+const int& kernel_h = weights_shape.lens()[2];
+const int& kernel_w = weights_shape.lens()[3];
+const int& pad_h = op.padding[0];
+const int& pad_w = op.padding[1];
+const int& stride_h = op.stride[0];
+const int& stride_w = op.stride[1];
 long kdiv2_h = long(kernel_h) / 2;
 long kdiv2_w = long(kernel_w) / 2;
 // calculate output sizes
-const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
-const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1;
+const int col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
+const int col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1;
 // account for padding for the starting position of the input pixels
 long iinput = kdiv2_h - long(pad_h);
 // loop over output pixels (ioutput, joutput)
-for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
+for(int ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
 {
 long jinput = kdiv2_w - long(pad_w);
-for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
+for(int joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
 {
 // compute linear index for output
-std::size_t ldx = ioutput * col_width + joutput;
-std::size_t p = 0;
+int ldx = ioutput * col_width + joutput;
+int p = 0;
 dfor(channels,
 kernel_h,
-kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
+kernel_w)([&](int c, int koffset, int loffset) {
 auto idx = iinput + long(koffset) - kdiv2_h;
 auto jdx = jinput + long(loffset) - kdiv2_w;
 col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
@@ -177,7 +177,7 @@ struct cpu_pad
 visit_all(result, args[0])([&](auto output, auto input) {
 shape_for_each(input.get_shape(), [&](const auto& idx) {
-std::vector<std::size_t> new_idx(idx.size());
+std::vector<int> new_idx(idx.size());
 std::transform(
 idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) {
 return i + j;
@@ -307,7 +307,7 @@ struct cpu_apply
 outputs_alias.begin(),
 [](const auto& i) { return instruction::get_output_alias(i); });
-std::size_t index = 0;
+int index = 0;
 for(auto ins : outputs_alias)
 {
 prog_output_names[ins] = modl->name() + ":#output_" + std::to_string(index++);
......
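`cpu_im2col` sizes its output with the standard convolution formula, `out = (in - kernel + 2 * pad) / stride + 1`, then copies each output pixel's `channels * kernel_h * kernel_w` window into one row of `col`. A quick check of the size formula (sample values):

```cpp
#include <iostream>

// Output extent of a convolution along one axis, as used for col_height/col_width.
int conv_out(int in, int kernel, int pad, int stride)
{
    return (in - kernel + 2 * pad) / stride + 1;
}

int main()
{
    // 224x224 input, 3x3 kernel, padding 1, stride 1 -> same-size 224 output.
    std::cout << conv_out(224, 3, 1, 1) << '\n'; // 224
    // The same kernel at stride 2 roughly halves the grid.
    std::cout << conv_out(224, 3, 1, 2) << '\n'; // 112
    return 0;
}
```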
@@ -26,7 +26,7 @@ struct max_pool
 return (m);
 }
-static double final(double x, std::size_t) { return (x); }
+static double final(double x, int) { return (x); }
 };
struct avg_pool
@@ -41,7 +41,7 @@ struct avg_pool
 static double apply(double x, double y) { return x + y; }
-static double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
+static double final(double x, int y) { return (y == 0) ? 0.0 : (x / y); }
 };
template <class Op>
@@ -77,14 +77,14 @@ struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
 using type = typename decltype(output)::value_type;
 auto in_s = input.get_shape();
 auto in_lens = in_s.lens();
-std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
+std::vector<int> vec_len(in_lens.begin() + 2, in_lens.end());
 par_for(output_shape.elements(), [&](auto i) {
 auto idx_o = output_shape.multi(i);
 auto n_dim = idx_o.size();
-std::vector<std::size_t> win_start;
-std::vector<std::size_t> win_size;
-for(std::size_t dim = 2; dim < n_dim; ++dim)
+std::vector<int> win_start;
+std::vector<int> win_size;
+for(int dim = 2; dim < n_dim; ++dim)
 {
 auto d_2 = dim - 2;
 int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -
@@ -131,8 +131,8 @@ struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::po
 {
 auto algo = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
 auto kdims = op.kdims();
-std::vector<size_t> padding_l(op.padding.begin(), op.padding.begin() + kdims);
-std::vector<size_t> padding_r(op.padding.begin() + kdims, op.padding.end());
+std::vector<int> padding_l(op.padding.begin(), op.padding.begin() + kdims);
+std::vector<int> padding_r(op.padding.begin() + kdims, op.padding.end());
 return {dnnl::prop_kind::forward_inference,
 algo,
 m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
......
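The `dnnl_pooling` descriptor splits MIGraphX's single `padding` vector, leading pads first and trailing pads second, into the separate `padding_l`/`padding_r` arguments oneDNN expects. A small sketch of that split (values are hypothetical):

```cpp
#include <iostream>
#include <vector>

int main()
{
    // Assumed layout for a 2D op: {pad_h_begin, pad_w_begin, pad_h_end, pad_w_end}.
    std::vector<int> padding{1, 2, 3, 4};
    int kdims = 2; // number of spatial dimensions
    std::vector<int> padding_l(padding.begin(), padding.begin() + kdims);
    std::vector<int> padding_r(padding.begin() + kdims, padding.end());
    std::cout << padding_l[0] << ' ' << padding_l[1] << '\n'; // 1 2
    std::cout << padding_r[0] << ' ' << padding_r[1] << '\n'; // 3 4
    return 0;
}
```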
@@ -11,14 +11,14 @@ namespace gpu {
 struct hip_stream_model
 {
-std::size_t max_stream = 0;
-std::unordered_map<migraphx::instruction_ref, std::size_t> ins2stream{};
-std::size_t get_nstream() const { return max_stream + 1; }
-std::size_t get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); }
-std::size_t get_event_id(migraphx::instruction_ref ins) const
+int max_stream = 0;
+std::unordered_map<migraphx::instruction_ref, int> ins2stream{};
+int get_nstream() const { return max_stream + 1; }
+int get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); }
+int get_event_id(migraphx::instruction_ref ins) const
 {
 auto v = ins->get_operator().to_value();
-return v["event"].to<std::size_t>();
+return v["event"].to<int>();
 }
 bool has_stream(migraphx::instruction_ref ins) const { return ins2stream.count(ins) > 0; }
 bool is_record(migraphx::instruction_ref ins) const
@@ -31,13 +31,13 @@ struct hip_stream_model
 stream_model make_stream_model(const module& p)
 {
 hip_stream_model m;
-std::size_t stream = 0;
+int stream = 0;
 for(auto ins : iterator_for(p))
 {
 if(ins->name() == "gpu::set_stream")
 {
 auto v = ins->get_operator().to_value();
-stream = v["stream"].to<std::size_t>();
+stream = v["stream"].to<int>();
 m.max_stream = std::max(stream, m.max_stream);
 }
 if(ins->get_operator().is_context_free())
......
@@ -18,8 +18,8 @@ inline shape reshape_to_2d(const shape& input)
 if(dims.size() >= 4)
 return input;
-std::vector<size_t> new_dims(dims.begin(), dims.end());
-std::size_t num = 4 - dims.size();
+std::vector<int> new_dims(dims.begin(), dims.end());
+int num = 4 - dims.size();
 new_dims.insert(new_dims.end(), num, 1);
 return {input.type(), new_dims};
 }
......
@@ -67,7 +67,7 @@ struct hiprtc_program
 string_array() {}
 string_array(const string_array&) = delete;
-std::size_t size() const { return strings.size(); }
+int size() const { return strings.size(); }
 const char** data() { return c_strs.data(); }
@@ -125,7 +125,7 @@ struct hiprtc_program
 std::string log()
 {
-std::size_t n = 0;
+int n = 0;
 MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
 if(n < 2)
 return {};
@@ -137,7 +137,7 @@ struct hiprtc_program
 std::vector<char> get_code_obj()
 {
-std::size_t n = 0;
+int n = 0;
 MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
 std::vector<char> buffer(n);
 MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data()));
@@ -231,17 +231,17 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 return {compiler.compile(srcs)};
 }
-std::string enum_params(std::size_t count, std::string param)
+std::string enum_params(int count, std::string param)
 {
 std::vector<std::string> items(count);
 transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
 return join_strings(items, ",");
 }
-std::size_t compute_global(std::size_t n, std::size_t local)
+int compute_global(int n, int local)
 {
-std::size_t groups = (n + local - 1) / local;
-std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
+int groups = (n + local - 1) / local;
+int nglobal = std::min<int>(256, groups) * local;
 return nglobal;
 }
......
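`compute_global` rounds `n` up to whole workgroups of `local` threads, then caps the launch at 256 groups; larger problems are covered by the kernels' grid-stride loops rather than by more blocks. Worked through with sample numbers:

```cpp
#include <algorithm>
#include <iostream>

int compute_global(int n, int local)
{
    int groups  = (n + local - 1) / local;       // ceil(n / local)
    int nglobal = std::min(256, groups) * local; // cap at 256 workgroups
    return nglobal;
}

int main()
{
    std::cout << compute_global(1000, 64) << '\n';    // 16 groups -> 1024 threads
    std::cout << compute_global(1000000, 64) << '\n'; // capped: 256 * 64 = 16384
    return 0;
}
```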
@@ -35,7 +35,7 @@ struct make_tensor<${n}>
 };
 )__migraphx__";
-std::string generate_make_tensor(std::size_t n, const shape& s)
+std::string generate_make_tensor(int n, const shape& s)
 {
 return interpolate_string(make_tensor_template,
 {{"n", std::to_string(n)},
@@ -47,7 +47,7 @@ std::string generate_make_tensor(std::size_t n, const shape& s)
 std::string generate_args_hpp(const std::vector<shape>& inputs)
 {
 std::string inner;
-for(std::size_t i = 0; i < inputs.size(); i++)
+for(int i = 0; i < inputs.size(); i++)
 {
 inner += generate_make_tensor(i, inputs[i]);
 }
......
@@ -73,7 +73,7 @@ struct compiled_result
 };
 template <class F>
-void par_compile(std::size_t n, F f)
+void par_compile(int n, F f)
 {
 if(n == 0)
 return;
......
@@ -16,7 +16,7 @@ argument hip_concat::compute(context& ctx,
 const shape& output_shape,
 const std::vector<argument>& args) const
 {
-std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
+std::vector<int> offsets = op.compute_offsets(output_shape, args);
 return device::concat(ctx.get_stream().get(), output_shape, args, offsets);
 }
......
@@ -21,7 +21,7 @@ inline shape reshape_if_1d(const shape& input)
 if(dims.size() == 3)
 {
-std::vector<size_t> new_dims = dims;
+std::vector<int> new_dims = dims;
 new_dims.insert(new_dims.begin() + 2, 1);
 new_shape = shape{input.type(), new_dims};
 }
@@ -71,7 +71,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
 cd.get(),
 y_desc.get(),
 &workspace_size);
-workspace_shape = shape{shape::int8_type, {workspace_size}};
+workspace_shape = shape{shape::int8_type, {static_cast<int>(workspace_size)}};
 auto x = to_gpu(generate_argument(inputs[0]));
 auto w = to_gpu(generate_argument(inputs[1]));
@@ -98,7 +98,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
 MIGRAPHX_THROW("MIOpen Convolution: find convolution failed");
 algo = perf.fwd_algo;
-size_t solution_count;
+std::size_t solution_count;
 status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
 w_desc.get(),
@@ -124,7 +124,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
 solution_id = solutions.front().solution_id;
-return shape{shape::int8_type, {perf.memory}};
+return shape{shape::int8_type, {static_cast<int>(perf.memory)}};
 }
 void miopen_convolution::finalize(context& ctx,
......
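The two `static_cast<int>` casts above are where this commit narrows: MIOpen reports `workspace_size` and `perf.memory` as `std::size_t`, and a workspace over `INT_MAX` bytes would be silently truncated by the cast. A hedged alternative is a checked narrowing helper (illustrative; `checked_narrow` is not part of MIGraphX):

```cpp
#include <cstddef>
#include <limits>
#include <stdexcept>

// Narrow a size_t byte count to int, throwing instead of silently truncating.
inline int checked_narrow(std::size_t n)
{
    if(n > static_cast<std::size_t>(std::numeric_limits<int>::max()))
        throw std::runtime_error("workspace size exceeds INT_MAX bytes");
    return static_cast<int>(n);
}

// Usage sketch:
//   workspace_shape = shape{shape::int8_type, {checked_narrow(workspace_size)}};
```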
@@ -21,7 +21,7 @@ inline shape reshape_if_1d(const shape& input)
 if(dims.size() == 3)
 {
-std::vector<size_t> new_dims = dims;
+std::vector<int> new_dims = dims;
 new_dims.insert(new_dims.begin() + 2, 1);
 new_shape = shape{input.type(), new_dims};
 }
@@ -72,7 +72,7 @@ shape miopen_deconvolution::compile(context& ctx,
 cd.get(),
 y_desc.get(),
 &workspace_size);
-workspace_shape = shape{shape::int8_type, {workspace_size}};
+workspace_shape = shape{shape::int8_type, {static_cast<int>(workspace_size)}};
 auto x = to_gpu(generate_argument(inputs[0]));
 auto w = to_gpu(generate_argument(inputs[1]));
@@ -99,7 +99,7 @@ shape miopen_deconvolution::compile(context& ctx,
 MIGRAPHX_THROW("Find deconvolution failed");
 handle = ctx.get_stream().get_miopen();
 algo = perf.fwd_algo;
-return shape{shape::int8_type, {perf.memory}};
+return shape{shape::int8_type, {static_cast<int>(perf.memory)}};
 }
 void miopen_deconvolution::finalize(context& ctx,
......
@@ -11,10 +11,10 @@ namespace device {
 argument concat(hipStream_t stream,
 const migraphx::shape&,
 std::vector<migraphx::argument> args,
-std::vector<std::size_t> offsets)
+std::vector<int> offsets)
 {
 auto ninputs = args.size() - 1;
-for(std::size_t j = 0; j < ninputs; j++)
+for(int j = 0; j < ninputs; j++)
 {
 auto&& arg = args[j];
 auto offset = offsets[j];
......
@@ -17,7 +17,7 @@ argument gather(hipStream_t stream, argument result, argument arg1, argument arg
 auto axis_dim_size = lens[axis];
 lens[axis] = arg2.get_shape().elements();
 shape out_comp_shape{result.get_shape().type(), lens};
-std::size_t nelements = result.get_shape().elements();
+int nelements = result.get_shape().elements();
 visit_all(result, arg1)([&](auto output, auto input_v) {
 hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
......
@@ -96,14 +96,14 @@ void nary_broadcast_vec_impl(
 launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
 MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
 // Load bias into LDS
-for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+for(int i = idx.local; i < bdim_vec_len; i += nlocal)
 {
 buffer[i] = binput.data()[i];
 }
 __syncthreads();
 auto* bp = as_pointer(buffer);
 // Process the data
-for(size_t i = idx.global; i < nelements; i += nglobal)
+for(int i = idx.global; i < nelements; i += nglobal)
 {
 auto bidx = broadcast_idx(i * vec_size);
 auto b = bp[bidx];
@@ -141,13 +141,13 @@ void nary_broadcast_impl(hipStream_t stream, F f, argument result, argument barg
 launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
 MIGRAPHX_DEVICE_SHARED type buffer[2048];
 // Load bias into LDS
-for(size_t i = idx.local; i < bdim_len; i += nlocal)
+for(int i = idx.local; i < bdim_len; i += nlocal)
 {
 buffer[i] = binput.data()[i];
 }
 __syncthreads();
 // Process the data
-for(size_t i = idx.global; i < nelements; i += nglobal)
+for(int i = idx.global; i < nelements; i += nglobal)
 {
 auto bidx = broadcast_idx(i);
 auto b = buffer[bidx];
@@ -187,18 +187,18 @@ void nary_double_broadcast_vec_impl(
 launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
 MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
 // Load bias into LDS
-for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+for(int i = idx.local; i < bdim_vec_len; i += nlocal)
 {
 buffer[i] = binput1.data()[i];
 }
-for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+for(int i = idx.local; i < bdim_vec_len; i += nlocal)
 {
 buffer[i + bdim_vec_len] = binput2.data()[i];
 }
 __syncthreads();
 auto* bp = as_pointer(buffer);
 // Process the data
-for(size_t i = idx.global; i < nelements; i += nglobal)
+for(int i = idx.global; i < nelements; i += nglobal)
 {
 auto bidx = broadcast_idx(i * vec_size);
 auto b1 = bp[bidx];
@@ -242,17 +242,17 @@ void nary_double_broadcast_impl(
 launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
 MIGRAPHX_DEVICE_SHARED type buffer[2048];
 // Load bias into LDS
-for(size_t i = idx.local; i < bdim_len; i += nlocal)
+for(int i = idx.local; i < bdim_len; i += nlocal)
 {
 buffer[i] = binput1.data()[i];
 }
-for(size_t i = idx.local; i < bdim_len; i += nlocal)
+for(int i = idx.local; i < bdim_len; i += nlocal)
 {
 buffer[i + bdim_len] = binput2.data()[i];
 }
 __syncthreads();
 // Process the data
-for(size_t i = idx.global; i < nelements; i += nglobal)
+for(int i = idx.global; i < nelements; i += nglobal)
 {
 auto bidx = broadcast_idx(i);
 auto b1 = buffer[bidx];
......
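All four broadcast kernels above share one pattern: a local-id strided loop stages the broadcast operand into LDS, `__syncthreads()` publishes it, and a global-id strided loop streams the main tensor. The strided-loop idiom itself, reduced to plain C++ so the index arithmetic is visible (GPU machinery omitted; names are illustrative):

```cpp
#include <iostream>

// One thread's share of work under the strided-loop idiom used above:
// start at your own id, step by the total number of threads.
void strided_loop(int tid, int nthreads, int nelements)
{
    for(int i = tid; i < nelements; i += nthreads)
        std::cout << "thread " << tid << " handles element " << i << '\n';
}

int main()
{
    // 3 "threads" covering 8 elements: 0,3,6 / 1,4,7 / 2,5.
    for(int tid = 0; tid < 3; tid++)
        strided_loop(tid, 3, 8);
    return 0;
}
```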
@@ -37,7 +37,7 @@ struct id
 struct mean
 {
-size_t item_num = 1;
+int item_num = 1;
 template <class T>
 MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
 {
......
@@ -16,20 +16,20 @@ void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument
 auto out_lens = comp_shape.lens();
 auto dim_0 = out_lens.size() - 2;
 auto dim_1 = out_lens.size() - 1;
-std::size_t lda = comp_shape.strides()[dim_0];
-std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+int lda = comp_shape.strides()[dim_0];
+int m_size = out_lens[dim_0] * out_lens[dim_1];
 visit_all(result, arg)([&](auto output, auto input) {
-std::size_t nelements = comp_shape.elements();
+int nelements = comp_shape.elements();
 auto* out_ptr = device_cast(output.data());
 auto* in_ptr = device_cast(input.data());
 visit_tensor_size(out_lens.size(), [&](auto out_dim) {
 hip_tensor_descriptor<out_dim> desc(comp_shape);
 gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-const size_t nb = 4;
+const int nb = 4;
 auto idx = desc.multi(ii);
-std::size_t i_m = idx[dim_1];
-std::size_t i_k = idx[dim_0];
-std::size_t offset = ii / m_size * m_size;
+int i_m = idx[dim_1];
+int i_k = idx[dim_0];
+int offset = ii / m_size * m_size;
 out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
 in_ptr[i_m + i_k * lda + offset];
 });
@@ -43,24 +43,24 @@ void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument
 auto out_lens = trans_shape.lens();
 auto dim_0 = trans_shape.lens().size() - 2;
 auto dim_1 = trans_shape.lens().size() - 1;
-std::size_t ldb = trans_shape.strides()[dim_1];
+int ldb = trans_shape.strides()[dim_1];
 auto wrap_lens = out_lens;
 std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
 shape comp_shape{trans_shape.type(), wrap_lens};
-std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+int m_size = out_lens[dim_0] * out_lens[dim_1];
 visit_all(result, arg)([&](auto output, auto input) {
-std::size_t nelements = comp_shape.elements();
+int nelements = comp_shape.elements();
 auto* out_ptr = device_cast(output.data());
 auto* in_ptr = device_cast(input.data());
 visit_tensor_size(out_lens.size(), [&](auto out_dim) {
 hip_tensor_descriptor<out_dim> desc(comp_shape);
 gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-const size_t nb = 4;
+const int nb = 4;
 auto idx = desc.multi(ii);
-std::size_t i_n = idx[dim_1];
-std::size_t i_k = idx[dim_0];
-std::size_t offset = ii / m_size * m_size;
+int i_n = idx[dim_1];
+int i_k = idx[dim_0];
+int offset = ii / m_size * m_size;
 out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
 in_ptr[i_n + i_k * ldb + offset];
 });
......
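Both pack kernels regroup elements so that `nb = 4` consecutive k-values for one m (or n) index sit contiguously, presumably the int8x4 layout the GEMM backend consumes; the destination index `i_k % nb + (i_m + (i_k / nb) * lda) * nb` expresses that regrouping. A host-side sketch of the same packing for one 2D tile (illustrative):

```cpp
#include <cstdio>
#include <vector>

// Pack a k-major (k rows, lda columns) int8 tile into groups of nb=4
// consecutive k-values per column, mirroring the device index arithmetic.
std::vector<signed char> pack_a(const std::vector<signed char>& in, int k, int lda)
{
    const int nb = 4;
    std::vector<signed char> out(in.size());
    for(int i_k = 0; i_k < k; i_k++)
        for(int i_m = 0; i_m < lda; i_m++)
            out[i_k % nb + (i_m + (i_k / nb) * lda) * nb] = in[i_m + i_k * lda];
    return out;
}

int main()
{
    // 4x2 tile: each element value encodes (k, m) as 10*k + m.
    std::vector<signed char> a{0, 1, 10, 11, 20, 21, 30, 31};
    auto p = pack_a(a, 4, 2);
    for(auto v : p)
        std::printf("%d ", int(v)); // 0 10 20 30 1 11 21 31
    std::printf("\n");
    return 0;
}
```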
@@ -81,7 +81,7 @@ __device__ auto auto_block_reduce(index idx, Op op, T init, index_int n, F f)
 template <index_int MaxBlockSize, class Input, class Output>
 __device__ void layernorm(index_int i,
 index idx,
-std::size_t block_size_div,
+int block_size_div,
 index_int relements,
 Input input,
 Output output)
@@ -129,9 +129,9 @@ void layernorm_vec_impl(hipStream_t stream,
 {
 hip_vec_visit_all<N>(result, args...)([&](auto output, auto... inputs) {
 const auto relements_v = relements / N;
-const std::size_t max_block_size = 256;
-const std::size_t block_size = compute_block_size(relements_v, max_block_size);
-const std::size_t block_size_div = encode_divisor(block_size);
+const int max_block_size = 256;
+const int block_size = compute_block_size(relements_v, max_block_size);
+const int block_size_div = encode_divisor(block_size);
 assert(relements_v <= block_size);
 gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
@@ -158,9 +158,9 @@ void layernorm_impl(hipStream_t stream,
 const Arguments&... args)
 {
 hip_visit_all(result, args...)([&](auto output, auto... inputs) {
-const std::size_t max_block_size = 256;
-const std::size_t block_size = compute_block_size(relements, max_block_size);
-const std::size_t block_size_div = encode_divisor(block_size);
+const int max_block_size = 256;
+const int block_size = compute_block_size(relements, max_block_size);
+const int block_size_div = encode_divisor(block_size);
 assert(relements <= block_size);
 gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
......
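`compute_block_size` and `encode_divisor` are MIGraphX device helpers not shown in this diff. Judging from the `assert(relements <= block_size)` that follows, `compute_block_size` picks a block size at least as large as the reduction width, capped by `max_block_size`; a plausible sketch of that choice only (an assumption, not the actual implementation):

```cpp
#include <iostream>

// Assumed behavior: smallest power of two >= n, clamped to max_block_size.
int compute_block_size_sketch(int n, int max_block_size)
{
    int size = 64; // a typical wavefront-friendly minimum
    while(size < n and size < max_block_size)
        size *= 2;
    return size;
}

int main()
{
    std::cout << compute_block_size_sketch(100, 256) << '\n';  // 128
    std::cout << compute_block_size_sketch(1000, 256) << '\n'; // clamped to 256
    return 0;
}
```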
@@ -40,9 +40,9 @@ void multinomial(hipStream_t stream,
 const argument& arg0,
 const argument& arg1)
 {
-size_t batch_size = arg0.get_shape().lens().front();
-size_t class_size = arg0.get_shape().lens().back();
-size_t sample_size = result.get_shape().lens().back();
+int batch_size = arg0.get_shape().lens().front();
+int class_size = arg0.get_shape().lens().back();
+int sample_size = result.get_shape().lens().back();
 hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
 result.visit([&](auto out) {
......
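The kernel body is elided above, but the signature tells the story: `arg0` carries a per-batch running CDF over `class_size` entries and `arg1` carries uniform random samples, so each of the `sample_size` outputs comes from an inverse-CDF lookup. A host-side sketch of one such lookup (illustrative; the device kernel performs this per thread):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Pick a class index from a (non-normalized) running CDF and a uniform
// sample in [0, 1): the first CDF entry exceeding sample * total wins.
int sample_class(const std::vector<float>& cdf, float sample)
{
    float scaled = sample * cdf.back();
    auto it  = std::upper_bound(cdf.begin(), cdf.end(), scaled);
    auto idx = it - cdf.begin();
    // Clamp so a sample landing exactly on the total stays in range.
    return static_cast<int>(std::min<std::ptrdiff_t>(idx, cdf.size() - 1));
}

int main()
{
    std::vector<float> cdf{0.1f, 0.4f, 1.0f}; // class probabilities 0.1, 0.3, 0.6
    std::printf("%d %d %d\n",
                sample_class(cdf, 0.05f),  // 0
                sample_class(cdf, 0.25f),  // 1
                sample_class(cdf, 0.9f));  // 2
    return 0;
}
```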