gaoqiong / MIGraphX / Commits

Commit 94e3a2e4, authored Feb 12, 2022 by Shucai Xiao
Parent: 26bd92d8

    change size_t to int

Showing 20 changed files with 140 additions and 140 deletions (+140 −140). The commit touches 256 files in total; this is the first of 13 pages of diffs.
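The commit message gives no rationale, but the diff itself hints at one: pointwise.hpp's find_packed_len returns -1 as a "not found" sentinel, and its return type was previously std::size_t, where -1 silently wraps to SIZE_MAX. A minimal standalone sketch of that pitfall (illustration only, not MIGraphX code):

    #include <cstddef>
    #include <iostream>

    // A -1 sentinel wraps to SIZE_MAX through an unsigned return type,
    // so callers comparing against signed values misbehave.
    std::size_t find_unsigned() { return -1; }
    int find_signed() { return -1; }

    int main()
    {
        std::cout << find_unsigned() << '\n'; // 18446744073709551615 on 64-bit
        std::cout << find_signed() << '\n';   // -1
    }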
Files on this page:

    src/targets/cpu/include/migraphx/cpu/dnnl.hpp                       +5   -5
    src/targets/cpu/include/migraphx/cpu/parallel.hpp                   +14  -14
    src/targets/cpu/include/migraphx/cpu/pointwise.hpp                  +29  -29
    src/targets/cpu/lowering.cpp                                        +18  -18
    src/targets/cpu/pooling.cpp                                         +8   -8
    src/targets/gpu/analyze_streams.cpp                                 +8   -8
    src/targets/gpu/batch_norm_inference.cpp                            +2   -2
    src/targets/gpu/compile_hip.cpp                                     +7   -7
    src/targets/gpu/compile_hip_code_object.cpp                         +2   -2
    src/targets/gpu/compile_ops.cpp                                     +1   -1
    src/targets/gpu/concat.cpp                                          +1   -1
    src/targets/gpu/convolution.cpp                                     +4   -4
    src/targets/gpu/deconvolution.cpp                                   +3   -3
    src/targets/gpu/device/concat.cpp                                   +2   -2
    src/targets/gpu/device/gather.cpp                                   +1   -1
    src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp         +10  -10
    src/targets/gpu/device/include/migraphx/gpu/device/reduce_ops.hpp   +1   -1
    src/targets/gpu/device/int8_gemm_pack.cpp                           +14  -14
    src/targets/gpu/device/layernorm.cpp                                +7   -7
    src/targets/gpu/device/multinomial.cpp                              +3   -3
src/targets/cpu/include/migraphx/cpu/dnnl.hpp

@@ -38,7 +38,7 @@ dnnl_context& get_dnnl_context();
 dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t);
-dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n);
+dnnl::memory::format_tag to_dnnl_memory_format_tag(int n);
 template <class R>
 inline dnnl::memory::dims to_dnnl_dims(R&& r)

@@ -105,14 +105,14 @@ struct dnnl_op : auto_register_op<Derived>
         return {{"group", g}};
     }
-    std::size_t get_extra_post_op_args() const
+    int get_extra_post_op_args() const
     {
         return std::count_if(post_ops.begin(), post_ops.end(), [](const auto& po) {
             return contains(po.algo, "binary");
         });
     }
-    static std::size_t get_binary_post_op_arg(std::size_t pos)
+    static int get_binary_post_op_arg(int pos)
     {
         return MIGRAPHX_DNNL_PREFIX(ARG_ATTR_MULTIPLE_POST_OP)(pos) | // NOLINT
                MIGRAPHX_DNNL_PREFIX(ARG_SRC_1);                       // NOLINT

@@ -154,7 +154,7 @@ struct dnnl_op : auto_register_op<Derived>
                        strides.end(),
                        lens.begin(),
                        lens.begin(),
-                       [](auto stride, auto len) -> std::size_t {
+                       [](auto stride, auto len) -> int {
                            if(stride == 0)
                                return 1;
                            else

@@ -182,7 +182,7 @@ struct dnnl_op : auto_register_op<Derived>
         }
     }
     shape adjust_shape(const shape& s, int) const { return base_adjust_shape(s); }
-    std::vector<int> create_arg_map(std::size_t input_size) const
+    std::vector<int> create_arg_map(int input_size) const
    {
         const auto& self = static_cast<const Derived&>(*this);
         auto npost_ops   = get_extra_post_op_args();
src/targets/cpu/include/migraphx/cpu/parallel.hpp

@@ -16,14 +16,14 @@ namespace cpu {
 #ifdef MIGRAPHX_DISABLE_OMP
-inline std::size_t max_threads() { return std::thread::hardware_concurrency(); }
+inline int max_threads() { return std::thread::hardware_concurrency(); }
 template <class F>
-void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+void parallel_for_impl(int n, int threadsize, F f)
 {
     if(threadsize <= 1)
     {
-        f(std::size_t{0}, n);
+        f(int{0}, n);
     }
     else
     {

@@ -32,9 +32,9 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 #if(!defined(__GNUC__) || __GNUC__ != 5)
         const
 #endif
-            std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+            int grainsize = std::ceil(static_cast<double>(n) / threads.size());
-        std::size_t work = 0;
+        int work = 0;
         std::generate(threads.begin(), threads.end(), [=, &work] {
             auto result = joinable_thread([=]() mutable { f(work, std::min(n, work + grainsize)); });

@@ -47,36 +47,36 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 }
 #else
-inline std::size_t max_threads() { return omp_get_max_threads(); }
+inline int max_threads() { return omp_get_max_threads(); }
 template <class F>
-void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+void parallel_for_impl(int n, int threadsize, F f)
 {
     if(threadsize <= 1)
     {
-        f(std::size_t{0}, n);
+        f(int{0}, n);
     }
     else
     {
-        std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
+        int grainsize = std::ceil(static_cast<double>(n) / threadsize);
 #pragma omp parallel for num_threads(threadsize) schedule(static, 1) private(grainsize, n)
-        for(std::size_t tid = 0; tid < threadsize; tid++)
+        for(int tid = 0; tid < threadsize; tid++)
         {
-            std::size_t work = tid * grainsize;
+            int work = tid * grainsize;
             f(work, std::min(n, work + grainsize));
         }
     }
 }
 #endif
 template <class F>
-void parallel_for(std::size_t n, std::size_t min_grain, F f)
+void parallel_for(int n, int min_grain, F f)
 {
-    const auto threadsize = std::min<std::size_t>(max_threads(), n / min_grain);
+    const auto threadsize = std::min<int>(max_threads(), n / min_grain);
     parallel_for_impl(n, threadsize, f);
 }
 template <class F>
-void parallel_for(std::size_t n, F f)
+void parallel_for(int n, F f)
 {
     const int min_grain = 8;
     parallel_for(n, min_grain, f);
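For reference, the partitioning arithmetic in parallel_for_impl splits n items into ceil(n / threadsize)-sized chunks and clamps the last one with std::min. A standalone sketch of just that arithmetic (the thread launch is omitted):

    #include <algorithm>
    #include <cmath>
    #include <iostream>

    int main()
    {
        int n          = 10;
        int threadsize = 4;
        int grainsize  = std::ceil(static_cast<double>(n) / threadsize);
        for(int tid = 0; tid < threadsize; tid++)
        {
            int work = tid * grainsize;
            // Each worker gets the half-open range [work, min(n, work + grainsize))
            std::cout << "thread " << tid << ": [" << work << ", "
                      << std::min(n, work + grainsize) << ")\n";
        }
        // Prints [0, 3) [3, 6) [6, 9) [9, 10); the final chunk is clamped.
    }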
src/targets/cpu/include/migraphx/cpu/pointwise.hpp

@@ -16,26 +16,26 @@ struct multi_index
 {
     constexpr multi_index() = default;
-    multi_index(const shape& s, std::size_t i) : n(s.lens().size())
+    multi_index(const shape& s, int i) : n(s.lens().size())
     {
         assert(n < max_size);
         std::copy(s.lens().begin(), s.lens().end(), dims);
         s.multi_copy(i, index, index + max_size);
     }
-    constexpr std::size_t size() const { return n; }
+    constexpr int size() const { return n; }
-    constexpr std::size_t* begin() { return index; }
-    constexpr const std::size_t* begin() const { return index; }
+    constexpr int* begin() { return index; }
+    constexpr const int* begin() const { return index; }
-    constexpr std::size_t* end() { return index + size(); }
-    constexpr const std::size_t* end() const { return index + size(); }
+    constexpr int* end() { return index + size(); }
+    constexpr const int* end() const { return index + size(); }
-    std::size_t offset(const shape& s) const { return s.index(begin(), end()); }
+    int offset(const shape& s) const { return s.index(begin(), end()); }
     constexpr void carry()
     {
-        std::size_t overflow = 0;
+        int overflow = 0;
         for(std::ptrdiff_t i = size() - 1; i > 0; i--)
         {
             auto z = index[i] + overflow;

@@ -57,13 +57,13 @@ struct multi_index
         index[0] += overflow;
     }
-    constexpr void increment(std::size_t i)
+    constexpr void increment(int i)
     {
         index[size() - 1] += i;
         carry();
     }
-    constexpr multi_index& operator+=(std::size_t i)
+    constexpr multi_index& operator+=(int i)
     {
         increment(i);
         return *this;

@@ -82,10 +82,10 @@ struct multi_index
     }
     private:
-    static const std::size_t max_size = 5;
-    std::size_t index[max_size] = {};
-    std::size_t dims[max_size]  = {};
-    std::size_t n               = 0;
+    static const int max_size = 5;
+    int index[max_size] = {};
+    int dims[max_size]  = {};
+    int n               = 0;
 };
 struct reduce_dims_base

@@ -97,7 +97,7 @@ struct reduce_dims_base
         reduce_shapes = reduce_dims(inputs);
     }
-    argument get_arg(const std::vector<argument>& args, std::size_t i) const
+    argument get_arg(const std::vector<argument>& args, int i) const
     {
         if(reduce_shapes.empty())
             return args[i];

@@ -111,7 +111,7 @@ struct reduce_dims_base
     }
 };
-template <class T, std::size_t N>
+template <class T, int N>
 struct vec
 {
     using array_type = std::array<T, N>;

@@ -126,19 +126,19 @@ struct vec
 };
 template <class T>
-constexpr std::integral_constant<std::size_t, 0> vec_size(const T&)
+constexpr std::integral_constant<int, 0> vec_size(const T&)
 {
     return {};
 }
-template <class T, std::size_t N>
-constexpr std::integral_constant<std::size_t, N> vec_size(const vec<T, N>&)
+template <class T, int N>
+constexpr std::integral_constant<int, N> vec_size(const vec<T, N>&)
 {
     return {};
 }
 template <class T>
-constexpr std::size_t vec_size()
+constexpr int vec_size()
 {
     return decltype(vec_size(std::declval<T>())){};
 }

@@ -148,7 +148,7 @@ void vec_apply(F f, V& v, Vs... vs)
 {
     assert(all_of({vec_size<Vs>()...}, [&](auto n) { return n == vec_size<V>(); }));
     assert(vec_size<V>() == v.array.size());
-    for(std::size_t i = 0; i < vec_size<V>(); i++)
+    for(int i = 0; i < vec_size<V>(); i++)
         f(v.array[i], vs.vector[i]...);
 }

@@ -158,9 +158,9 @@ void vec_apply(F f, V& v, Vs&... vs)
     f(v, vs...);
 }
-inline std::size_t find_packed_len(const shape& s)
+inline int find_packed_len(const shape& s)
 {
-    for(std::size_t i = 0; i < s.lens().size(); i++)
+    for(int i = 0; i < s.lens().size(); i++)
     {
         if(s.lens()[i] > 1 and s.strides()[i] == 1)
         {

@@ -170,7 +170,7 @@ inline std::size_t find_packed_len(const shape& s)
     return -1;
 }
-template <std::size_t N>
+template <int N>
 shape vectorize(const shape& s)
 {
     assert(s.standard() or s.broadcasted());

@@ -188,7 +188,7 @@ shape vectorize(const shape& s)
     return {s.type(), lens};
 }
-template <std::size_t N, class T>
+template <int N, class T>
 tensor_view<vec<T, N>> vectorize(tensor_view<T> tv)
 {
     return {vectorize<N>(tv.get_shape()), reinterpret_cast<vec<T, N>*>(tv.data())};

@@ -209,7 +209,7 @@ struct is_vector_tensor_view : and_<is_vector_type<typename Ts::value_type>{}...
 {
 };
-template <std::size_t N, class... Xs>
+template <int N, class... Xs>
 bool is_vectorizable(const Xs&... xs)
 {
     return all_of({xs...}, [](const auto& s) {

@@ -223,7 +223,7 @@ bool is_vectorizable(const Xs&... xs)
         s.strides().begin(),
         0,
         std::plus<>{},
-        [&](auto len, auto stride) -> std::size_t {
+        [&](auto len, auto stride) -> int {
             if(stride > 0 and len == 1)
                 return 0;
             return stride;

@@ -272,7 +272,7 @@ bool is_standard_offset(const X& x, const Xs&... xs)
 template <class... Ts>
 auto pointwise_apply(Ts... ts)
 {
-    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+    return [=](context& ctx, const shape& base_shape, int min_grain, auto f) mutable {
         if(is_standard_offset(ts.get_shape()...))
         {
             ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {

@@ -300,7 +300,7 @@ auto pointwise_apply(Ts... ts)
 template <class... Ts>
 auto pointwise(Ts... ts)
 {
-    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+    return [=](context& ctx, const shape& base_shape, int min_grain, auto f) mutable {
         auto_vectorize(base_shape, ts...)([&](auto bs, auto... xs) {
             pointwise_apply(xs...)(ctx, bs, min_grain, f);
         });
     };
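The carry() loop above is cut off on this page after `auto z = index[i] + overflow;`. A plausible completion, shown purely as a sketch (the div/mod step is an assumption, not visible in this excerpt), demonstrates the odometer-style increment that multi_index implements:

    #include <iostream>

    int main()
    {
        int dims[3]  = {2, 3, 4};
        int index[3] = {0, 2, 3}; // one step before {1, 0, 0}
        index[2] += 1;            // increment(1) bumps the innermost digit
        int overflow = 0;
        for(int i = 2; i > 0; i--)
        {
            int z    = index[i] + overflow; // as in the diff
            overflow = z / dims[i];         // assumed carry-out
            index[i] = z % dims[i];         // assumed wrap
        }
        index[0] += overflow;
        std::cout << index[0] << index[1] << index[2] << '\n'; // prints 100
    }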
src/targets/cpu/lowering.cpp

@@ -77,35 +77,35 @@ struct cpu_im2col
         auto input_shape   = args[0].get_shape();
         auto weights_shape = args[1].get_shape();
         visit_all(result, args[0])([&](auto col, auto input) {
-            const std::size_t& height   = input_shape.lens()[2];
-            const std::size_t& width    = input_shape.lens()[3];
-            const std::size_t& channels = weights_shape.lens()[1];
-            const std::size_t& kernel_h = weights_shape.lens()[2];
-            const std::size_t& kernel_w = weights_shape.lens()[3];
-            const std::size_t& pad_h    = op.padding[0];
-            const std::size_t& pad_w    = op.padding[1];
-            const std::size_t& stride_h = op.stride[0];
-            const std::size_t& stride_w = op.stride[1];
+            const int& height   = input_shape.lens()[2];
+            const int& width    = input_shape.lens()[3];
+            const int& channels = weights_shape.lens()[1];
+            const int& kernel_h = weights_shape.lens()[2];
+            const int& kernel_w = weights_shape.lens()[3];
+            const int& pad_h    = op.padding[0];
+            const int& pad_w    = op.padding[1];
+            const int& stride_h = op.stride[0];
+            const int& stride_w = op.stride[1];
             long kdiv2_h = long(kernel_h) / 2;
             long kdiv2_w = long(kernel_w) / 2;
             // calculate output sizes
-            const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
-            const std::size_t col_width  = (width - kernel_w + 2 * pad_w) / stride_w + 1;
+            const int col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
+            const int col_width  = (width - kernel_w + 2 * pad_w) / stride_w + 1;
             // account for padding for the starting position of the input pixels
             long iinput = kdiv2_h - long(pad_h);
             // loop over output pixels (ioutput, joutput)
-            for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
+            for(int ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
             {
                 long jinput = kdiv2_w - long(pad_w);
-                for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
+                for(int joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
                 {
                     // compute linear index for output
-                    std::size_t ldx = ioutput * col_width + joutput;
-                    std::size_t p   = 0;
+                    int ldx = ioutput * col_width + joutput;
+                    int p   = 0;
                     dfor(channels, kernel_h,
-                         kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
+                         kernel_w)([&](int c, int koffset, int loffset) {
                         auto idx = iinput + long(koffset) - kdiv2_h;
                         auto jdx = jinput + long(loffset) - kdiv2_w;
                         col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))

@@ -177,7 +177,7 @@ struct cpu_pad
         visit_all(result, args[0])([&](auto output, auto input) {
             shape_for_each(input.get_shape(), [&](const auto& idx) {
-                std::vector<std::size_t> new_idx(idx.size());
+                std::vector<int> new_idx(idx.size());
                 std::transform(
                     idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) {
                         return i + j;

@@ -307,7 +307,7 @@ struct cpu_apply
                        outputs_alias.begin(),
                        [](const auto& i) { return instruction::get_output_alias(i); });
-        std::size_t index = 0;
+        int index = 0;
         for(auto ins : outputs_alias)
         {
             prog_output_names[ins] = modl->name() + ":#output_" + std::to_string(index++);
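A quick numeric check of the output-size formula in cpu_im2col, col = (in - kernel + 2 * pad) / stride + 1:

    #include <iostream>

    int main()
    {
        int height = 5, kernel_h = 3, pad_h = 1, stride_h = 2;
        int col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
        std::cout << col_height << '\n'; // (5 - 3 + 2) / 2 + 1 = 3 output rows
    }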
src/targets/cpu/pooling.cpp

@@ -26,7 +26,7 @@ struct max_pool
         return (m);
     }
-    static double final(double x, std::size_t) { return (x); }
+    static double final(double x, int) { return (x); }
 };
 struct avg_pool

@@ -41,7 +41,7 @@ struct avg_pool
     static double apply(double x, double y) { return x + y; }
-    static double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
+    static double final(double x, int y) { return (y == 0) ? 0.0 : (x / y); }
 };
 template <class Op>

@@ -77,14 +77,14 @@ struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
             using type = typename decltype(output)::value_type;
             auto in_s    = input.get_shape();
             auto in_lens = in_s.lens();
-            std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
+            std::vector<int> vec_len(in_lens.begin() + 2, in_lens.end());
             par_for(output_shape.elements(), [&](auto i) {
                 auto idx_o = output_shape.multi(i);
                 auto n_dim = idx_o.size();
-                std::vector<std::size_t> win_start;
-                std::vector<std::size_t> win_size;
-                for(std::size_t dim = 2; dim < n_dim; ++dim)
+                std::vector<int> win_start;
+                std::vector<int> win_size;
+                for(int dim = 2; dim < n_dim; ++dim)
                 {
                     auto d_2  = dim - 2;
                     int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -

@@ -131,8 +131,8 @@ struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::po
     {
         auto algo = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
         auto kdims = op.kdims();
-        std::vector<size_t> padding_l(op.padding.begin(), op.padding.begin() + kdims);
-        std::vector<size_t> padding_r(op.padding.begin() + kdims, op.padding.end());
+        std::vector<int> padding_l(op.padding.begin(), op.padding.begin() + kdims);
+        std::vector<int> padding_r(op.padding.begin() + kdims, op.padding.end());
         return {dnnl::prop_kind::forward_inference,
                 algo,
                 m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
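The max_pool/avg_pool pair follows an accumulate-then-finalize protocol: apply() folds window elements, final() converts the accumulator into the result, with avg_pool guarding against an empty window. A standalone sketch of the pattern (final renamed to final_avg here to keep the example self-contained):

    #include <iostream>

    double apply(double x, double y) { return x + y; }
    double final_avg(double x, int y) { return (y == 0) ? 0.0 : (x / y); }

    int main()
    {
        double window[] = {1.0, 2.0, 3.0, 4.0};
        double acc = 0.0;
        int count  = 0;
        for(double v : window)
        {
            acc = apply(acc, v); // fold every element in the pooling window
            count++;
        }
        std::cout << final_avg(acc, count) << '\n'; // 2.5; empty window -> 0.0
    }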
src/targets/gpu/analyze_streams.cpp

@@ -11,14 +11,14 @@ namespace gpu {
 struct hip_stream_model
 {
-    std::size_t max_stream = 0;
-    std::unordered_map<migraphx::instruction_ref, std::size_t> ins2stream{};
-    std::size_t get_nstream() const { return max_stream + 1; }
-    std::size_t get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); }
-    std::size_t get_event_id(migraphx::instruction_ref ins) const
+    int max_stream = 0;
+    std::unordered_map<migraphx::instruction_ref, int> ins2stream{};
+    int get_nstream() const { return max_stream + 1; }
+    int get_stream(migraphx::instruction_ref ins) const { return ins2stream.at(ins); }
+    int get_event_id(migraphx::instruction_ref ins) const
     {
         auto v = ins->get_operator().to_value();
-        return v["event"].to<std::size_t>();
+        return v["event"].to<int>();
     }
     bool has_stream(migraphx::instruction_ref ins) const { return ins2stream.count(ins) > 0; }
     bool is_record(migraphx::instruction_ref ins) const

@@ -31,13 +31,13 @@ struct hip_stream_model
 stream_model make_stream_model(const module& p)
 {
     hip_stream_model m;
-    std::size_t stream = 0;
+    int stream = 0;
     for(auto ins : iterator_for(p))
     {
         if(ins->name() == "gpu::set_stream")
         {
             auto v = ins->get_operator().to_value();
-            stream = v["stream"].to<std::size_t>();
+            stream = v["stream"].to<int>();
             m.max_stream = std::max(stream, m.max_stream);
         }
         if(ins->get_operator().is_context_free())
src/targets/gpu/batch_norm_inference.cpp

@@ -18,8 +18,8 @@ inline shape reshape_to_2d(const shape& input)
     if(dims.size() >= 4)
         return input;
-    std::vector<size_t> new_dims(dims.begin(), dims.end());
-    std::size_t num = 4 - dims.size();
+    std::vector<int> new_dims(dims.begin(), dims.end());
+    int num = 4 - dims.size();
     new_dims.insert(new_dims.end(), num, 1);
     return {input.type(), new_dims};
 }
src/targets/gpu/compile_hip.cpp

@@ -67,7 +67,7 @@ struct hiprtc_program
         string_array() {}
         string_array(const string_array&) = delete;
-        std::size_t size() const { return strings.size(); }
+        int size() const { return strings.size(); }
         const char** data() { return c_strs.data(); }

@@ -125,7 +125,7 @@ struct hiprtc_program
     std::string log()
     {
-        std::size_t n = 0;
+        int n = 0;
         MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
         if(n < 2)
             return {};

@@ -137,7 +137,7 @@ struct hiprtc_program
     std::vector<char> get_code_obj()
     {
-        std::size_t n = 0;
+        int n = 0;
         MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
         std::vector<char> buffer(n);
         MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data()));

@@ -231,17 +231,17 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     return {compiler.compile(srcs)};
 }
-std::string enum_params(std::size_t count, std::string param)
+std::string enum_params(int count, std::string param)
 {
     std::vector<std::string> items(count);
     transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
     return join_strings(items, ",");
 }
-std::size_t compute_global(std::size_t n, std::size_t local)
+int compute_global(int n, int local)
 {
-    std::size_t groups  = (n + local - 1) / local;
-    std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
+    int groups  = (n + local - 1) / local;
+    int nglobal = std::min<int>(256, groups) * local;
     return nglobal;
 }
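compute_global is self-contained; copied into a tiny driver, it rounds n up to whole workgroups of size local and caps the grid at 256 groups:

    #include <algorithm>
    #include <iostream>

    int compute_global(int n, int local)
    {
        int groups  = (n + local - 1) / local; // ceil(n / local)
        int nglobal = std::min<int>(256, groups) * local;
        return nglobal;
    }

    int main()
    {
        std::cout << compute_global(1000, 64) << '\n';    // 16 groups -> 1024
        std::cout << compute_global(1000000, 64) << '\n'; // capped at 256 -> 16384
    }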
src/targets/gpu/compile_hip_code_object.cpp

@@ -35,7 +35,7 @@ struct make_tensor<${n}>
 };
 )__migraphx__";
-std::string generate_make_tensor(std::size_t n, const shape& s)
+std::string generate_make_tensor(int n, const shape& s)
 {
     return interpolate_string(make_tensor_template,
                               {{"n", std::to_string(n)},

@@ -47,7 +47,7 @@ std::string generate_make_tensor(std::size_t n, const shape& s)
 std::string generate_args_hpp(const std::vector<shape>& inputs)
 {
     std::string inner;
-    for(std::size_t i = 0; i < inputs.size(); i++)
+    for(int i = 0; i < inputs.size(); i++)
     {
         inner += generate_make_tensor(i, inputs[i]);
     }
src/targets/gpu/compile_ops.cpp

@@ -73,7 +73,7 @@ struct compiled_result
 };
 template <class F>
-void par_compile(std::size_t n, F f)
+void par_compile(int n, F f)
 {
     if(n == 0)
         return;
src/targets/gpu/concat.cpp

@@ -16,7 +16,7 @@ argument hip_concat::compute(context& ctx,
                              const shape& output_shape,
                              const std::vector<argument>& args) const
 {
-    std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
+    std::vector<int> offsets = op.compute_offsets(output_shape, args);
     return device::concat(ctx.get_stream().get(), output_shape, args, offsets);
 }
src/targets/gpu/convolution.cpp

@@ -21,7 +21,7 @@ inline shape reshape_if_1d(const shape& input)
     if(dims.size() == 3)
     {
-        std::vector<size_t> new_dims = dims;
+        std::vector<int> new_dims = dims;
         new_dims.insert(new_dims.begin() + 2, 1);
         new_shape = shape{input.type(), new_dims};
     }

@@ -71,7 +71,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
                                                       cd.get(),
                                                       y_desc.get(),
                                                       &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
+    workspace_shape = shape{shape::int8_type, {static_cast<int>(workspace_size)}};
     auto x = to_gpu(generate_argument(inputs[0]));
     auto w = to_gpu(generate_argument(inputs[1]));

@@ -98,7 +98,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
         MIGRAPHX_THROW("MIOpen Convolution: find convolution failed");
     algo = perf.fwd_algo;
-    size_t solution_count;
+    std::size_t solution_count;
     status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
                                                       w_desc.get(),

@@ -124,7 +124,7 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
     solution_id = solutions.front().solution_id;
-    return shape{shape::int8_type, {perf.memory}};
+    return shape{shape::int8_type, {static_cast<int>(perf.memory)}};
 }
 void miopen_convolution::finalize(context& ctx,
src/targets/gpu/deconvolution.cpp

@@ -21,7 +21,7 @@ inline shape reshape_if_1d(const shape& input)
     if(dims.size() == 3)
     {
-        std::vector<size_t> new_dims = dims;
+        std::vector<int> new_dims = dims;
         new_dims.insert(new_dims.begin() + 2, 1);
         new_shape = shape{input.type(), new_dims};
     }

@@ -72,7 +72,7 @@ shape miopen_deconvolution::compile(context& ctx,
                                                       cd.get(),
                                                       y_desc.get(),
                                                       &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
+    workspace_shape = shape{shape::int8_type, {static_cast<int>(workspace_size)}};
     auto x = to_gpu(generate_argument(inputs[0]));
     auto w = to_gpu(generate_argument(inputs[1]));

@@ -99,7 +99,7 @@ shape miopen_deconvolution::compile(context& ctx,
         MIGRAPHX_THROW("Find deconvolution failed");
     handle = ctx.get_stream().get_miopen();
     algo   = perf.fwd_algo;
-    return shape{shape::int8_type, {perf.memory}};
+    return shape{shape::int8_type, {static_cast<int>(perf.memory)}};
 }
 void miopen_deconvolution::finalize(context& ctx,
src/targets/gpu/device/concat.cpp

@@ -11,10 +11,10 @@ namespace device {
 argument concat(hipStream_t stream,
                 const migraphx::shape&,
                 std::vector<migraphx::argument> args,
-                std::vector<std::size_t> offsets)
+                std::vector<int> offsets)
 {
     auto ninputs = args.size() - 1;
-    for(std::size_t j = 0; j < ninputs; j++)
+    for(int j = 0; j < ninputs; j++)
     {
         auto&& arg  = args[j];
         auto offset = offsets[j];
src/targets/gpu/device/gather.cpp

@@ -17,7 +17,7 @@ argument gather(hipStream_t stream, argument result, argument arg1, argument arg
     auto axis_dim_size = lens[axis];
     lens[axis]         = arg2.get_shape().elements();
     shape out_comp_shape{result.get_shape().type(), lens};
-    std::size_t nelements = result.get_shape().elements();
+    int nelements = result.get_shape().elements();
     visit_all(result, arg1)([&](auto output, auto input_v) {
         hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp

@@ -96,14 +96,14 @@ void nary_broadcast_vec_impl(
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+        for(int i = idx.local; i < bdim_vec_len; i += nlocal)
         {
             buffer[i] = binput.data()[i];
         }
         __syncthreads();
         auto* bp = as_pointer(buffer);
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i * vec_size);
             auto b    = bp[bidx];

@@ -141,13 +141,13 @@ void nary_broadcast_impl(hipStream_t stream, F f, argument result, argument barg
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_len; i += nlocal)
+        for(int i = idx.local; i < bdim_len; i += nlocal)
         {
             buffer[i] = binput.data()[i];
         }
         __syncthreads();
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i);
             auto b    = buffer[bidx];

@@ -187,18 +187,18 @@ void nary_double_broadcast_vec_impl(
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+        for(int i = idx.local; i < bdim_vec_len; i += nlocal)
         {
             buffer[i] = binput1.data()[i];
         }
-        for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
+        for(int i = idx.local; i < bdim_vec_len; i += nlocal)
         {
             buffer[i + bdim_vec_len] = binput2.data()[i];
         }
         __syncthreads();
         auto* bp = as_pointer(buffer);
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i * vec_size);
             auto b1   = bp[bidx];

@@ -242,17 +242,17 @@ void nary_double_broadcast_impl(
     launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPHX_DEVICE_SHARED type buffer[2048];
         // Load bias into LDS
-        for(size_t i = idx.local; i < bdim_len; i += nlocal)
+        for(int i = idx.local; i < bdim_len; i += nlocal)
         {
             buffer[i] = binput1.data()[i];
         }
-        for(size_t i = idx.local; i < bdim_len; i += nlocal)
+        for(int i = idx.local; i < bdim_len; i += nlocal)
         {
             buffer[i + bdim_len] = binput2.data()[i];
         }
         __syncthreads();
         // Process the data
-        for(size_t i = idx.global; i < nelements; i += nglobal)
+        for(int i = idx.global; i < nelements; i += nglobal)
         {
             auto bidx = broadcast_idx(i);
             auto b1   = buffer[bidx];
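Every kernel in nary.hpp walks its elements with the same grid-stride idiom: start at the global thread id and step by the grid size, so any nelements is covered regardless of launch size. A CPU-side sketch of the coverage pattern (no HIP required):

    #include <iostream>

    int main()
    {
        const int nglobal = 4, nelements = 10;
        for(int global_id = 0; global_id < nglobal; global_id++)
        {
            std::cout << "thread " << global_id << ":";
            // Same loop shape as `for(int i = idx.global; i < nelements; i += nglobal)`
            for(int i = global_id; i < nelements; i += nglobal)
                std::cout << ' ' << i;
            std::cout << '\n';
        }
    }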
src/targets/gpu/device/include/migraphx/gpu/device/reduce_ops.hpp

@@ -37,7 +37,7 @@ struct id
 struct mean
 {
-    size_t item_num = 1;
+    int item_num = 1;
     template <class T>
     MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
     {
src/targets/gpu/device/int8_gemm_pack.cpp

@@ -16,20 +16,20 @@ void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument
     auto out_lens = comp_shape.lens();
     auto dim_0    = out_lens.size() - 2;
     auto dim_1    = out_lens.size() - 1;
-    std::size_t lda    = comp_shape.strides()[dim_0];
-    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    int lda    = comp_shape.strides()[dim_0];
+    int m_size = out_lens[dim_0] * out_lens[dim_1];
     visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = comp_shape.elements();
+        int nelements = comp_shape.elements();
         auto* out_ptr = device_cast(output.data());
         auto* in_ptr  = device_cast(input.data());
         visit_tensor_size(out_lens.size(), [&](auto out_dim) {
             hip_tensor_descriptor<out_dim> desc(comp_shape);
             gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-                const size_t nb = 4;
+                const int nb = 4;
                 auto idx = desc.multi(ii);
-                std::size_t i_m    = idx[dim_1];
-                std::size_t i_k    = idx[dim_0];
-                std::size_t offset = ii / m_size * m_size;
+                int i_m    = idx[dim_1];
+                int i_k    = idx[dim_0];
+                int offset = ii / m_size * m_size;
                 out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
                     in_ptr[i_m + i_k * lda + offset];
             });

@@ -43,24 +43,24 @@ void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument
     auto out_lens = trans_shape.lens();
     auto dim_0    = trans_shape.lens().size() - 2;
     auto dim_1    = trans_shape.lens().size() - 1;
-    std::size_t ldb = trans_shape.strides()[dim_1];
+    int ldb = trans_shape.strides()[dim_1];
     auto wrap_lens = out_lens;
     std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
     shape comp_shape{trans_shape.type(), wrap_lens};
-    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    int m_size = out_lens[dim_0] * out_lens[dim_1];
     visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = comp_shape.elements();
+        int nelements = comp_shape.elements();
         auto* out_ptr = device_cast(output.data());
         auto* in_ptr  = device_cast(input.data());
         visit_tensor_size(out_lens.size(), [&](auto out_dim) {
             hip_tensor_descriptor<out_dim> desc(comp_shape);
             gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-                const size_t nb = 4;
+                const int nb = 4;
                 auto idx = desc.multi(ii);
-                std::size_t i_n    = idx[dim_1];
-                std::size_t i_k    = idx[dim_0];
-                std::size_t offset = ii / m_size * m_size;
+                int i_n    = idx[dim_1];
+                int i_k    = idx[dim_0];
+                int offset = ii / m_size * m_size;
                 out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
                     in_ptr[i_n + i_k * ldb + offset];
             });
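The packing expression in int8_gemm_pack_a groups every nb = 4 consecutive k-rows so they land contiguously per column. Tracing the index math for a tiny 4x2 tile (lda = 2 columns, illustration only):

    #include <iostream>

    int main()
    {
        const int nb = 4, lda = 2;
        for(int i_k = 0; i_k < 4; i_k++)
            for(int i_m = 0; i_m < lda; i_m++)
            {
                int src = i_m + i_k * lda;
                int dst = i_k % nb + (i_m + (i_k / nb) * lda) * nb;
                std::cout << "in[" << src << "] -> out[" << dst << "]\n";
            }
        // Column m = 0 packs to out[0..3], column m = 1 to out[4..7].
    }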
src/targets/gpu/device/layernorm.cpp

@@ -81,7 +81,7 @@ __device__ auto auto_block_reduce(index idx, Op op, T init, index_int n, F f)
 template <index_int MaxBlockSize, class Input, class Output>
 __device__ void layernorm(index_int i,
                           index idx,
-                          std::size_t block_size_div,
+                          int block_size_div,
                           index_int relements,
                           Input input,
                           Output output)

@@ -129,9 +129,9 @@ void layernorm_vec_impl(hipStream_t stream,
 {
     hip_vec_visit_all<N>(result, args...)([&](auto output, auto... inputs) {
         const auto relements_v = relements / N;
-        const std::size_t max_block_size = 256;
-        const std::size_t block_size     = compute_block_size(relements_v, max_block_size);
-        const std::size_t block_size_div = encode_divisor(block_size);
+        const int max_block_size = 256;
+        const int block_size     = compute_block_size(relements_v, max_block_size);
+        const int block_size_div = encode_divisor(block_size);
         assert(relements_v <= block_size);
         gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {

@@ -158,9 +158,9 @@ void layernorm_impl(hipStream_t stream,
                     const Arguments&... args)
 {
     hip_visit_all(result, args...)([&](auto output, auto... inputs) {
-        const std::size_t max_block_size = 256;
-        const std::size_t block_size     = compute_block_size(relements, max_block_size);
-        const std::size_t block_size_div = encode_divisor(block_size);
+        const int max_block_size = 256;
+        const int block_size     = compute_block_size(relements, max_block_size);
+        const int block_size_div = encode_divisor(block_size);
         assert(relements <= block_size);
         gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
src/targets/gpu/device/multinomial.cpp

@@ -40,9 +40,9 @@ void multinomial(hipStream_t stream,
                  const argument& arg0,
                  const argument& arg1)
 {
-    size_t batch_size  = arg0.get_shape().lens().front();
-    size_t class_size  = arg0.get_shape().lens().back();
-    size_t sample_size = result.get_shape().lens().back();
+    int batch_size  = arg0.get_shape().lens().front();
+    int class_size  = arg0.get_shape().lens().back();
+    int sample_size = result.get_shape().lens().back();
     hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
         result.visit([&](auto out) {