Merge branch 'unsqueeze-step' into fuse-horiz-contiguous2

c96b88b7 · Paul · 2a8e8d07 · c2c7f497 · c96b88b7 · c96b88b7
Commit c96b88b7 authored Jun 14, 2022 by Paul
20 changed files
--- a/cmake/Embed.cmake
+++ b/cmake/Embed.cmake
@@ -39,6 +39,8 @@ function(generate_embed_source EMBED_NAME)
    file(WRITE "${PARSE_HEADER}" "
 #include <unordered_map>
+#include <string>
+#include <utility>
 const std::unordered_map<std::string, std::pair<const char*,const char*>>& ${EMBED_NAME}();
 ")

--- a/src/include/migraphx/op/unsqueeze.hpp
+++ b/src/include/migraphx/op/unsqueeze.hpp
@@ -19,11 +19,12 @@ namespace op {
 struct unsqueeze
 {
    std::vector<int64_t> axes;
+    std::vector<int64_t> steps;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        return pack(f(self.axes, "axes"));
+        return pack(f(self.axes, "axes"), f(self.steps, "steps"));
    }
    value attributes() const
@@ -57,16 +58,27 @@ struct unsqueeze
        std::size_t p = 0;
        for(auto i : range(new_size))
        {
-            if(std::find(axes.begin(), axes.end(), i) != axes.end())
+            auto axis_idx = std::find(axes.begin(), axes.end(), i) - axes.begin();
+            if(axis_idx < axes.size())
            {
-                new_lens[i] = 1;
+                std::int64_t step = 1;
-                if(p == 0) // unsqueeze on the first axes
+                if(axis_idx < steps.size())
+                    step = steps[axis_idx];
+                if(step == 0)
+                    MIGRAPHX_THROW("UNSQUEEZE: step must be non-zero");
+                new_lens[i] = step;
+                if(p < old_strides.size())
                {
-                    new_strides[i] = old_lens[0] * old_strides[0];
+                    if((old_lens[p] % step) != 0)
+                        MIGRAPHX_THROW("UNSQUEEZE: Axis dimenstion is not divisible by step");
+                    old_lens[p] /= step;
+                    new_strides[i] = old_strides[p] * old_lens[p];
                }
-                else // unsqueeze on middle or last axes
+                else
                {
-                    new_strides[i] = (p < old_strides.size()) ? old_strides[p - 1] : 1;
+                    if(step != 1)
+                        MIGRAPHX_THROW("UNSQUEEZE: Step must be 1 for extra axes");
+                    new_strides[i] = 1;
                }
            }
            else

--- a/src/module.cpp
+++ b/src/module.cpp
@@ -22,6 +22,8 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_FINALIZE)
 struct module_impl
 {
    // A list is used to keep references to an instruction stable
@@ -553,8 +555,14 @@ instruction_ref module::find_dangling_reference() const
 void module::finalize(context& ctx)
 {
+    const bool trace = enabled(MIGRAPHX_TRACE_FINALIZE{});
    for(auto ins : iterator_for(*this))
    {
+        if(trace)
+        {
+            std::cout << "Finalize: ";
+            this->debug_print(ins);
+        }
        ins->finalize(ctx);
        for(const auto& smod : ins->module_inputs())
        {

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -131,6 +131,7 @@ add_library(migraphx_gpu
    clip.cpp
    code_object_op.cpp
    compile_ops.cpp
+    compile_gen.cpp
    compile_hip.cpp
    compile_hip_code_object.cpp
    compiler.cpp

--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/shape.hpp>
+#include <migraphx/permutation.hpp>
+#include <migraphx/stringutils.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace gen {
+// Candidate vector widths, widest first. When every input is half precision
+// only a width of 2 (half2) is offered.
+static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
+{
+    const bool has_non_half = std::any_of(inputs.begin(), inputs.end(), [](const auto& s) {
+        return s.type() != shape::half_type;
+    });
+    if(has_non_half)
+        return {4, 2};
+    return {2};
+}
+// Choose the vector width used along `axis` for a set of tensor shapes.
+// For each input a maximum usable width is computed and the smallest of
+// those is returned together with `axis`:
+//   - a stride along `axis` other than 0 or 1 gives a width of 1;
+//   - a length of 1 along `axis` gives the widest candidate, provided the
+//     tensor has more elements than that width;
+//   - otherwise the first candidate width that divides the length is used,
+//     or 1 when none does.
+// An empty `inputs` yields a width of 1 instead of dereferencing the end
+// iterator that std::min_element returns for an empty range.
+vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
+{
+    if(inputs.empty())
+        return {1, axis};
+    const auto sizes = vector_sizes(inputs);
+    std::vector<std::size_t> max_vec_size;
+    std::transform(inputs.begin(),
+                   inputs.end(),
+                   std::back_inserter(max_vec_size),
+                   [&](const auto& input) -> std::size_t {
+                       auto stride = input.strides()[axis];
+                       auto len    = input.lens()[axis];
+                       // Only broadcast (0) or unit (1) strides are vectorized
+                       if(stride != 0 and stride != 1)
+                           return 1;
+                       if(len == 1 and input.elements() > sizes.front())
+                           return sizes.front();
+                       auto it = std::find_if(
+                           sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
+                       if(it != sizes.end())
+                           return *it;
+                       return 1;
+                   });
+    // The width has to be usable by every input, so take the smallest
+    return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis};
+}
+// Render this transformer as the kernel-side expression `vectorize<size, axis>()`.
+std::string vectorize::str() const
+{
+    std::string text = "vectorize<";
+    text += to_string(size);
+    text += ", ";
+    text += to_string(axis);
+    text += ">()";
+    return text;
+}
+// Build one flag per input: true when the input's stride along `axis` is 0.
+// If the flagged inputs together occupy max_lds_bytes or more, every flag is
+// cleared.
+preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
+{
+    const std::size_t max_lds_bytes = 4096;
+    std::vector<bool> flags;
+    std::size_t flagged_bytes = 0;
+    for(const shape& input : inputs)
+    {
+        const bool is_broadcast = input.strides()[axis] == 0;
+        flags.push_back(is_broadcast);
+        if(is_broadcast)
+            flagged_bytes += input.bytes();
+    }
+    // TODO: Try to partially preload items
+    if(not(flagged_bytes < max_lds_bytes))
+        flags.assign(flags.size(), false);
+    return {flags};
+}
+// Render the flags as an `auto_preload<false, ...>(idx)` expression. A
+// literal `false` comes first, followed by every flag in `args` except the
+// last one.
+std::string preload::str() const
+{
+    // std::prev on the end iterator of an empty vector is undefined, so an
+    // empty `args` contributes no flags (the same text a single flag produces).
+    const auto last = args.empty() ? args.end() : std::prev(args.end());
+    std::vector<std::string> bool_strs;
+    std::transform(args.begin(), last, std::back_inserter(bool_strs), [](bool b) -> std::string {
+        return b ? "true" : "false";
+    });
+    return "auto_preload<false, " + join_strings(bool_strs, ", ") + ">(idx)";
+}
+// True when at least one flag in `args` is set; false for an empty `args`.
+bool preload::is_preloading() const
+{
+    return std::any_of(args.begin(), args.end(), [](bool flag) { return flag; });
+}
+// Position of the largest entry in the permutation that find_permutation()
+// computes for `inputs`.
+std::size_t find_fast_axis(const std::vector<shape>& inputs)
+{
+    const auto perm    = find_permutation(inputs);
+    const auto largest = std::max_element(perm.begin(), perm.end());
+    return std::distance(perm.begin(), largest);
+}
+// Join the already-rendered transformer expressions with ", ".
+std::string make_transformer_args(std::vector<std::string> transformers)
+{
+    auto joined = join_strings(std::move(transformers), ", ");
+    return joined;
+}
+} // namespace gen
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -119,8 +119,21 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
    };
 }
+// Starting from 128, keep doubling the candidate while it is no larger than
+// both `n` and `max_block_size`; return half of the first candidate that
+// fails the check. The result is therefore never below 64.
+std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
+{
+    std::size_t candidate = 128;
+    for(;;)
+    {
+        if(candidate > max_block_size or candidate > n)
+            break;
+        candidate *= 2;
+    }
+    return candidate / 2;
+}
 operation compile_hip_code_object(const std::string& content, hip_compile_options options)
 {
+    assert(options.global > 0);
+    assert(options.local > 0);
+    assert(not options.inputs.empty());
+    assert(options.inputs.size() == options.virtual_inputs.size() or
+           options.virtual_inputs.empty());
    std::vector<src_file> srcs;
    std::transform(migraphx_kernels().begin(),
                   migraphx_kernels().end(),

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -965,7 +965,7 @@ struct find_gemm_pointwise
        inputs.pop_back();
        inputs.push_back(c_ins);
-        inputs.push_back(gemm_ins->inputs().back());
+        inputs.push_back(ins->inputs().back());
        gemm.beta = 1;
        m.replace_instruction(ins, gemm, inputs);

--- a/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
@@ -34,6 +34,10 @@ struct code_object_op
                    f(self.output, "output"));
    }
+    value attributes() const { return {{"group", group()}}; }
+    std::string group() const { return "gpu::code_object::" + symbol_name; }
    std::string name() const { return "gpu::code_object"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument

--- a/src/targets/gpu/include/migraphx/gpu/compile_gen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_gen.hpp
+#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
+#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
+#include <migraphx/config.hpp>
+#include <string>
+#include <unordered_map>
+#include <vector>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct shape;
+namespace gpu {
+namespace gen {
+// Describes a vectorization transformer to be emitted into generated kernel
+// source as a `vectorize<size, axis>()` expression.
+struct vectorize
+{
+    // Vector width; defaults to 1
+    std::size_t size = 1;
+    // Axis the width applies to; defaults to 0
+    std::size_t axis = 0;
+    // Pick a width along `axis` that every shape in `inputs` can use
+    static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
+    // Render as the expression text `vectorize<size, axis>()`
+    std::string str() const;
+};
+// Describes a preload transformer to be emitted into generated kernel source
+// as an `auto_preload<...>(idx)` expression.
+struct preload
+{
+    // One flag per input shape handed to broadcasts()
+    std::vector<bool> args = {};
+    // Flag the inputs whose stride along `axis` is 0
+    static preload broadcasts(std::size_t axis, const std::vector<shape>& inputs);
+    // True when at least one flag is set
+    bool is_preloading() const;
+    // Render the flags as the expression text `auto_preload<false, ...>(idx)`
+    std::string str() const;
+};
+std::size_t find_fast_axis(const std::vector<shape>& inputs);
+std::string make_transformer_args(std::vector<std::string> transformers);
+// Convenience overload: calls str() on each transformer and joins the results
+// through the std::vector<std::string> overload. Transformers are taken by
+// const reference so that none of them is copied just to be rendered.
+template <class... Ts>
+std::string make_transformer_args(const Ts&... xs)
+{
+    return make_transformer_args({xs.str()...});
+}
+} // namespace gen
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -46,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
 operation compile_hip_code_object(const std::string& content, hip_compile_options options);
+std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -2,6 +2,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
 #include <migraphx/shape.hpp>
+#include <migraphx/reflect.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/gpu/miopen.hpp>
@@ -14,6 +15,7 @@ struct context;
 struct miopen_quant_convolution
 {
    op::quant_convolution op;
+    bool int8_x4_format = false;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
    miopenHandle_t handle = nullptr;
@@ -22,7 +24,8 @@ struct miopen_quant_convolution
    static auto reflect(Self& self, F f)
    {
        // TODO: Add algo
-        return op::quant_convolution::reflect(self.op, f);
+        return pack_join(migraphx::reflect(self.op, f),
+                         pack(f(self.int8_x4_format, "int8_x4_format")));
    }
    std::string name() const { return "gpu::quant_convolution"; }

--- a/src/targets/gpu/jit/pointwise.cpp
+++ b/src/targets/gpu/jit/pointwise.cpp
@@ -2,6 +2,7 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
 #include <migraphx/cpp_generator.hpp>
 #include <migraphx/ranges.hpp>
@@ -17,6 +18,8 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
+using namespace migraphx::gpu::gen; // NOLINT
 static const char* const pointwise_kernel = R"__migraphx__(
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/pointwise.hpp>
@@ -30,7 +33,7 @@ extern "C" {
 __global__ void ${kernel}(${params}) 
 {
    auto idx = make_index();
-    pointwise(idx, auto_preload<${preloads}>(idx), vectorize<${vec_size}, ${axis}>())(${lambda}, ${args});
+    pointwise(idx, ${transformers})(${lambda}, ${args});
 }
 }
@@ -62,75 +65,6 @@ struct pointwise_compiler : compiler<pointwise_compiler>
        else
            return 1;
    }
-    static std::size_t find_fast_axis(const std::vector<shape>& inputs)
-    {
-        auto permutation = find_permutation(inputs);
-        auto it          = std::max_element(permutation.begin(), permutation.end());
-        return it - permutation.begin();
-    }
-    static std::vector<bool> preload(std::size_t axis, const std::vector<shape>& inputs)
-    {
-        const std::size_t max_lds_bytes = 4096;
-        std::vector<bool> result;
-        std::transform(inputs.begin(),
-                       inputs.end(),
-                       std::back_inserter(result),
-                       [&](const shape& input) { return input.strides()[axis] == 0; });
-        auto bytes = std::inner_product(inputs.begin(),
-                                        inputs.end(),
-                                        result.begin(),
-                                        std::size_t{0},
-                                        std::plus<>{},
-                                        [](const shape& s, bool b) -> std::size_t {
-                                            if(b)
-                                                return s.bytes();
-                                            return 0;
-                                        });
-        if(bytes < max_lds_bytes)
-            return result;
-        // TODO: Try to partially preload items
-        std::fill(result.begin(), result.end(), false);
-        return result;
-    }
-    static std::string preload_str(const std::vector<bool>& bs)
-    {
-        std::vector<std::string> bool_strs;
-        std::transform(bs.begin(), std::prev(bs.end()), std::back_inserter(bool_strs), [](bool b) {
-            if(b)
-                return "true";
-            return "false";
-        });
-        return "false, " + join_strings(bool_strs, ", ");
-    }
-    static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
-    {
-        // If all inputs is half then only use half2
-        if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
-               return s.type() == shape::half_type;
-           }))
-            return {2};
-        return {4, 2};
-    }
-    static auto vectorize_elements(std::size_t axis, const std::vector<shape>& inputs)
-    {
-        auto sizes = vector_sizes(inputs);
-        std::vector<std::size_t> max_vec_size;
-        std::transform(inputs.begin(),
-                       inputs.end(),
-                       std::back_inserter(max_vec_size),
-                       [&](const auto& input) -> std::size_t {
-                           auto stride = input.strides()[axis];
-                           auto len    = input.lens()[axis];
-                           if(stride != 0 and stride != 1)
-                               return 1;
-                           auto it = std::find_if(
-                               sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
-                           if(it != sizes.end())
-                               return *it;
-                           return 1;
-                       });
-        return *std::min_element(max_vec_size.begin(), max_vec_size.end());
-    }
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        hip_compile_options options;
@@ -139,23 +73,20 @@ struct pointwise_compiler : compiler<pointwise_compiler>
        options.virtual_inputs = reduce_dims(inputs);
        options.params         = "-Wno-float-equal";
        auto axis              = find_fast_axis(options.virtual_inputs);
-        auto vec_size          = vectorize_elements(axis, options.virtual_inputs);
+        auto vec               = vectorize::elements(axis, options.virtual_inputs);
-        auto preloads          = preload(axis, options.virtual_inputs);
+        auto preloads          = preload::broadcasts(axis, options.virtual_inputs);
-        auto is_preloading =
+        options.kernel_name    = v.get("kernel", "kernel");
-            std::accumulate(preloads.begin(), preloads.end(), false, std::logical_or<>{});
+        options.set_launch_params(
-        options.kernel_name = v.get("kernel", "kernel");
+            v,
-        options.set_launch_params(v,
+            compute_global_for(ctx,
-                                  compute_global_for(ctx,
+                               options.output.elements() / vec.size,
-                                                     options.output.elements() / vec_size,
+                               oversubscribe_if(not preloads.is_preloading())));
-                                                     oversubscribe_if(not is_preloading)));
        auto src = interpolate_string(pointwise_kernel,
                                      {{"kernel", options.kernel_name},
                                       {"params", enum_params(inputs.size(), "void * private_p")},
                                       {"args", enum_params(inputs.size(), "private_p")},
                                       {"lambda", v.at("lambda").to<std::string>()},
-                                       {"vec_size", std::to_string(vec_size)},
+                                       {"transformers", make_transformer_args(preloads, vec)},
-                                       {"axis", std::to_string(axis)},
-                                       {"preloads", preload_str(preloads)},
                                       {"preamble", v.get("preamble", std::string{})}});
        return compile_hip_code_object(src, options);
    }

--- a/src/targets/gpu/jit/reduce.cpp
+++ b/src/targets/gpu/jit/reduce.cpp
@@ -2,6 +2,7 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
 #include <migraphx/cpp_generator.hpp>
 #include <migraphx/ranges.hpp>
@@ -16,9 +17,12 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
+using namespace migraphx::gpu::gen; // NOLINT
 static const char* const simple_reduce_kernel = R"__migraphx__(
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/reduce.hpp>
+#include <migraphx/kernels/vectorize.hpp>
 #include <args.hpp>
 namespace migraphx {
@@ -26,9 +30,10 @@ namespace migraphx {
 ${preamble}
 extern "C" {
-__global__ void kernel(void* input_p, void* output_p) 
+__global__ void reduce_kernel(void* input_p, void* output_p) 
 {
-    make_tensors()(input_p, output_p)([](auto input, auto output) {
+    transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
        simple_reduce<reduce::${algo}>(${reduction}, ${init}, input, output, ${read}, ${write});
    });
@@ -40,14 +45,6 @@ __global__ void kernel(void* input_p, void* output_p)
 )__migraphx__";
-constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024)
-{
-    size_t block_size = 128;
-    while(block_size <= max_block_size and block_size <= n)
-        block_size *= 2;
-    return block_size / 2;
-}
 static std::size_t get_reduce_elements(const std::vector<shape>& inputs)
 {
    return inputs.front().elements() / inputs.back().elements();
@@ -101,32 +98,42 @@ struct reduce_compiler : compiler<reduce_compiler>
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        hip_compile_options options;
-        auto reduce_elements = get_reduce_elements(inputs);
+        options.inputs         = inputs;
-        auto algo            = v.get("algo", get_reduce_algo(inputs));
+        options.output         = inputs.back();
+        options.virtual_inputs = reduce_dims(inputs);
+        auto faxis             = find_fast_axis({options.virtual_inputs.front()});
+        vectorize vec{};
+        // Vectorize if the axis is a reduction axis
+        if(options.virtual_inputs.back().lens()[faxis] == 1)
+        {
+            vec = vectorize::elements(faxis, options.virtual_inputs);
+        }
+        auto relements = get_reduce_elements(options.virtual_inputs) / vec.size;
+        auto nelements = options.virtual_inputs.back().elements();
+        auto algo      = v.get("algo", get_reduce_algo(options.virtual_inputs));
        if(algo == "block")
        {
-            auto block_size = compute_block_size(reduce_elements, 256);
+            auto block_size = compute_block_size(relements, 256);
            options.set_launch_params(
-                v, compute_global_for(ctx, inputs.back().elements() * block_size, 256), block_size);
+                v, compute_global_for(ctx, nelements * block_size, 256), block_size);
        }
        else if(algo == "lane")
        {
-            options.set_launch_params(v, compute_global_for(ctx, inputs.back().elements(), 256));
+            options.set_launch_params(v, compute_global_for(ctx, nelements, 256));
        }
        else
        {
            MIGRAPHX_THROW("Unknown reduce algo: " + algo);
        }
-        options.inputs         = inputs;
+        options.kernel_name  = "reduce_kernel";
-        options.output         = inputs.back();
+        std::string identity = "[](auto x) { return x; }";
-        options.virtual_inputs = reduce_dims(inputs);
+        auto src             = interpolate_string(simple_reduce_kernel,
-        std::string identity   = "[](auto x) { return x; }";
-        auto src               = interpolate_string(simple_reduce_kernel,
                                      {{"reduction", v.at("reduction").to<std::string>()},
                                       {"init", v.get("init", std::string{"0"})},
                                       {"read", v.get("read", identity)},
                                       {"write", v.get("write", identity)},
                                       {"algo", algo},
+                                       {"transformers", make_transformer_args(vec)},
                                       {"preamble", v.get("preamble", std::string{})}});
        options.params += "-Wno-float-equal";
        return compile_hip_code_object(src, options);

--- a/src/targets/gpu/kernel.cpp
+++ b/src/targets/gpu/kernel.cpp
@@ -59,6 +59,8 @@ void launch_kernel(hipFunction_t fun,
                   void* kernargs,
                   std::size_t size)
 {
+    assert(global > 0);
+    assert(local > 0);
    void* config[] = {
 // HIP_LAUNCH_PARAM_* are macros that do horrible things
 #ifdef MIGRAPHX_USE_CLANG_TIDY

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -163,9 +163,12 @@ struct block
        __device__ auto reduce(Op op, T init, Read read) const
        {
            return sliced(slicer, [=](auto x, auto... xs) {
-                return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
+                return vec_reduce(block_reduce(idx,
-                    return read(x[j], xs[j]...);
+                                               op,
-                });
+                                               init,
+                                               x.get_shape().elements(),
+                                               [&](auto j) { return read(x[j], xs[j]...); }),
+                                  op);
            });
        }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
@@ -146,5 +146,19 @@ constexpr auto vec_packed_transform(Ts... xs)
    };
 }
+// Fold the elements of `x` with `op`, left to right, starting from x[0].
+// A value whose vec_size is below 2 is returned unchanged.
+template <class T, class Op>
+constexpr auto vec_reduce(T x, Op op)
+{
+    if constexpr(vec_size<T>() < 2)
+    {
+        return x;
+    }
+    else
+    {
+        vec_type<T> folded = x[0];
+        for(int k = 1; k < vec_size<T>(); ++k)
+        {
+            folded = op(folded, x[k]);
+        }
+        return folded;
+    }
+}
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
@@ -213,7 +213,9 @@ template <index_int N, index_int Axis, class T>
 __device__ __host__ auto vectorize_tensor(T x)
 {
    constexpr auto shape = get_shape_c<T>{};
-    if constexpr(shape.strides[Axis] == 0)
+    if constexpr(shape.lens[Axis] == 1)
+        return x;
+    else if constexpr(shape.strides[Axis] == 0)
        return tensor_step<N>(x, _c<Axis>);
    else
        return as_vec<N>(x, _c<Axis>);

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -365,8 +365,22 @@ struct miopen_apply
    {
        apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
            auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
-            auto conv = miopen_quant_convolution{op, make_conv(op)};
+            shape ws;
-            auto ws   = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
+            miopen_quant_convolution conv;
+            auto compile_quant_conv_with_format = [&](bool format) {
+                conv = miopen_quant_convolution{op, format, make_conv(op)};
+                ws   = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
+            };
+            try
+            {
+                compile_quant_conv_with_format(int8_x4_format);
+            }
+            catch(migraphx::exception&)
+            {
+                // In case no solver supports the default format, retry using the other format.
+                compile_quant_conv_with_format(!int8_x4_format);
+            }
            auto args      = ins->inputs();
            auto workspace = insert_allocation(ins, ws, "workspace");

--- a/src/targets/gpu/pack_int8_args.cpp
+++ b/src/targets/gpu/pack_int8_args.cpp
@@ -118,7 +118,7 @@ void pack_int8_args::apply(module& m) const
            assert(val.contains("int8_x4_format"));
            if(not val.at("int8_x4_format").to<bool>())
            {
-                return;
+                continue;
            }
            auto inputs = ins->inputs();
            auto lens   = inputs.at(0)->get_shape().lens();
@@ -156,6 +156,12 @@ void pack_int8_args::apply(module& m) const
        }
        else if(ins->name() == "gpu::quant_convolution")
        {
+            auto val = ins->get_operator().to_value();
+            if(not val.at("int8_x4_format").to<bool>())
+            {
+                continue;
+            }
            auto inputs   = ins->inputs();
            auto packed_x = m.insert_instruction(
                ins,

--- a/src/targets/gpu/quant_convolution.cpp
+++ b/src/targets/gpu/quant_convolution.cpp
@@ -16,8 +16,8 @@ argument miopen_quant_convolution::compute(context& ctx,
                                           const shape& output_shape,
                                           const std::vector<argument>& args) const
 {
-    auto x_desc = make_tensor(args[0].get_shape(), true);
+    auto x_desc = make_tensor(args[0].get_shape(), int8_x4_format);
-    auto w_desc = make_tensor(args[1].get_shape(), true);
+    auto w_desc = make_tensor(args[1].get_shape(), int8_x4_format);
    auto y_desc = make_tensor(output_shape);
    float alpha = 1;
@@ -49,8 +49,8 @@ shape miopen_quant_convolution::compile(context& ctx,
                                        std::vector<shape> inputs)
 {
    shape workspace_shape{};
-    auto x_desc = make_tensor(inputs[0], true);
+    auto x_desc = make_tensor(inputs[0], int8_x4_format);
-    auto w_desc = make_tensor(inputs[1], true);
+    auto w_desc = make_tensor(inputs[1], int8_x4_format);
    auto y_desc = make_tensor(output_shape);
    std::size_t workspace_size = 0;
@@ -62,8 +62,15 @@ shape miopen_quant_convolution::compile(context& ctx,
                                             &workspace_size);
    workspace_shape = shape{shape::int8_type, {workspace_size}};
-    auto arg_vec4_x = to_gpu(generate_argument(pack_int8_shape(inputs[0])));
+    auto x_shape = inputs[0];
-    auto arg_vec4_w = to_gpu(generate_argument(pack_int8_shape(inputs[1])));
+    auto w_shape = inputs[1];
+    if(int8_x4_format)
+    {
+        x_shape = pack_int8_shape(x_shape);
+        w_shape = pack_int8_shape(w_shape);
+    }
+    auto arg_vec4_x = to_gpu(generate_argument(x_shape));
+    auto arg_vec4_w = to_gpu(generate_argument(w_shape));
    auto y          = allocate_gpu(output_shape);
    auto workspace  = allocate_gpu(workspace_shape);