Merge branch 'develop' into scheduler

213930d8 · Paul · 20d11e64 · 3499ec7d · 213930d8 · 213930d8
Commit 213930d8 authored Mar 07, 2019 by Paul
20 changed files
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -37,6 +37,8 @@ target_include_directories(migraphx SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLU
 set(PACKAGE_DEPENDS)
 add_subdirectory(onnx)
+add_subdirectory(tf)
 add_subdirectory(py)
 add_subdirectory(targets/cpu)
 if(MIGRAPHX_ENABLE_GPU)

--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -826,10 +826,22 @@ struct dot
        const shape& b = inputs.at(1);
        auto t         = a.type();
-        if(a.lens()[1] != b.lens()[0])
+        // following numpy.matmul() semantics, inputs with more than two
+        // dimensions are accepted as long as all leading (batch)
+        // dimensions are identical in the two inputs
+        if(!std::equal(a.lens().rbegin() + 2, a.lens().rend(), b.lens().rbegin() + 2))
+        {
+            MIGRAPHX_THROW("DOT: dim values mismatch");
+        }
+        std::size_t dim_0 = a.lens().size() - 2;
+        std::size_t dim_1 = a.lens().size() - 1;
+        if(a.lens()[dim_1] != b.lens()[dim_0])
            MIGRAPHX_THROW("Inner dimensions do not match: {" + to_string_range(a.lens()) +
                           "} x {" + to_string_range(b.lens()) + "}");
-        return {t, {a.lens()[0], b.lens()[1]}};
+        auto out_lens   = a.lens();
+        out_lens[dim_1] = b.lens()[dim_1];
+        return {t, out_lens};
    }
 };
@@ -938,6 +950,22 @@ struct softmax
    }
 };
+/// Log-softmax operator. The output shape is identical to the shape of
+/// the single input; only the axis attribute is validated here.
+struct logsoftmax
+{
+    /// Axis attribute of the operator; valid values satisfy
+    /// 0 <= axis < rank(input).
+    int axis = 1;
+    std::string name() const { return "logsoftmax"; }
+    /// Checks that exactly one input is given and that `axis` refers to an
+    /// existing dimension of it, then returns the input shape unchanged.
+    /// Throws (via MIGRAPHX_THROW) when `axis` is out of range.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs}.has(1);
+        // A valid axis must index an existing dimension, so it has to be
+        // strictly smaller than the rank. The cast is safe because negative
+        // values are rejected first, and it avoids a signed/unsigned compare.
+        const std::size_t rank = inputs[0].lens().size();
+        if(axis < 0 || static_cast<std::size_t>(axis) >= rank)
+        {
+            MIGRAPHX_THROW("LogSoftMax: input axis value " + std::to_string(axis) +
+                           " is out of range");
+        }
+        return inputs.at(0);
+    }
+};
 struct flatten
 {
    uint64_t axis = 0;

--- a/src/include/migraphx/tf.hpp
+++ b/src/include/migraphx/tf.hpp
+#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_TF_HPP
+#define MIGRAPHX_GUARD_MIGRAPHLIB_TF_HPP
+#include <migraphx/program.hpp>
+#include <migraphx/config.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct unknown
+{
+    std::string op;
+    std::string name() const { return "unknown:" + op; }
+    shape compute_shape(std::vector<shape> input) const
+    {
+        if(input.empty())
+            return {};
+        else
+            return input.front();
+    }
+    friend std::ostream& operator<<(std::ostream& os, const unknown& x)
+    {
+        os << x.name();
+        return os;
+    }
+};
+/// Create a program from a TensorFlow (tf) file
+program parse_tf(const std::string& name, bool is_nhwc);
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/onnx/onnx.cpp
+++ b/src/onnx/onnx.cpp
@@ -79,6 +79,7 @@ struct onnx_parser
        add_mem_op("Gemm", &onnx_parser::parse_gemm);
        add_mem_op("BatchNormalization", &onnx_parser::parse_batchnorm);
        add_mem_op("Softmax", &onnx_parser::parse_softmax);
+        add_mem_op("LogSoftmax", &onnx_parser::parse_logsoftmax);
        add_mem_op("Squeeze", &onnx_parser::parse_squeeze);
        add_mem_op("Unsqueeze", &onnx_parser::parse_unsqueeze);
        add_mem_op("Slice", &onnx_parser::parse_slice);
@@ -228,6 +229,19 @@ struct onnx_parser
        return prog.add_instruction(op::reshape{{long(dims[0]), long(dims[1])}}, s);
    }
+    instruction_ref parse_logsoftmax(const std::string&,
+                                     const attribute_map& attributes,
+                                     std::vector<instruction_ref> args)
+    {
+        int axis = 1;
+        if(contains(attributes, "axis"))
+        {
+            axis = parse_value(attributes.at("axis")).at<int>();
+        }
+        return prog.add_instruction(op::logsoftmax{axis}, std::move(args));
+    }
    instruction_ref
    parse_conv(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
    {
@@ -472,7 +486,11 @@ struct onnx_parser
            transb = parse_value(attributes.at("transB")).at<bool>();
        }
-        std::vector<int64_t> perm = {1, 0};
+        std::vector<int64_t> perm(args[0]->get_shape().lens().size());
+        std::iota(perm.begin(), perm.end(), int64_t{0});
+        // swap the last two elements
+        std::swap(*perm.rbegin(), *(perm.rbegin() + 1));
        auto l1 = (transa) ? prog.add_instruction(op::transpose{perm}, args[0]) : args[0];
        auto l2 = (transb) ? prog.add_instruction(op::transpose{perm}, args[1]) : args[1];
        if(args.size() == 3)
@@ -493,9 +511,7 @@ struct onnx_parser
            }
        }
-        auto dot_res = prog.add_instruction(op::dot{alpha, beta}, l1, l2);
+        return prog.add_instruction(op::dot{alpha, beta}, l1, l2);
-        return dot_res;
    }
    instruction_ref
@@ -1149,9 +1165,9 @@ struct onnx_parser
                instructions[name] = prog.add_parameter(name, s);
            }
        }
-        for(auto&& p : nodes)
+        for(auto&& output : graph.output())
        {
-            this->parse_node(p.first);
+            this->parse_node(output.name());
        }
    }

--- a/src/rewrite_rnn.cpp
+++ b/src/rewrite_rnn.cpp
@@ -987,15 +987,12 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
        auto spph  = prog.insert_instruction(ins, op::squeeze{{0}}, pph);
        auto pphi  = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, spph);
        pphi_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_shape}, pphi);
-        pphi_brcst = prog.insert_instruction(ins, op::contiguous{}, pphi_brcst);
        auto ppho  = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, spph);
        ppho_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_shape}, ppho);
-        ppho_brcst = prog.insert_instruction(ins, op::contiguous{}, ppho_brcst);
        auto pphf  = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, spph);
        pphf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_shape}, pphf);
-        pphf_brcst = prog.insert_instruction(ins, op::contiguous{}, pphf_brcst);
    }
    for(long i = 0; i < seq_len; ++i)

--- a/src/targets/cpu/gemm.cpp
+++ b/src/targets/cpu/gemm.cpp
 #include <migraphx/cpu/gemm.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/requires.hpp>
+#include <migraphx/shape_for_each.hpp>
 #include <blaze/math/CustomMatrix.h>
 namespace migraphx {
@@ -14,10 +15,13 @@ template <class T>
 static auto make_mat(tensor_view<T> x)
 {
    const auto& s = x.get_shape();
-    assert(s.lens().size() == 2);
+    // assert(s.lens().size() == 2);
+    std::size_t n_dims = s.lens().size();
+    std::size_t dim_0  = n_dims - 2;
+    std::size_t dim_1  = n_dims - 1;
    if(s.transposed())
-        return matrix<T>{x.data(), s.lens()[1], s.lens()[0], s.strides()[1]};
+        return matrix<T>{x.data(), s.lens()[dim_1], s.lens()[dim_0], s.strides()[dim_1]};
-    return matrix<T>{x.data(), s.lens()[0], s.lens()[1], s.strides()[0]};
+    return matrix<T>{x.data(), s.lens()[dim_0], s.lens()[dim_1], s.strides()[dim_0]};
 }
 template <class T, class F>
@@ -64,18 +68,24 @@ void migemm_impl(tensor_view<T> cmat,
                 float beta,
                 std::false_type)
 {
-    auto m = cmat.get_shape().lens()[0];
+    std::size_t n_dims = cmat.get_shape().lens().size();
-    auto n = cmat.get_shape().lens()[1];
+    std::size_t dim_0  = n_dims - 2;
-    auto k = amat.get_shape().lens()[1];
+    std::size_t dim_1  = n_dims - 1;
+    auto k             = amat.get_shape().lens()[dim_1];
-    assert(amat.get_shape().lens()[1] == bmat.get_shape().lens()[0]);
+    assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
-    assert(m == amat.get_shape().lens()[0]);
+    assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
-    assert(n == bmat.get_shape().lens()[1]);
+    assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);
-    dfor(m, n)([&](auto ii, auto jj) {
+    shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
-        double s = cmat(ii, jj) * beta;
+        auto a_idx = c_idx;
-        dfor(k)([&](auto kk) { s += amat(ii, kk) * bmat(kk, jj); });
+        auto b_idx = c_idx;
-        cmat(ii, jj) = alpha * s;
+        double s   = 0.0;
+        dfor(k)([&](auto kk) {
+            a_idx[dim_1] = b_idx[dim_0] = kk;
+            s += amat(a_idx.begin(), a_idx.end()) * bmat(b_idx.begin(), b_idx.end());
+        });
+        cmat(c_idx.begin(), c_idx.end()) = alpha * s + cmat(c_idx.begin(), c_idx.end()) * beta;
    });
 }
@@ -83,7 +93,18 @@ template <class T>
 void migemm_impl(
    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
 {
-    migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
+    auto lens = amat.get_shape().lens();
+    bool batch_mul =
+        std::accumulate(lens.begin(), lens.end(), std::size_t{1}, std::multiplies<std::size_t>()) ==
+        (*lens.rbegin()) * (*(lens.rbegin() + 1));
+    if(batch_mul)
+    {
+        migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
+    }
+    else
+    {
+        migemm_impl(cmat, amat, bmat, alpha, beta, std::false_type{});
+    }
 }
 void migemm(

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -613,6 +613,75 @@ struct softmax2d
    }
 };
+struct cpu_logsoftmax
+{
+    op::logsoftmax op;
+    std::string name() const { return "cpu::logsoftmax"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    template <typename T>
+    std::size_t compute_batch_index(const T& idx, shape& batch_shape, int axis) const
+    {
+        if(axis == 0)
+        {
+            return 0;
+        }
+        else
+        {
+            std::vector<std::size_t> batch_idx(idx.begin(), idx.begin() + axis);
+            return batch_shape.index(batch_idx.begin(), batch_idx.end());
+        }
+    }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        auto lens = output_shape.lens();
+        std::vector<std::size_t> batch_lens{};
+        if(op.axis == 0)
+        {
+            batch_lens.push_back(1);
+        }
+        else
+        {
+            batch_lens.insert(batch_lens.begin(), lens.begin(), lens.begin() + op.axis);
+        }
+        shape batch_shape{migraphx::shape::uint32_type, batch_lens};
+        visit_all(result, args[0])([&](auto output, auto input) {
+            using value_type = typename decltype(input)::value_type;
+            std::vector<value_type> batch_max(batch_shape.elements(),
+                                              std::numeric_limits<value_type>::lowest());
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
+                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
+            });
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
+            });
+            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
+            });
+            for(std::size_t i = 0; i < batch_sum.size(); ++i)
+            {
+                batch_sum[i] = std::log(batch_sum[i]);
+            }
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                output(idx.begin(), idx.end()) -= batch_sum[index];
+            });
+        });
+        return result;
+    }
+};
 struct add_op
 {
    std::string name() const { return "add"; }
@@ -723,6 +792,7 @@ struct cpu_apply
        apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
        apply_map["concat"]     = extend_op<cpu_concat, op::concat>();
        apply_map["gather"]     = extend_op<cpu_gather, op::gather>();
+        apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
        apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
        apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
        apply_map["identity"]   = simple_op<cpu_unary<identity_op>>();

--- a/src/targets/cpu/target.cpp
+++ b/src/targets/cpu/target.cpp
@@ -14,8 +14,9 @@ std::string target::name() const { return "cpu"; }
 std::vector<pass> target::get_passes(migraphx::context&) const
 {
-    return {auto_contiguous{},
+    return {rewrite_rnn{},
-            rewrite_rnn{},
+            dead_code_elimination{},
+            auto_contiguous{},
            dead_code_elimination{},
            lowering{},
            dead_code_elimination{}};

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -26,6 +26,7 @@ add_library(migraphx_device
    device/atan.cpp
    device/add_relu.cpp
    device/contiguous.cpp
+    device/logsoftmax.cpp
    device/mul.cpp
    device/concat.cpp
    device/pad.cpp
@@ -48,6 +49,7 @@ add_library(migraphx_gpu
    pooling.cpp
    convolution.cpp
    softmax.cpp
+    logsoftmax.cpp
    contiguous.cpp
    concat.cpp
    relu.cpp

--- a/src/targets/gpu/device/logsoftmax.cpp
+++ b/src/targets/gpu/device/logsoftmax.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/hip.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+argument logsoftmax(hipStream_t stream,
+                    const migraphx::shape& output_shape,
+                    std::vector<migraphx::argument> args,
+                    int axis)
+{
+    auto lens              = output_shape.lens();
+    std::size_t batch_size = std::accumulate(
+        lens.begin(), lens.begin() + axis, std::size_t{1}, std::multiplies<std::size_t>());
+    std::size_t n_dims = std::accumulate(
+        lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<std::size_t>());
+    migraphx::shape comp_shape{output_shape.type(), {batch_size, n_dims}};
+    visit_all(args.back(), args.front())([&](auto output, auto input) {
+        const auto* input_ptr = device_cast(input.data());
+        auto* output_ptr      = device_cast(output.data());
+        // each thread is for one item in the batch
+        gs_launch(stream, batch_size)([=](auto i) {
+            std::size_t row_start = i * n_dims;
+            // get max
+            auto batch_max = input_ptr[row_start];
+            for(std::size_t j = 1; j < n_dims; ++j)
+            {
+                auto ind  = row_start + j;
+                batch_max = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[ind]));
+            }
+            for(std::size_t j = 0; j < n_dims; ++j)
+            {
+                auto ind        = row_start + j;
+                output_ptr[ind] = input_ptr[ind] - batch_max;
+            }
+            auto batch_sum = ::exp(to_hip_type(output_ptr[row_start]));
+            for(std::size_t j = 1; j < n_dims; ++j)
+            {
+                auto ind = row_start + j;
+                batch_sum += ::exp(to_hip_type(output_ptr[ind]));
+            }
+            batch_sum = ::log(to_hip_type(batch_sum));
+            for(std::size_t j = 0; j < n_dims; ++j)
+            {
+                auto ind = row_start + j;
+                output_ptr[ind] -= batch_sum;
+            }
+        });
+    });
+    return args.back();
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/gemm.cpp
+++ b/src/targets/gpu/gemm.cpp
@@ -5,6 +5,30 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
+template <class... Ts>
+void generic_rocblas_batched_gemm(shape::as<float>, Ts&&... xs)
+{
+    rocblas_sgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+template <class... Ts>
+void generic_rocblas_batched_gemm(shape::as<double>, Ts&&... xs)
+{
+    rocblas_dgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+template <class... Ts>
+void generic_rocblas_batched_gemm(shape::as<half>, Ts&&... xs)
+{
+    rocblas_hgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+template <class T, class... Ts>
+void generic_rocblas_batched_gemm(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_BATCHED_GEMM: type unsupported by rocblas");
+}
 template <class... Ts>
 void generic_rocblas_gemm(shape::as<float>, Ts&&... xs)
 {
@@ -26,7 +50,7 @@ void generic_rocblas_gemm(shape::as<half>, Ts&&... xs)
 template <class T, class... Ts>
 void generic_rocblas_gemm(shape::as<T>, Ts&&...)
 {
-    MIGRAPHX_THROW("Type unsupported by rocblas");
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_GEMM: type unsupported by rocblas");
 }
 template <class T>
@@ -73,35 +97,45 @@ argument miopen_gemm::compute(context& ctx,
                              const shape& output_shape,
                              const std::vector<argument>& args) const
 {
-    float alpha     = 1.0f;
+    float alpha        = 1.0f;
-    float beta      = 0.0f;
+    float beta         = 0.0f;
-    bool transa     = args[0].get_shape().transposed();
+    bool transa        = args[0].get_shape().transposed();
-    bool transb     = args[1].get_shape().transposed();
+    bool transb        = args[1].get_shape().transposed();
-    rocblas_int lda = args[0].get_shape().strides()[transa ? 1 : 0];
+    std::size_t n_dims = args[0].get_shape().lens().size();
-    rocblas_int ldb = args[1].get_shape().strides()[transb ? 1 : 0];
+    std::size_t dim_0  = n_dims - 2;
-    rocblas_int ldc = args[2].get_shape().strides()[0];
+    std::size_t dim_1  = n_dims - 1;
-    rocblas_int m   = output_shape.lens()[0];
+    rocblas_int lda    = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
-    rocblas_int n   = output_shape.lens()[1];
+    rocblas_int ldb    = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
-    rocblas_int k   = args[0].get_shape().lens()[1];
+    rocblas_int ldc    = args[2].get_shape().strides()[dim_0];
+    auto out_lens      = output_shape.lens();
+    rocblas_int m      = out_lens[dim_0];
+    rocblas_int n      = out_lens[dim_1];
+    rocblas_int k      = args[0].get_shape().lens()[dim_1];
+    auto batch_num     = std::accumulate(
+        out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
    output_shape.visit_type([&](auto as) {
        auto alpha_r    = to_rocblas_type(as(alpha));
        auto beta_r     = to_rocblas_type(as(beta));
        auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
-        generic_rocblas_gemm(as,
+        generic_rocblas_batched_gemm(as,
-                             ctx.get_stream().get_rocblas(),
+                                     ctx.get_stream().get_rocblas(),
-                             transb ? rocblas_operation_transpose : rocblas_operation_none,
+                                     transb ? rocblas_operation_transpose : rocblas_operation_none,
-                             transa ? rocblas_operation_transpose : rocblas_operation_none,
+                                     transa ? rocblas_operation_transpose : rocblas_operation_none,
-                             n,
+                                     n,
-                             m,
+                                     m,
-                             k,
+                                     k,
-                             &alpha_r,
+                                     &alpha_r,
-                             to_pointer(args[1]),
+                                     to_pointer(args[1]),
-                             ldb,
+                                     ldb,
-                             to_pointer(args[0]),
+                                     k * n,
-                             lda,
+                                     to_pointer(args[0]),
-                             &beta_r,
+                                     lda,
-                             to_pointer(args[2]),
+                                     m * k,
-                             ldc);
+                                     &beta_r,
+                                     to_pointer(args[2]),
+                                     ldc,
+                                     m * n,
+                                     batch_num);
    });

--- a/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_LOGSOFTMAX_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_LOGSOFTMAX_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+argument logsoftmax(hipStream_t stream,
+                    const migraphx::shape& output_shape,
+                    std::vector<migraphx::argument> args,
+                    int axis);
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/logsoftmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/logsoftmax.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP
+#define MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP
+#include <migraphx/gpu/lowering.hpp>
+#include <migraphx/manage_ptr.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/hip.hpp>
+#include <migraphx/dfor.hpp>
+#include <migraphx/gpu/device/contiguous.hpp>
+#include <migraphx/gpu/device/add.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/gpu/rocblas.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct hip_logsoftmax
+{
+    op::logsoftmax op;
+    std::string name() const { return "gpu::logsoftmax"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/logsoftmax.cpp
+++ b/src/targets/gpu/logsoftmax.cpp
+#include <migraphx/gpu/logsoftmax.hpp>
+#include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/manage_ptr.hpp>
+#include <migraphx/gpu/miopen.hpp>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+shape hip_logsoftmax::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{inputs, *this}.has(2).standard();
+    return op.compute_shape({inputs.at(0)});
+}
+argument hip_logsoftmax::compute(context& ctx,
+                                 const shape& output_shape,
+                                 const std::vector<argument>& args) const
+{
+    return device::logsoftmax(ctx.get_stream().get(), output_shape, args, op.axis);
+}
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -21,6 +21,7 @@
 #include <migraphx/gpu/leaky_relu.hpp>
 #include <migraphx/gpu/elu.hpp>
 #include <migraphx/gpu/softmax.hpp>
+#include <migraphx/gpu/logsoftmax.hpp>
 #include <migraphx/gpu/add.hpp>
 #include <migraphx/gpu/sub.hpp>
 #include <migraphx/gpu/exp.hpp>
@@ -97,6 +98,7 @@ struct miopen_apply
        add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
        add_extend_op<hip_concat, op::concat>("concat");
        add_extend_op<miopen_softmax, op::softmax>("softmax");
+        add_extend_op<hip_logsoftmax, op::logsoftmax>("logsoftmax");
        add_extend_op<hip_gather, op::gather>("gather");
        add_extend_op<hip_pad, op::pad>("pad");

--- a/src/tf/CMakeLists.txt
+++ b/src/tf/CMakeLists.txt
+find_package(Protobuf REQUIRED)
+protobuf_generate_cpp(
+    PROTO_SRCS PROTO_HDRS 
+    graph.proto
+    node_def.proto
+    attr_value.proto
+    tensor.proto
+    tensor_shape.proto
+    resource_handle.proto
+    types.proto
+    function.proto
+    op_def.proto
+    versions.proto
+)
+add_library(tf-proto STATIC ${PROTO_SRCS})
+target_include_directories(tf-proto SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${PROTOBUF_INCLUDE_DIR})
+target_compile_options(tf-proto PRIVATE -w)
+target_link_libraries(tf-proto PRIVATE ${PROTOBUF_LIBRARY})
+set_target_properties(tf-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
+add_library(migraphx_tf tf.cpp)
+set_target_properties(migraphx_tf PROPERTIES EXPORT_NAME tf)
+rocm_clang_tidy_check(migraphx_tf)
+target_link_libraries(migraphx_tf PRIVATE tf-proto)
+target_link_libraries(migraphx_tf PUBLIC migraphx)
+rocm_install_targets(
+  TARGETS migraphx_tf
+)
+add_executable(read_tf read_tf.cpp)
+rocm_clang_tidy_check(read_tf)
+target_link_libraries(read_tf migraphx_tf)
+if(MIGRAPHX_ENABLE_GPU)
+add_executable(verify_tf verify_tf.cpp)
+rocm_clang_tidy_check(verify_tf)
+target_link_libraries(verify_tf migraphx_tf migraphx_cpu migraphx_gpu)
+add_executable(perf_tf perf_tf.cpp)
+rocm_clang_tidy_check(perf_tf)
+target_link_libraries(perf_tf migraphx_tf migraphx_cpu migraphx_gpu)
+endif()
--- a/src/tf/attr_value.proto
+++ b/src/tf/attr_value.proto
+syntax = "proto3";
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "AttrValueProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
+import "tensor.proto";
+import "tensor_shape.proto";
+import "types.proto";
+// Protocol buffer representing the value for an attr used to configure an Op.
+// Comment indicates the corresponding attr type.  Only the field matching the
+// attr type may be filled.
+message AttrValue {
+  // LINT.IfChange
+  message ListValue {
+    repeated bytes s = 2;                        // "list(string)"
+    repeated int64 i = 3 [packed = true];        // "list(int)"
+    repeated float f = 4 [packed = true];        // "list(float)"
+    repeated bool b = 5 [packed = true];         // "list(bool)"
+    repeated DataType type = 6 [packed = true];  // "list(type)"
+    repeated TensorShapeProto shape = 7;         // "list(shape)"
+    repeated TensorProto tensor = 8;             // "list(tensor)"
+    repeated NameAttrList func = 9;              // "list(attr)"
+  }
+  // LINT.ThenChange(https://www.tensorflow.org/code/tensorflow/c/c_api.cc)
+  oneof value {
+    bytes s = 2;                 // "string"
+    int64 i = 3;                 // "int"
+    float f = 4;                 // "float"
+    bool b = 5;                  // "bool"
+    DataType type = 6;           // "type"
+    TensorShapeProto shape = 7;  // "shape"
+    TensorProto tensor = 8;      // "tensor"
+    ListValue list = 1;          // any "list(...)"
+    // "func" represents a function. func.name is a function's name or
+    // a primitive op's name. func.attr.first is the name of an attr
+    // defined for that function. func.attr.second is the value for
+    // that attr in the instantiation.
+    NameAttrList func = 10;
+    // This is a placeholder only used in nodes defined inside a
+    // function.  It indicates the attr value will be supplied when
+    // the function is instantiated.  For example, let us suppose a
+    // node "N" in function "FN". "N" has an attr "A" with value
+    // placeholder = "foo". When FN is instantiated with attr "foo"
+    // set to "bar", the instantiated node N's attr A will have been
+    // given the value "bar".
+    string placeholder = 9;
+  }
+}
+// A list of attr names and their values. The whole list is attached
+// with a string name.  E.g., MatMul[T=float].
+message NameAttrList {
+  string name = 1;
+  map<string, AttrValue> attr = 2;
+}
--- a/src/tf/function.proto
+++ b/src/tf/function.proto
+syntax = "proto3";
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "FunctionProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
+import "attr_value.proto";
+import "node_def.proto";
+import "op_def.proto";
+// A library is a set of named functions.
+message FunctionDefLibrary {
+  repeated FunctionDef function = 1;
+  repeated GradientDef gradient = 2;
+}
+// A function can be instantiated when the runtime can bind every attr
+// with a value. When a GraphDef has a call to a function, it must
+// have binding for every attr defined in the signature.
+//
+// TODO(zhifengc):
+//   * device spec, etc.
+message FunctionDef {
+  // The definition of the function's name, arguments, return values,
+  // attrs etc.
+  OpDef signature = 1;
+  // Attributes specific to this function definition.
+  map<string, AttrValue> attr = 5;
+  // NOTE: field id 2 deleted on Jan 11, 2017, GraphDef version 21.
+  reserved 2;
+  // In both of the following fields, there is the need to specify an
+  // output that is used as either the input to another node (in
+  // `node_def`) or as a return value of the function (in `ret`).
+  // Unlike the NodeDefs in GraphDef, we need to be able to specify a
+  // list in some cases (instead of just single outputs).  Also, we
+  // need to be able to deal with lists of unknown length (so the
+  // output index may not be known at function definition time).  So
+  // we use the following format instead:
+  // * "fun_in" where "fun_in" is the name of a function input arg in
+  //   the `signature` field above.  This represents that input, whether
+  //   it is a single tensor or a list.
+  // * "fun_in:0" gives the first element of a function input arg (a
+  //   non-list input is considered a list of length 1 for these
+  //   purposes).
+  // * "node:out" where "node" is the name of a node in `node_def` and
+  //   "out" is the name of one of its op's output arguments (the name
+  //   comes from the OpDef of the node's op). This represents that
+  //   node's output, whether it is a single tensor or a list.
+  //   Note: We enforce that an op's output arguments are never
+  //   renamed in the backwards-compatibility test.
+  // * "node:out:0" gives the first element of a node output arg (a
+  //   non-list output is considered a list of length 1 for these
+  //   purposes).
+  //
+  // NOT CURRENTLY SUPPORTED (but may be in the future):
+  // * "node:out:-1" gives last element in a node output list
+  // * "node:out:1:" gives a list with all but the first element in a
+  //   node output list
+  // * "node:out::-1" gives a list with all but the last element in a
+  //   node output list
+  // The body of the function.  Unlike the NodeDefs in a GraphDef, attrs
+  // may have values of type `placeholder` and the `input` field uses
+  // the "output" format above.
+  // By convention, "op" in node_def is resolved by consulting a
+  // user-defined library first. If not resolved, "op" is assumed to
+  // be a builtin op.
+  repeated NodeDef node_def = 3;
+  // A mapping from the output arg names from `signature` to the
+  // outputs from `node_def` that should be returned by the function.
+  map<string, string> ret = 4;
+}
+// GradientDef defines the gradient function of a function defined in
+// a function library.
+//
+// A gradient function g (specified by gradient_func) for a function f
+// (specified by function_name) must satisfy the following:
+//
+// The function 'f' must be a numerical function which takes N inputs
+// and produces M outputs. Its gradient function 'g' is a function
+// taking N + M inputs and producing N outputs.
+//
+// I.e. if we have
+//    (y1, y2, ..., y_M) = f(x1, x2, ..., x_N),
+// then, g is
+//    (dL/dx1, dL/dx2, ..., dL/dx_N) = g(x1, x2, ..., x_N,
+//                                      dL/dy1, dL/dy2, ..., dL/dy_M),
+// where L is a scalar-valued function of (x1, x2, ..., x_N) (e.g., the
+// loss function). dL/dx_i is the partial derivative of L with respect
+// to x_i.
+message GradientDef {
+  string function_name = 1;  // The name of the function f.
+  string gradient_func = 2;  // The name of f's gradient function g.
+}
--- a/src/tf/graph.proto
+++ b/src/tf/graph.proto
+syntax = "proto3";
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "GraphProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
+import "node_def.proto";
+import "function.proto";
+import "versions.proto";
+// Represents the graph of operations
+message GraphDef {
+  // The nodes that make up the graph.
+  repeated NodeDef node = 1;
+  // Compatibility versions of the graph.  See core/public/version.h for version
+  // history.  The GraphDef version is distinct from the TensorFlow version, and
+  // each release of TensorFlow will support a range of GraphDef versions.
+  VersionDef versions = 4;
+  // Deprecated single version field; use versions above instead.  Since all
+  // GraphDef changes before "versions" was introduced were forward
+  // compatible, this field is entirely ignored.
+  int32 version = 3 [deprecated = true];
+  // EXPERIMENTAL. DO NOT USE OR DEPEND ON THIS YET.
+  //
+  // "library" provides user-defined functions.
+  //
+  // Naming:
+  //   * library.function.name are in a flat namespace.
+  //     NOTE: We may need to change it to be hierarchical to support
+  //     different orgs. E.g.,
+  //     { "/google/nn", { ... }},
+  //     { "/google/vision", { ... }}
+  //     { "/org_foo/module_bar", { ... }}
+  //     map<string, FunctionDefLib> named_lib;
+  //   * If node[i].op is the name of one function in "library",
+  //     node[i] is deemed as a function call. Otherwise, node[i].op
+  //     must be a primitive operation supported by the runtime.
+  //
+  //
+  // Function call semantics:
+  //
+  //   * The callee may start execution as soon as some of its inputs
+  //     are ready. The caller may want to use Tuple() mechanism to
+  //     ensure all inputs are ready at the same time.
+  //
+  //   * The consumer of return values may start executing as soon as
+  //     the return values the consumer depends on are ready.  The
+  //     consumer may want to use Tuple() mechanism to ensure the
+  //     consumer does not start until all return values of the callee
+  //     function are ready.
+  FunctionDefLibrary library = 2;
+};
--- a/src/tf/node_def.proto
+++ b/src/tf/node_def.proto
+syntax = "proto3";
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "NodeProto";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
+import "attr_value.proto";
+// A single node of a graph: a named operation together with its inputs,
+// requested device placement and attributes.
+message NodeDef {
+  // The name given to this operator. Used for naming inputs,
+  // logging, visualization, etc.  Unique within a single GraphDef.
+  // Must match the regexp "[A-Za-z0-9.][A-Za-z0-9_./]*".
+  string name = 1;
+  // The operation name.  There may be custom parameters in attrs.
+  // Op names starting with an underscore are reserved for internal use.
+  string op = 2;
+  // Each input is "node:src_output" with "node" being a string name and
+  // "src_output" indicating which output tensor to use from "node". If
+  // "src_output" is 0 the ":0" suffix can be omitted.  Regular inputs
+  // may optionally be followed by control inputs that have the format
+  // "^node".
+  repeated string input = 3;
+  // A (possibly partial) specification for the device on which this
+  // node should be placed.
+  // The expected syntax for this string is as follows:
+  //
+  // DEVICE_SPEC ::= PARTIAL_SPEC
+  //
+  // PARTIAL_SPEC ::= ("/" CONSTRAINT) *
+  // CONSTRAINT ::= ("job:" JOB_NAME)
+  //              | ("replica:" [1-9][0-9]*)
+  //              | ("task:" [1-9][0-9]*)
+  //              | ("device:" [A-Za-z]* ":" ([1-9][0-9]* | "*") )
+  //
+  // NOTE(review): the grammar above does not admit an index of 0, yet
+  // the first example below uses "replica:0" -- confirm which of the
+  // two reflects what the runtime actually accepts.
+  //
+  // Valid values for this string include:
+  // * "/job:worker/replica:0/task:1/device:GPU:3"  (full specification)
+  // * "/job:worker/device:GPU:3"                   (partial specification)
+  // * ""                                    (no specification)
+  //
+  // If the constraints do not resolve to a single device (or if this
+  // field is empty or not present), the runtime will attempt to
+  // choose a device automatically.
+  string device = 4;
+  // Operation-specific graph-construction-time configuration.
+  // Note that this should include all attrs defined in the
+  // corresponding OpDef, including those with a value matching
+  // the default -- this allows the default to change and makes
+  // NodeDefs easier to interpret on their own.  However, if
+  // an attr with a default is not specified in this list, the
+  // default will be used.
+  // The "names" (keys) must match the regexp "[a-z][a-z0-9_]+" (and
+  // one of the names from the corresponding OpDef's attr field).
+  // The values must have a type matching the corresponding OpDef
+  // attr's type field.
+  // TODO(josh11b): Add some examples here showing best practices.
+  map<string, AttrValue> attr = 5;
+};