Unverified Commit 9a05446f authored by PanZezhong1725, committed by GitHub

issue/461 InfiniCore inference runtime

Co-authored-by: Jiacheng Huang <huangjiacheng0709@outlook.com>
Co-authored-by: wooway777 <wooway777@gmail.com>
parent 37411f6d
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <infinicore.hpp>
namespace py = pybind11;
namespace infinicore {
PYBIND11_MODULE(infinicore, m) {
py::enum_<DataType>(m, "dtype")
.value("bfloat16", DataType::bfloat16)
.value("float16", DataType::float16)
.value("float32", DataType::float32)
.value("float64", DataType::float64)
.value("int32", DataType::int32)
.value("int64", DataType::int64)
.value("uint8", DataType::uint8)
.export_values();
py::class_<Device>(m, "Device")
.def(py::init<const Device::Type &, const Device::Index &>(),
py::arg("type"), py::arg("index") = 0)
.def_property_readonly("type", &Device::get_type)
.def_property_readonly("index", &Device::get_index)
.def("__repr__", static_cast<std::string (Device::*)() const>(&Device::to_string));
py::class_<Tensor>(m, "Tensor")
.def(py::init<const Tensor::Shape &, const DataType &, const Device &>(),
py::arg("shape"), py::arg("dtype") = DataType::float32, py::arg("device") = Device{Device::Type::cpu})
.def_property_readonly("shape", &Tensor::get_shape)
.def_property_readonly("dtype", &Tensor::get_dtype)
.def_property_readonly("device", &Tensor::get_device);
}
} // namespace infinicore
#include "infinicore/memory.hpp"
namespace infinicore {
Memory::Memory(std::byte *data,
size_t size,
Device device,
Memory::Deleter deleter,
bool pin_memory)
    : data_{data}, size_{size}, device_{device}, deleter_{deleter}, is_pinned_{pin_memory} {}
Memory::~Memory() {
if (deleter_) {
deleter_(data_);
}
}
std::byte *Memory::data() {
return data_;
}
Device Memory::device() const {
return device_;
}
size_t Memory::size() const {
return size_;
}
bool Memory::is_pinned() const {
return is_pinned_;
}
} // namespace infinicore
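// --- Illustrative example (not part of the commit) ---
// A minimal sketch of the Memory ownership contract above: the deleter runs
// exactly once, when the last reference to the Memory object is dropped.
// Assumes Memory::Deleter is callable as void(std::byte *); the buffer size
// and function name are hypothetical.
#include "infinicore/memory.hpp"
#include <cstdlib>
#include <memory>
inline void memory_usage_sketch() {
    auto *buf = static_cast<std::byte *>(std::malloc(1024));
    auto mem = std::make_shared<infinicore::Memory>(
        buf, 1024, infinicore::Device{infinicore::Device::Type::CPU, 0},
        [](std::byte *p) { std::free(p); }, /*pin_memory=*/false);
    // mem->data() == buf; std::free(buf) runs when the last reference is dropped.
}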
#include "infinicore/op/matmul.hpp"
namespace infinicore::op {
common::OpDispatcher<Matmul::schema> &Matmul::dispatcher() {
static common::OpDispatcher<Matmul::schema> dispatcher_;
return dispatcher_;
}
void Matmul::execute(Tensor c, Tensor a, Tensor b) {
dispatcher().lookup(context::getDevice().getType())(c, a, b);
}
Tensor matmul(Tensor a, Tensor b) {
    // Output shape: a's shape with the last dimension replaced by b's last.
    Shape shape = a->shape();
Size size = a->ndim();
shape[size - 1] = b->size(size - 1);
auto c = Tensor::empty(shape, a->dtype(), a->device());
matmul_(c, a, b);
return c;
}
void matmul_(Tensor c, Tensor a, Tensor b) {
Matmul::execute(c, a, b);
}
} // namespace infinicore::op
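// --- Illustrative example (not part of the commit) ---
// A minimal usage sketch of the two entry points above, assuming a configured
// device context; shapes, dtype, and device are illustrative.
#include "infinicore/op/matmul.hpp"
namespace infinicore::op::example {
inline void matmul_usage_sketch() {
    auto a = Tensor::empty({4, 8}, DataType::F32, Device{Device::Type::CPU, 0});
    auto b = Tensor::empty({8, 16}, DataType::F32, Device{Device::Type::CPU, 0});
    auto c = matmul(a, b); // allocates the {4, 16} result
    matmul_(c, a, b);      // writes into a caller-provided output
}
} // namespace infinicore::op::example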
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/op/common/cache.hpp"
#include "infinicore/op/matmul.hpp"
#include <infiniop.h>
namespace infinicore::op::matmul_impl::infiniop {
// Per-thread cache of gemm descriptors keyed by tensor configuration; the
// callback destroys a descriptor when it is evicted from the cache.
thread_local common::OpCache<size_t, infiniopGemmDescriptor_t> caches(
    100, // capacity
[](infiniopGemmDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyGemmDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor c, Tensor a, Tensor b) {
    // Look up (or create and cache) a gemm descriptor for this configuration.
    size_t seed = hash_combine(c, b, a);
auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();
auto &cache = caches.getCache(device_type, device_index);
auto desc_opt = cache.get(seed);
infiniopGemmDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor(context::getInfiniopHandle(), &desc, c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetGemmWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopGemm(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), 1.f, 0.f, context::getStream()));
}
static bool registered = []() {
Matmul::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::matmul_impl::infiniop
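// --- Illustrative example (not part of the commit) ---
// The `static bool registered` idiom above runs at program load time. A
// hypothetical alternative backend would self-register the same way; the
// `false` flag is assumed to mean "do not override an existing registration".
#include "infinicore/op/matmul.hpp"
namespace infinicore::op::matmul_impl::reference {
void calculate(Tensor c, Tensor a, Tensor b) {
    // A scalar reference kernel would go here; parameters unused in this sketch.
    (void)c; (void)a; (void)b;
}
static bool registered = []() {
    Matmul::dispatcher().registerAll(&calculate, false);
    return true;
}();
} // namespace infinicore::op::matmul_impl::reference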
#include "infinicore/op/ones.hpp"
namespace infinicore::op {
common::OpDispatcher<Ones::schema> &Ones::dispatcher() {
static common::OpDispatcher<Ones::schema> dispatcher_;
return dispatcher_;
}
void Ones::execute(Tensor output) {
    // TODO: dispatch to a registered kernel; ones is not implemented yet
    // (see the TensorImpl::ones TODO elsewhere in this commit).
}
} // namespace infinicore::op
#include "infinicore/op/rearrange.hpp"
namespace infinicore::op {
common::OpDispatcher<Rearrange::schema> &Rearrange::dispatcher() {
static common::OpDispatcher<Rearrange::schema> dispatcher_;
return dispatcher_;
}
void Rearrange::execute(Tensor y, Tensor x) {
dispatcher().lookup(context::getDevice().getType())(y, x);
}
Tensor rearrange(Tensor x) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
rearrange_(y, x);
return y;
}
void rearrange_(Tensor y, Tensor x) {
Rearrange::execute(y, x);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/op/common/cache.hpp"
#include "infinicore/op/rearrange.hpp"
#include <infiniop.h>
namespace infinicore::op::rearrange_impl::infiniop {
thread_local common::OpCache<size_t, infiniopRearrangeDescriptor_t> caches(
100, // capacity
[](infiniopRearrangeDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRearrangeDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor y, Tensor x) {
size_t seed = hash_combine(y, x);
auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();
auto &cache = caches.getCache(device_type, device_index);
auto desc_opt = cache.get(seed);
infiniopRearrangeDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRearrangeDescriptor(context::getInfiniopHandle(), &desc, y->desc(), x->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
INFINICORE_CHECK_ERROR(
infiniopRearrange(
desc,
y->data(),
x->data(),
context::getStream()));
}
static bool registered = []() {
Rearrange::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::rearrange_impl::infiniop
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore.hpp"
namespace py = pybind11;
namespace infinicore::context {
inline void bind(py::module &m) {
m.def("get_device", &getDevice);
m.def("get_device_count", &getDeviceCount);
}
} // namespace infinicore::context
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore.hpp"
namespace py = pybind11;
namespace infinicore::device {
inline void bind(py::module &m) {
py::class_<Device> device(m, "Device");
py::enum_<Device::Type>(device, "Type")
.value("CPU", Device::Type::CPU)
.value("NVIDIA", Device::Type::NVIDIA)
.value("CAMBRICON", Device::Type::CAMBRICON)
.value("ASCEND", Device::Type::ASCEND)
.value("METAX", Device::Type::METAX)
.value("MOORE", Device::Type::MOORE)
.value("ILUVATAR", Device::Type::ILUVATAR)
.value("KUNLUN", Device::Type::KUNLUN)
.value("SUGON", Device::Type::SUGON)
.value("COUNT", Device::Type::COUNT);
device
.def(py::init<const Device::Type &, const Device::Index &>(),
py::arg("type") = Device::Type::CPU, py::arg("index") = 0)
.def_property_readonly("type", &Device::getType)
.def_property_readonly("index", &Device::getIndex)
.def("__str__", static_cast<std::string (Device::*)() const>(&Device::toString));
}
} // namespace infinicore::device
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore.hpp"
namespace py = pybind11;
namespace infinicore::dtype {
inline void bind(py::module &m) {
py::enum_<DataType>(m, "DataType")
.value("BYTE", DataType::BYTE)
.value("BOOL", DataType::BOOL)
.value("I8", DataType::I8)
.value("I16", DataType::I16)
.value("I32", DataType::I32)
.value("I64", DataType::I64)
.value("U8", DataType::U8)
.value("U16", DataType::U16)
.value("U32", DataType::U32)
.value("U64", DataType::U64)
.value("F8", DataType::F8)
.value("F16", DataType::F16)
.value("F32", DataType::F32)
.value("F64", DataType::F64)
.value("C16", DataType::C16)
.value("C32", DataType::C32)
.value("C64", DataType::C64)
.value("C128", DataType::C128)
.value("BF16", DataType::BF16);
}
} // namespace infinicore::dtype
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "context.hpp"
#include "device.hpp"
#include "dtype.hpp"
#include "op.hpp"
#include "tensor.hpp"
namespace infinicore {
PYBIND11_MODULE(_infinicore, m) {
context::bind(m);
device::bind(m);
dtype::bind(m);
op::bind(m);
tensor::bind(m);
}
} // namespace infinicore
#pragma once
#include <pybind11/pybind11.h>
#include "op/matmul.hpp"
#include "op/rearrange.hpp"
namespace py = pybind11;
namespace infinicore::op {
inline void bind(py::module &m) {
bind_matmul(m);
bind_rearrange(m);
}
} // namespace infinicore::op
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/op/matmul.hpp"
namespace py = pybind11;
namespace infinicore::op {
inline void bind_matmul(py::module &m) {
m.def("matmul",
&op::matmul,
py::arg("a"),
py::arg("b"),
R"doc(Matrix multiplication of two tensors.)doc");
m.def("matmul_",
&op::matmul_,
py::arg("c"),
py::arg("a"),
py::arg("b"),
R"doc(In-place matrix multiplication.)doc");
}
} // namespace infinicore::op
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/op/rearrange.hpp"
namespace py = pybind11;
namespace infinicore::op {
inline void bind_rearrange(py::module &m) {
    m.def("rearrange",
          &op::rearrange,
          py::arg("x"),
          R"doc(Return a contiguous, rearranged copy of a tensor.)doc");
    m.def("rearrange_",
          &op::rearrange_,
          py::arg("y"),
          py::arg("x"),
          R"doc(Copy the elements of x into y, rearranging between the two layouts.)doc");
}
} // namespace infinicore::op
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "infinicore.hpp"
namespace py = pybind11;
namespace infinicore::tensor {
inline void bind(py::module &m) {
py::class_<Tensor>(m, "Tensor")
.def_property_readonly("shape", [](const Tensor &tensor) { return tensor->shape(); })
.def_property_readonly("strides", [](const Tensor &tensor) { return tensor->strides(); })
.def_property_readonly("ndim", [](const Tensor &tensor) { return tensor->ndim(); })
.def_property_readonly("dtype", [](const Tensor &tensor) { return tensor->dtype(); })
.def("data_ptr", [](const Tensor &tensor) { return tensor->data(); })
.def("size", [](const Tensor &tensor, std::size_t dim) { return tensor->size(dim); })
.def("stride", [](const Tensor &tensor, std::size_t dim) { return tensor->stride(dim); })
.def("numel", [](const Tensor &tensor) { return tensor->numel(); })
.def("is_contiguous", [](const Tensor &tensor) { return tensor->is_contiguous(); })
.def("is_pinned", [](const Tensor &tensor) { return tensor->is_pinned(); })
.def("info", [](const Tensor &tensor) { return tensor->info(); })
.def("copy_", [](Tensor &tensor, const Tensor &other) { tensor->copy_from(other); })
.def("to", [](const Tensor &tensor, const Device &device) { return tensor->to(device); })
.def("as_strided", [](const Tensor &tensor, const Shape &shape, const Strides &strides) { return tensor->as_strided(shape, strides); })
.def("contiguous", [](const Tensor &tensor) { return tensor->contiguous(); })
.def("permute", [](const Tensor &tensor, const Shape &dims) { return tensor->permute(dims); })
.def("view", [](const Tensor &tensor, const Shape &shape) { return tensor->view(shape); });
m.def("empty", &Tensor::empty,
py::arg("shape"),
py::arg("dtype"),
py::arg("device"),
py::arg("pin_memory") = false);
m.def("strided_empty", &Tensor::strided_empty,
py::arg("shape"),
py::arg("strides"),
py::arg("dtype"),
py::arg("device"),
py::arg("pin_memory") = false);
m.def("zeros", &Tensor::zeros,
py::arg("shape"),
py::arg("dtype"),
py::arg("device"),
py::arg("pin_memory") = false);
m.def("ones", &Tensor::ones,
py::arg("shape"),
py::arg("dtype"),
py::arg("device"),
py::arg("pin_memory") = false);
    m.def(
        "from_blob", [](uintptr_t raw_ptr, const Shape &shape, const DataType &dtype, const Device &device) {
            return Tensor{infinicore::Tensor::from_blob(reinterpret_cast<void *>(raw_ptr), shape, dtype, device)};
        },
        py::arg("raw_ptr"), py::arg("shape"), py::arg("dtype"), py::arg("device"));
    m.def(
        "strided_from_blob", [](uintptr_t raw_ptr, const Shape &shape, const Strides &strides, const DataType &dtype, const Device &device) {
            return Tensor{infinicore::Tensor::strided_from_blob(reinterpret_cast<void *>(raw_ptr), shape, strides, dtype, device)};
        },
        py::arg("raw_ptr"), py::arg("shape"), py::arg("strides"), py::arg("dtype"), py::arg("device"));
}
} // namespace infinicore::tensor
#include <infinicore.hpp>
namespace infinicore {
Tensor::Tensor(const Shape &shape, const DataType &dtype, const Device &device) : shape_{shape}, dtype_{dtype}, device_{device} {}
const Tensor::Shape &Tensor::get_shape() const {
return shape_;
}
const DataType &Tensor::get_dtype() const {
return dtype_;
}
const Device &Tensor::get_device() const {
return device_;
}
} // namespace infinicore
#include "infinicore/context/context.hpp"
#include "infinicore/dtype.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/tensor.hpp"
#include <spdlog/spdlog.h>
namespace infinicore {
Tensor TensorImpl::to(Device device) const {
if (device == data_.memory->device()) {
return Tensor(const_cast<TensorImpl *>(this)->shared_from_this());
} else {
std::shared_ptr<TensorImpl> _t = empty(meta_.shape, meta_.dtype, device, true);
_t->copy_from(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()));
return Tensor(_t);
}
}
void TensorImpl::copy_from(Tensor src) {
    if (src->shape() != this->shape()) {
        throw std::runtime_error("Cannot copy from tensor with different shape");
    }
    if (this->device().getType() == src->device().getType()) {
        // Same device type: a strided rearrange handles any layout mismatch.
        op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), src);
    } else {
        if (!src->is_contiguous()) {
            src = src->contiguous();
        }
        // Byte count of the contiguous payload being moved across devices.
        // Using numel * dsize (rather than the backing allocation size) keeps
        // the copy correct for views whose memory block is larger than the
        // tensor itself.
        const size_t nbytes = this->numel() * dsize(this->dtype());
        if (this->device().getType() == Device::Type::CPU) {
            if (this->is_contiguous()) {
                context::memcpyD2H(this->data(), src->data(), nbytes);
            } else {
                // Stage into a contiguous host tensor, then rearrange into place.
                auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
                context::memcpyD2H(local_src->data(), src->data(), nbytes);
                op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
            }
        } else if (src->device().getType() == Device::Type::CPU) {
            if (this->is_contiguous()) {
                context::memcpyH2D(this->data(), src->data(), nbytes);
            } else {
                // Stage into a contiguous device tensor, then rearrange on device.
                auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
                context::memcpyH2D(local_src->data(), src->data(), nbytes);
                op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
            }
        } else {
            throw std::runtime_error("Copy between two different non-CPU device types is not supported");
        }
    }
}
Tensor TensorImpl::contiguous() const {
if (is_contiguous()) {
return Tensor(const_cast<TensorImpl *>(this)->shared_from_this());
} else {
return op::rearrange(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()));
}
}
} // namespace infinicore
#include "infinicore/tensor.hpp"
#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include "infinicore/dtype.hpp"
#include <spdlog/spdlog.h>
namespace {
// Helper function to calculate contiguous strides
inline infinicore::Strides calculate_contiguous_strides(const infinicore::Shape &shape) {
infinicore::Strides strides(shape.size());
infinicore::Stride stride = 1;
for (int i = shape.size() - 1; i >= 0; --i) {
strides[i] = stride;
stride *= shape[i];
}
return strides;
}
} // namespace
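// --- Illustrative example (not part of the commit) ---
// Worked example of the helper above: for a row-major shape {2, 3, 4} the
// strides are {12, 4, 1}, i.e. strides[i] is the product of shape[i+1..n-1].
// Assumes <cassert> is available in this translation unit.
inline void contiguous_strides_sketch() {
    assert(calculate_contiguous_strides(infinicore::Shape{2, 3, 4})
           == (infinicore::Strides{12, 4, 1}));
}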
namespace infinicore {
TensorImpl *Tensor::operator->() { return impl_.get(); }
const TensorImpl *Tensor::operator->() const { return impl_.get(); }
Tensor Tensor::empty(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory) {
return Tensor{TensorImpl::empty(shape, dtype, device, pin_memory)};
}
Tensor Tensor::strided_empty(const Shape &shape,
const Strides &strides,
const DataType &dtype,
const Device &device,
bool pin_memory) {
return Tensor{TensorImpl::strided_empty(shape, strides, dtype, device, pin_memory)};
}
Tensor Tensor::zeros(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory) {
return Tensor{TensorImpl::zeros(shape, dtype, device, pin_memory)};
}
Tensor Tensor::ones(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory) {
return Tensor{TensorImpl::ones(shape, dtype, device, pin_memory)};
}
Tensor Tensor::from_blob(void *raw_ptr, const Shape &shape, const DataType &dtype, const Device &device) {
return Tensor{TensorImpl::from_blob(raw_ptr, shape, dtype, device)};
}
Tensor Tensor::strided_from_blob(void *raw_ptr, const Shape &shape, const Strides &strides, const DataType &dtype, const Device &device) {
return Tensor{TensorImpl::strided_from_blob(raw_ptr, shape, strides, dtype, device)};
}
TensorMetaData::TensorMetaData(const Shape &_shape, const Strides &_strides, const DataType &_dtype)
: shape(_shape), strides(_strides), dtype(_dtype) {
INFINICORE_CHECK_ERROR(infiniopCreateTensorDescriptor(&desc, shape.size(), shape.data(), strides.data(), (infiniDtype_t)dtype));
}
TensorImpl::TensorImpl(const Shape &shape, const DataType &dtype)
: meta_(TensorMetaData(shape, calculate_contiguous_strides(shape), dtype)) {}
TensorImpl::TensorImpl(const Shape &shape, const Strides &strides, const DataType &dtype)
: meta_(TensorMetaData(shape, strides, dtype)) {}
std::byte *TensorImpl::data() {
return data_.memory->data() + data_.offset;
}
const std::byte *TensorImpl::data() const {
return data_.memory->data() + data_.offset;
}
const Shape &TensorImpl::shape() const {
return meta_.shape;
}
const Strides &TensorImpl::strides() const {
return meta_.strides;
}
Size TensorImpl::ndim() const {
return meta_.shape.size();
}
bool TensorImpl::is_contiguous() const {
Stride expected_stride = 1;
for (int i = meta_.shape.size() - 1; i >= 0; --i) {
if (meta_.strides[i] != expected_stride) {
return false;
}
expected_stride *= meta_.shape[i];
}
return true;
}
Size TensorImpl::numel() const {
Size total = 1;
for (const auto &dim : meta_.shape) {
total *= dim;
}
return total;
}
Size TensorImpl::size(size_t dim) const {
return meta_.shape[dim];
}
Stride TensorImpl::stride(size_t dim) const {
return meta_.strides[dim];
}
DataType TensorImpl::dtype() const {
return meta_.dtype;
}
Device TensorImpl::device() const {
return data_.memory->device();
}
infiniopTensorDescriptor_t TensorImpl::desc() const {
return meta_.desc;
}
bool TensorImpl::is_pinned() const {
return data_.memory->is_pinned();
}
std::string TensorImpl::info() const {
std::stringstream ss;
ss << "Tensor: "
<< "shape[ ";
for (auto s : this->shape()) {
ss << s << " ";
}
ss << "] strides[ ";
for (auto s : this->strides()) {
ss << s << " ";
}
ss << "] dtype=" << toString(this->dtype());
return ss.str();
}
std::shared_ptr<TensorImpl> TensorImpl::empty(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory) {
auto t = std::shared_ptr<TensorImpl>(new TensorImpl(shape, dtype));
t->data_.offset = 0;
context::setDevice(device);
if (device == Device::Type::CPU) {
if (pin_memory) {
if (context::getDevice() == Device::Type::CPU) {
spdlog::warn("Tensor memory is not pinned by any device with CPU runtime.");
t->data_.memory = context::allocateHostMemory(t->numel() * dsize(dtype));
} else {
t->data_.memory = context::allocatePinnedHostMemory(t->numel() * dsize(dtype));
}
} else {
t->data_.memory = context::allocateHostMemory(t->numel() * dsize(dtype));
}
} else {
t->data_.memory = context::allocateMemory(t->numel() * dsize(dtype));
}
return t;
}
std::shared_ptr<TensorImpl> TensorImpl::strided_empty(
const Shape &shape,
const Strides &strides,
const DataType &dtype,
const Device &device,
bool pin_memory) {
auto impl = std::shared_ptr<TensorImpl>(new TensorImpl(shape, strides, dtype));
impl->data_.offset = 0;
context::setDevice(device);
// Size the allocation from the farthest element offset reachable under the
// given strides, plus one element.
size_t max_offset = 0;
for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] > 0) {
        max_offset += (shape[i] - 1) * strides[i];
    }
}
size_t required_elements = max_offset + 1;
size_t required_bytes = required_elements * dsize(dtype);
if (device == Device::Type::CPU) {
if (pin_memory) {
if (context::getDevice() == Device::Type::CPU) {
spdlog::warn("Tensor memory is not pinned by any device with CPU runtime.");
impl->data_.memory = context::allocateHostMemory(required_bytes);
} else {
impl->data_.memory = context::allocatePinnedHostMemory(required_bytes);
}
} else {
impl->data_.memory = context::allocateHostMemory(required_bytes);
}
} else {
impl->data_.memory = context::allocateMemory(required_bytes);
}
return impl;
}
std::shared_ptr<TensorImpl> TensorImpl::zeros(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory) {
// TODO: fill with zeros; for now this returns uninitialized memory.
return empty(shape, dtype, device, pin_memory);
}
std::shared_ptr<TensorImpl> TensorImpl::ones(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory) {
// TODO: fill with ones; for now this returns uninitialized memory.
return empty(shape, dtype, device, pin_memory);
}
std::shared_ptr<TensorImpl> TensorImpl::from_blob(
void *raw_ptr,
const Shape &shape,
const DataType &dtype,
const Device &device) {
auto t = std::shared_ptr<TensorImpl>(new TensorImpl(shape, dtype));
t->data_.offset = 0;
// Non-owning: the null deleter leaves ownership of raw_ptr with the caller.
t->data_.memory = std::make_shared<Memory>((std::byte *)raw_ptr, t->numel() * dsize(dtype), device, nullptr);
return t;
}
std::shared_ptr<TensorImpl> TensorImpl::strided_from_blob(
void *raw_ptr,
const Shape &shape,
const Strides &strides,
const DataType &dtype,
const Device &device) {
auto t = std::shared_ptr<TensorImpl>(new TensorImpl(shape, strides, dtype));
t->data_.offset = 0;
t->data_.memory = std::make_shared<Memory>((std::byte *)raw_ptr, t->numel() * dsize(dtype), device, nullptr);
return t;
}
} // namespace infinicore
#include "infinicore/context/context.hpp"
#include "infinicore/dtype.hpp"
#include "infinicore/tensor.hpp"
#include <spdlog/spdlog.h>
namespace infinicore {
Tensor TensorImpl::narrow(const std::vector<TensorSliceParams> &slices) const {
// Create new shape and calculate offset
Shape new_shape = meta_.shape;
size_t offset = data_.offset;
for (const auto &slice : slices) {
assert(slice.len > 0);
assert(meta_.shape[slice.dim] >= slice.start + slice.len);
new_shape[slice.dim] = slice.len;
offset += slice.start * meta_.strides[slice.dim] * dsize(meta_.dtype);
}
// Create new tensor with the same strides but narrowed shape
auto tensor_impl = std::make_shared<TensorImpl>(new_shape, meta_.strides, meta_.dtype);
tensor_impl->data_.offset = offset;
tensor_impl->data_.memory = data_.memory;
return Tensor(tensor_impl);
}
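// --- Illustrative example (not part of the commit) ---
// Sketch: narrowing dim 0 of a {4, 5} tensor to rows [1, 3) yields a {2, 5}
// view sharing the same storage, with the byte offset advanced by
// 1 * stride(0) * dsize(dtype). The TensorSliceParams field order
// (dim, start, len) is assumed from the usage above.
inline void narrow_sketch() {
    auto t = Tensor::empty({4, 5}, DataType::F32, Device{Device::Type::CPU, 0});
    auto v = t->narrow({TensorSliceParams{/*dim=*/0, /*start=*/1, /*len=*/2}});
    assert(v->shape() == (Shape{2, 5}));
}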
Tensor TensorImpl::permute(const Shape &order) const {
// Validate input
assert(meta_.shape.size() == order.size());
// Check that order contains all indices from 0 to n-1 exactly once
for (size_t i = 0; i < order.size(); i++) {
assert(std::find(order.begin(), order.end(), i) != order.end());
}
// Permute shape and strides
Shape new_shape(order.size());
Strides new_strides(order.size());
for (size_t i = 0; i < order.size(); i++) {
new_shape[i] = meta_.shape[order[i]];
new_strides[i] = meta_.strides[order[i]];
}
auto tensor_impl = std::make_shared<TensorImpl>(new_shape, new_strides, meta_.dtype);
tensor_impl->data_ = data_;
return Tensor(tensor_impl);
}
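// --- Illustrative example (not part of the commit) ---
// Sketch: permuting a contiguous {2, 3} tensor with order {1, 0} gives a
// {3, 2} view with strides {1, 3}. The result shares storage and is no longer
// contiguous; contiguous()/rearrange exist to repair exactly this.
inline void permute_sketch() {
    auto t = Tensor::empty({2, 3}, DataType::F32, Device{Device::Type::CPU, 0});
    auto p = t->permute({1, 0});
    assert(p->strides() == (Strides{1, 3}));
    assert(!p->is_contiguous());
}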
Tensor TensorImpl::view(const Shape &new_shape) const {
// Step 1: Validate total size
Size numel = 1;
for (Size dim : meta_.shape) {
numel *= dim;
}
Size new_numel = 1;
for (Size dim : new_shape) {
new_numel *= dim;
}
assert(numel == new_numel);
// Step 2: Get current shape and strides
const Shape &old_shape = meta_.shape;
const Strides &old_strides = meta_.strides;
// Step 3: Create merged shape and strides
Shape merged_shape;
Strides merged_strides;
if (!old_shape.empty()) {
merged_shape.push_back(old_shape[0]);
merged_strides.push_back(old_strides[0]);
for (size_t i = 1; i < old_shape.size(); ++i) {
if (old_strides[i] * static_cast<Stride>(old_shape[i]) == merged_strides.back()) {
merged_shape.back() *= old_shape[i];
merged_strides.back() = old_strides[i];
} else {
merged_shape.push_back(old_shape[i]);
merged_strides.push_back(old_strides[i]);
}
}
}
// Step 4: Compute new strides by splitting merged dimensions
Strides new_strides(new_shape.size());
size_t merged_idx = 0;
Stride current_stride = merged_strides[0];
Size remaining_size = merged_shape[0];
for (size_t i = 0; i < new_shape.size(); ++i) {
// Find which merged dimension contains this new dimension
while (new_shape[i] > remaining_size) {
assert(++merged_idx < merged_shape.size());
current_stride = merged_strides[merged_idx];
remaining_size = merged_shape[merged_idx];
}
assert(remaining_size % new_shape[i] == 0);
new_strides[i] = current_stride * (remaining_size / new_shape[i]);
remaining_size /= new_shape[i];
}
return this->as_strided(new_shape, new_strides);
}
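// --- Illustrative example (not part of the commit) ---
// Sketch of the merge/split logic above: viewing a contiguous {2, 3, 4} tensor
// as {6, 4} merges all three dims into {24} with stride 1, then splits that
// extent so the result carries strides {4, 1}.
inline void view_sketch() {
    auto t = Tensor::empty({2, 3, 4}, DataType::F32, Device{Device::Type::CPU, 0});
    auto v = t->view({6, 4});
    assert(v->strides() == (Strides{4, 1}));
}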
Tensor TensorImpl::as_strided(const Shape &new_shape, const Strides &new_strides) const {
auto tensor_impl = std::make_shared<TensorImpl>(new_shape, new_strides, meta_.dtype);
tensor_impl->data_ = data_;
return Tensor(tensor_impl);
}
} // namespace infinicore
#pragma once
#include "../utils/infini_status_string.h"
#include <spdlog/cfg/env.h>
#include <spdlog/spdlog.h>
#include <stdexcept>
inline struct SpdlogInitializer {
SpdlogInitializer() {
if (!std::getenv("INFINICORE_LOG_LEVEL")) {
spdlog::set_level(spdlog::level::off);
} else {
spdlog::cfg::load_env_levels("INFINICORE_LOG_LEVEL");
}
}
} spdlog_initializer;
#define STRINGIZE_(x) #x
#define STRINGIZE(x) STRINGIZE_(x)
#define INFINICORE_CHECK_ERROR(call) \
do { \
spdlog::info("Entering `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
infiniStatus_t ret = (call); \
spdlog::info("Exiting `" #call "` at `" __FILE__ ":" STRINGIZE(__LINE__) "`."); \
if (ret != INFINI_STATUS_SUCCESS) { \
throw std::runtime_error(#call " failed with error: " + std::string(infini_status_string(ret))); \
} \
} while (false)