// NOTE(review): the copy of this file that was reviewed had every `<...>` span
// stripped (include names, template arguments, cast targets). They are restored
// here from how each value is used and from the public infiniop signature
// (shape: size_t, strides: ptrdiff_t). The `MemoryPool` pool type and the
// `std::shared_ptr<Tensor const>` parameter of copyFrom are assumptions --
// confirm every signature against tensor.hpp before merging.
#include "../tensor.hpp"
#include "../utils.hpp"

#include <algorithm>
#include <cstddef>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <mutex>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>

namespace {

/// Row-major strides (in elements) for `shape`: the last dimension gets
/// stride 1 and every other dimension gets the product of the extents to its
/// right. A rank-0 shape yields an empty vector.
std::vector<ptrdiff_t> contiguousStrides(const std::vector<size_t> &shape) {
    std::vector<ptrdiff_t> strides(shape.size());
    if (shape.empty()) {
        return strides;
    }
    strides.back() = 1;
    for (size_t i = shape.size() - 1; i > 0; --i) {
        strides[i - 1] = strides[i] * static_cast<ptrdiff_t>(shape[i]);
    }
    return strides;
}

/// Product of all extents in `shape` (1 for a rank-0 shape). The accumulator
/// is size_t so large tensors are not truncated to int.
size_t elementCount(const std::vector<size_t> &shape) {
    return std::accumulate(shape.begin(), shape.end(), size_t{1},
                           std::multiplies<size_t>());
}

/// Number of bytes needed for a dense tensor of `dtype` with `shape`.
size_t byteSize(infiniDtype_t dtype, const std::vector<size_t> &shape) {
    return std::accumulate(shape.begin(), shape.end(),
                           static_cast<size_t>(dsize(dtype)),
                           std::multiplies<size_t>());
}

/// Shared implementation of the print_data family: walks `data` according to
/// `shape`/`strides` starting at dimension `dim` and writes one text row per
/// innermost dimension to std::cout. `convert` maps a stored element to the
/// value that is streamed.
template <typename T, typename Convert>
void printStrided(const T *data, const std::vector<size_t> &shape,
                  const std::vector<ptrdiff_t> &strides, size_t dim,
                  Convert convert) {
    if (shape.empty()) {
        // Rank-0 tensor: exactly one element and no stride to consult.
        std::cout << convert(data[0]) << " " << std::endl;
        return;
    }
    if (dim + 1 == shape.size()) {
        for (size_t i = 0; i < shape[dim]; i++) {
            std::cout << convert(data[static_cast<ptrdiff_t>(i) * strides[dim]]) << " ";
        }
        std::cout << std::endl;
    } else if (dim + 1 < shape.size()) {
        for (size_t i = 0; i < shape[dim]; i++) {
            printStrided(data + static_cast<ptrdiff_t>(i) * strides[dim],
                         shape, strides, dim + 1, convert);
        }
    }
}

} // namespace

/// Creates a descriptor with explicit strides.
std::shared_ptr<TensorDesc>
TensorDesc::create(infiniDtype_t dtype, const std::vector<size_t> &shape,
                   const std::vector<ptrdiff_t> &strides) {
    // Plain `new` is kept from the original; presumably the constructor is not
    // reachable from std::make_shared -- confirm in tensor.hpp.
    return std::shared_ptr<TensorDesc>(new TensorDesc(dtype, shape, strides));
}

/// Creates a descriptor with row-major strides derived from `shape`.
std::shared_ptr<TensorDesc>
TensorDesc::create(infiniDtype_t dtype, const std::vector<size_t> &shape) {
    return create(dtype, shape, contiguousStrides(shape));
}

/// Creates a descriptor whose strides follow a dimension ordering.
///
/// The dimension `d` with `order[d] == ndim - 1` receives stride 1; the
/// dimension with `order[d] == k` receives (stride * extent) of the dimension
/// with `order[d] == k + 1`. `order` must have the same length as `shape` and
/// contain every value in [0, ndim).
std::shared_ptr<TensorDesc>
TensorDesc::createWithOrder(infiniDtype_t dtype, const std::vector<size_t> &shape,
                            const std::vector<size_t> &order) {
    ASSERT_EQ(shape.size(), order.size());
    const size_t ndim = shape.size();
    if (ndim == 0) {
        return create(dtype, shape);
    }

    // Index of the dimension that carries the given rank in `order`.
    auto position_of = [&order](size_t rank) {
        auto it = std::find(order.begin(), order.end(), rank);
        // Without this check a non-permutation `order` would index past the end.
        ASSERT(it != order.end());
        return static_cast<size_t>(std::distance(order.begin(), it));
    };

    std::vector<ptrdiff_t> strides(ndim);
    size_t inner = position_of(ndim - 1);
    strides[inner] = 1;
    for (size_t rank = ndim - 1; rank > 0; --rank) {
        const size_t outer = position_of(rank - 1);
        strides[outer] = strides[inner] * static_cast<ptrdiff_t>(shape[inner]);
        inner = outer;
    }
    return create(dtype, shape, strides);
}

/// Returns the infiniop descriptor handle, creating it on first use.
infiniopTensorDescriptor_t TensorDesc::desc() const {
    if (_desc == nullptr) {
        // The cast is kept exactly as in the original: it lets this const
        // method fill in the cached handle.
        RUN_INFINI(infiniopCreateTensorDescriptor(
            (infiniopTensorDescriptor_t *)(&_desc), _shape.size(), _shape.data(),
            _strides.data(), _dtype));
    }
    return _desc;
}

/// Destroys the cached infiniop descriptor, if one was created.
void TensorDesc::resetDesc() {
    if (this->_desc != nullptr) {
        infiniopDestroyTensorDescriptor(this->_desc);
        this->_desc = nullptr;
    }
}

/// True when the stored strides equal the row-major strides of the shape.
/// A rank-0 descriptor compares two empty stride vectors and is contiguous.
bool TensorDesc::isContigous() const {
    const auto expected = contiguousStrides(this->shape());
    ASSERT_EQ(expected.size(), this->_strides.size());
    return std::equal(expected.begin(), expected.end(), this->_strides.begin());
}

/// Human-readable summary: shape, strides and the numeric dtype value.
std::string TensorDesc::info() const {
    std::stringstream ss;
    ss << "Tensor: "
       << "shape[ ";
    for (auto s : this->shape()) {
        ss << s << " ";
    }
    ss << "] strides[ ";
    for (auto s : this->strides()) {
        ss << s << " ";
    }
    ss << "] dtype=" << this->dtype();
    return ss.str();
}

TensorDesc::~TensorDesc() {
    this->resetDesc();
}

const std::vector<size_t> &Tensor::shape() const { return this->_desc->shape(); }
const std::vector<ptrdiff_t> &Tensor::strides() const { return this->_desc->strides(); }
size_t Tensor::ndim() const { return this->_desc->ndim(); }
infiniDtype_t Tensor::dtype() const { return this->_desc->dtype(); }
infiniDevice_t Tensor::deviceType() const { return this->_storage->deviceType(); }
int Tensor::deviceId() const { return this->_storage->deviceId(); }
Tensor::~Tensor() {}

/// Offset that dataImpl adds, in bytes, to the storage base pointer.
ptrdiff_t Tensor::dataOffset() const {
    return _offset;
}

infiniopTensorDescriptor_t Tensor::desc() const { return _desc->desc(); }

/// Allocates a row-major tensor whose storage comes from
/// Storage::createFromPool(size, pool).
std::shared_ptr<Tensor> Tensor::buffer(infiniDtype_t dtype,
                                       const std::vector<size_t> &shape,
                                       std::shared_ptr<MemoryPool> pool) {
    auto tensor = std::make_shared<Tensor>();
    tensor->_storage = Storage::createFromPool(byteSize(dtype, shape), pool);
    tensor->_desc = TensorDesc::create(dtype, shape, contiguousStrides(shape));
    tensor->_offset = 0;
    return tensor;
}

/// Allocates a row-major tensor and fills it from host memory `data` with a
/// host-to-device copy of dsize(dtype) * product(shape) bytes.
std::shared_ptr<Tensor> Tensor::weight(void *data, infiniDtype_t dtype,
                                       const std::vector<size_t> &shape) {
    auto tensor = std::make_shared<Tensor>();
    const size_t size = byteSize(dtype, shape);
    tensor->_storage = Storage::create(size);
    tensor->_desc = TensorDesc::create(dtype, shape, contiguousStrides(shape));

    // NOTE: workaround for some platforms (MetaX) where concurrent memcpy
    // calls on the same host data from multiple threads hang; the copy is
    // serialized through one process-wide mutex.
    static std::mutex mutex;
    {
        std::lock_guard<std::mutex> lock(mutex);
        RUN_INFINI(infinirtMemcpy(tensor->_storage->memory(),
                                  data, size, INFINIRT_MEMCPY_H2D));
    }

    tensor->_offset = 0;
    return tensor;
}

/// Returns a row-major tensor of `shape` that shares this tensor's storage,
/// starting at byte offset 0 of that storage (this tensor's own offset is not
/// carried over). `dtype_` of INFINI_DTYPE_INVALID means "use this tensor's
/// dtype". Asserts that the requested bytes fit in the storage.
std::shared_ptr<Tensor> Tensor::memShare(const std::vector<size_t> &shape,
                                         infiniDtype_t dtype_) const {
    const auto dtype = dtype_ == INFINI_DTYPE_INVALID ? this->dtype() : dtype_;
    const size_t size = byteSize(dtype, shape);
    ASSERT(size <= this->_storage->size());

    auto tensor = std::make_shared<Tensor>();
    tensor->_storage = this->_storage;
    tensor->_offset = 0;
    tensor->_desc = TensorDesc::create(dtype, shape, contiguousStrides(shape));
    return tensor;
}

/// Storage base + byte offset of this tensor + `offset` elements of its dtype.
void *Tensor::dataImpl(ptrdiff_t offset) const {
    return (char *)(this->_storage->memory()) + this->_offset
         + offset * dsize(this->dtype());
}

void *Tensor::data(ptrdiff_t offset) {
    return this->dataImpl(offset);
}

const void *Tensor::data(ptrdiff_t offset) const {
    return this->dataImpl(offset);
}

/// Copies `src` into this tensor through an infiniop rearrange on `stream`.
/// Shapes and dtypes must match; strides may differ.
void Tensor::copyFrom(std::shared_ptr<Tensor const> src,
                      infiniopHandle_t handle, infinirtStream_t stream) {
    ASSERT_EQ(this->shape(), src->shape());
    ASSERT_EQ(this->dtype(), src->dtype());
    infiniopRearrangeDescriptor_t desc;
    RUN_INFINI(infiniopCreateRearrangeDescriptor(
        handle, &desc, this->desc(), src->desc()));
    RUN_INFINI(infiniopRearrange(desc, this->data(), src->data(), stream));
    RUN_INFINI(infiniopDestroyRearrangeDescriptor(desc));
}

bool Tensor::isContigous() const {
    return this->_desc->isContigous();
}

/// Prints the elements reachable from `data` via `shape`/`strides`, one text
/// row per innermost dimension, streaming each element as-is.
/// The innermost stride is honoured, matching the f16 specialization.
template <typename T>
void print_data(T *data, const std::vector<size_t> &shape,
                const std::vector<ptrdiff_t> &strides, size_t dim) {
    printStrided(data, shape, strides, dim,
                 [](const auto &value) { return value; });
}

/// Specialization for 16-bit storage interpreted as f16: each element is
/// converted with f16_to_f32 before printing.
template <>
void print_data(uint16_t const *data, const std::vector<size_t> &shape,
                const std::vector<ptrdiff_t> &strides, size_t dim) {
    printStrided(data, shape, strides, dim,
                 [](uint16_t bits) { return f16_to_f32(bits); });
}

/// Same traversal for 16-bit storage interpreted as bf16. Every level of the
/// recursion converts with bf16_to_f32 (the earlier version recursed into the
/// f16 printer for tensors of rank > 1).
void print_data_bf16(uint16_t const *data, const std::vector<size_t> &shape,
                     const std::vector<ptrdiff_t> &strides, size_t dim) {
    printStrided(data, shape, strides, dim,
                 [](uint16_t bits) { return bf16_to_f32(bits); });
}

/// Descriptor summary followed by the device type and device id.
std::string Tensor::info() const {
    std::stringstream ss;
    // TensorDesc::info() already starts with "Tensor: ", so no second prefix.
    ss << this->_desc->info()
       << " device=" << this->deviceType()
       << " device_id=" << this->deviceId();
    return ss.str();
}

/// Returns a tensor sharing this storage and offset but described by
/// `new_shape`. The element counts must match.
///
/// New strides are built row-major on top of the current innermost stride and
/// then sanity-checked by comparing the offset of the last element under the
/// old and new descriptors.
/// NOTE(review): that check compares a single offset only; it looks like it
/// can accept some non-contiguous layouts that cannot really be reshaped --
/// confirm against callers.
std::shared_ptr<Tensor> Tensor::view(const std::vector<size_t> &new_shape) const {
    const auto &old_shape = _desc->shape();
    const auto &old_strides = _desc->strides();

    const size_t current_elements = elementCount(old_shape);
    const size_t new_elements = elementCount(new_shape);
    ASSERT_EQ(current_elements, new_elements);

    // Wraps a descriptor in a tensor aliasing this tensor's storage/offset.
    auto alias_with = [this](auto desc) {
        auto result = std::make_shared<Tensor>();
        result->_storage = this->_storage;
        result->_desc = desc;
        result->_offset = this->_offset;
        return result;
    };

    // Special case: empty tensor. Any strides are valid, but there must be one
    // per dimension, so use the row-major ones.
    if (current_elements == 0) {
        return alias_with(TensorDesc::create(this->dtype(), new_shape));
    }

    // Special case: scalar to scalar shares the existing descriptor.
    if (old_shape.empty() && new_shape.empty()) {
        return alias_with(this->_desc);
    }

    std::vector<ptrdiff_t> new_strides;
    if (!new_shape.empty()) {
        new_strides.resize(new_shape.size());
        // A rank-0 source has no stride to inherit; fall back to 1.
        new_strides.back() = old_strides.empty() ? 1 : old_strides.back();
        for (size_t i = new_shape.size() - 1; i > 0; --i) {
            new_strides[i - 1] = new_strides[i] * static_cast<ptrdiff_t>(new_shape[i]);
        }

        // Verify the new strides reach the same last element as the old ones.
        size_t offset = 0;
        for (size_t i = 0; i < old_shape.size(); ++i) {
            offset += (old_shape[i] - 1) * old_strides[i];
        }
        size_t new_offset = 0;
        for (size_t i = 0; i < new_shape.size(); ++i) {
            new_offset += (new_shape[i] - 1) * new_strides[i];
        }
        ASSERT_EQ(offset, new_offset);
    }

    return alias_with(TensorDesc::create(this->dtype(), new_shape, new_strides));
}

/// Returns a tensor sharing this storage and offset, described by the given
/// shape and strides. No validation is performed here.
std::shared_ptr<Tensor> Tensor::view_as(const std::vector<size_t> &new_shape,
                                        const std::vector<ptrdiff_t> &new_strides) const {
    auto tensor = std::make_shared<Tensor>();
    tensor->_storage = this->_storage;
    tensor->_desc = TensorDesc::create(this->dtype(), new_shape, new_strides);
    tensor->_offset = this->_offset;
    return tensor;
}

/// Synchronizes the device, prints info(), then either dumps the whole
/// storage to `filename` (binary) or, when `filename` is empty, prints the
/// tensor's elements to std::cout. Calls PANIC for dtypes without a printer.
void Tensor::debug(const std::string &filename) const {
    RUN_INFINI(infinirtDeviceSynchronize());

    std::cout << info() << std::endl;

    // Host staging buffer for non-CPU tensors; owned by this scope so it is
    // released on every return path.
    std::vector<char> host_copy;
    void const *cpu_data = nullptr;
    if (this->deviceType() != INFINI_DEVICE_CPU) {
        host_copy.resize(this->_storage->size());
        RUN_INFINI(infinirtMemcpy(host_copy.data(), this->_storage->memory(),
                                  this->_storage->size(), INFINIRT_MEMCPY_D2H));
        cpu_data = host_copy.data();
    } else {
        cpu_data = this->_storage->memory();
    }

    if (!filename.empty()) {
        std::ofstream outFile(filename, std::ios::binary);
        if (!outFile) {
            std::cerr << "Error opening file for writing: " << filename << "\n";
            return;
        }
        outFile.write(static_cast<const char *>(cpu_data),
                      static_cast<std::streamsize>(this->_storage->size()));
        outFile.close();
        std::cout << "Data written to file: " << filename << "\n";
        return;
    }

    // First byte of this tensor inside the (possibly staged) storage.
    const char *base = static_cast<const char *>(cpu_data) + dataOffset();

    switch (this->dtype()) {
    case INFINI_DTYPE_F16:
        print_data(reinterpret_cast<uint16_t const *>(base),
                   this->shape(), this->strides(), 0);
        break;
    case INFINI_DTYPE_F32:
        print_data(reinterpret_cast<float const *>(base),
                   this->shape(), this->strides(), 0);
        break;
    case INFINI_DTYPE_U64:
        print_data(reinterpret_cast<uint64_t const *>(base),
                   this->shape(), this->strides(), 0);
        break;
    case INFINI_DTYPE_I64:
        print_data(reinterpret_cast<int64_t const *>(base),
                   this->shape(), this->strides(), 0);
        break;
    case INFINI_DTYPE_U32:
        print_data(reinterpret_cast<uint32_t const *>(base),
                   this->shape(), this->strides(), 0);
        break;
    case INFINI_DTYPE_I32:
        print_data(reinterpret_cast<int32_t const *>(base),
                   this->shape(), this->strides(), 0);
        break;
    case INFINI_DTYPE_BF16:
        print_data_bf16(reinterpret_cast<uint16_t const *>(base),
                        this->shape(), this->strides(), 0);
        break;
    default:
        PANIC("Unsupported data type");
    }
}

/// Prints the tensor to std::cout (no file dump).
void Tensor::debug() const { this->debug(""); }