Commit 01ed382c authored by yan.yan
Browse files

working on tensor core test

parent 3517290c
// Copyright 2020 xmyqsh
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <spconv/points2voxels.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// Declaration only; the definition lives outside this header.
// All integer arguments and the return value are int64_t.
// NOTE(review): judging by the names, this groups `points` into voxels on a
// grid of shape `gridShape` and fills `voxels` / `coors`; the meaning of the
// return value and the expected tensor shapes/dtypes are not visible here --
// confirm against the implementation.
int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
torch::Tensor pointIndex, torch::Tensor grids,
torch::Tensor numPointsPerGrid, torch::Tensor voxels,
torch::Tensor coors, std::vector<int64_t> gridShape,
const int64_t ndim);
} // namespace spconv
#pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
// CUDA entry points used for point-to-voxel conversion. Declarations only;
// the definitions are not part of this header.
// NOTE(review): tensor shapes, dtypes and which arguments are outputs are not
// visible from here -- confirm against the .cu implementation.

// Presumably scatters each point into its grid cell (scatter step).
void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
std::vector<int64_t> gridShape, const int ndim);
// Presumably gathers the per-cell results back into `voxels` / `coors`
// (gather step).
void gather_point_from_grid_cuda(torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors,
std::vector<int64_t> gridShape,
const int ndim);
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_
#include <spconv/maxpool.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// Sparse max-pooling operators. Declarations only; the definitions are not
// part of this header.
// NOTE(review): the layout of `indicePairs` / `indiceNum` and the shape of
// the returned tensors are not visible here -- confirm against the
// implementation.

// Forward pass; returns a tensor (presumably the pooled features for
// `numAct` output locations).
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct);
// Backward pass; returns a tensor (presumably the gradient with respect to
// `features`).
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <THC/THCNumerics.cuh>
#include <cuda_fp16.h>
#include <tensorview/kernel_utils.h>
// Select the atomic-add primitive by PyTorch version: plain `atomicAdd` when
// PYTORCH_VERSION is below 10500, `gpuAtomicAdd` otherwise. The macro is
// #undef'd again at the end of this header.
// NOTE(review): 10500 presumably encodes PyTorch 1.5.0 -- confirm against the
// build system that defines PYTORCH_VERSION.
#if PYTORCH_VERSION < 10500
#define TH_ATOMIC_ADD atomicAdd
#else
#define TH_ATOMIC_ADD gpuAtomicAdd
#endif
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {
// Gather rows of `features` into `buffer`:
//   buffer[row * numPlanes + iy] = features[indices[row] * numPlanes + iy]
// for every row < size and every iy produced by the Y loop.
//
// Each X-loop iteration handles NumILP rows spaced gridDim.x * blockDim.x
// apart; every row is bounds-checked against `size`. NumTLP is not used in
// the body.
// NOTE(review): the exact iteration ranges come from tv::KernelLoopX /
// tv::KernelLoopY, which are defined elsewhere -- confirm their stride
// semantics there.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int rowOffset[NumILP]; // row offset of each ILP slot
  Index srcBase[NumILP]; // first element of the source row, per ILP slot
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
    // Resolve the source rows once per X iteration.
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowOffset[k];
      if (row < size) {
        srcBase[k] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowOffset[k];
        if (row < size) {
          buffer[row * numPlanes + iy] = features[srcBase[k] + iy];
        }
      }
    }
  }
}
// Vectorized variant of the row gather: `buffer` and `features` are accessed
// as arrays of VecType, and `numPlanes` is used as the row length in VecType
// units:
//   bufferVec[row * numPlanes + iy] = featuresVec[indices[row] * numPlanes + iy]
//
// Every row is bounds-checked against `size`. NumTLP is not used in the body.
// NOTE(review): both pointers are reinterpreted as VecType*, so they are
// presumably required to be aligned for VecType -- confirm at the launch site.
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void gatherVecKernel(T *buffer, const T *features,
                                const Index *indices, int size, int numPlanes) {
  int rowOffset[NumILP]; // row offset of each ILP slot
  Index srcBase[NumILP]; // first VecType element of the source row
  VecType *dstVec = reinterpret_cast<VecType *>(buffer);
  const VecType *srcVec = reinterpret_cast<const VecType *>(features);
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowOffset[k];
      if (row < size) {
        srcBase[k] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowOffset[k];
        if (row < size) {
          dstVec[row * numPlanes + iy] = srcVec[srcBase[k] + iy];
        }
      }
    }
  }
}
// Block-tiled vectorized gather. `features` and `buffer` are first advanced
// by blockIdx.y * NumTLP elements of T, then accessed as VecType arrays with
// `numPlanes` as the row length in VecType units and threadIdx.y as the
// column:
//   bufferVec[row * numPlanes + threadIdx.y] =
//       featuresVec[indices[row] * numPlanes + threadIdx.y]
//
// Unlike the generic kernels, rows are NOT bounds-checked against `size`
// here.
// NOTE(review): callers presumably pass a `size` that the X loop covers
// exactly with all NumILP slots in range -- confirm at the launch site.
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int rowOffset[NumILP]; // row offset of each ILP slot
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  // Select this block's tile along the feature dimension.
  features += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  VecType *dstVec = reinterpret_cast<VecType *>(buffer);
  const VecType *srcVec = reinterpret_cast<const VecType *>(features);
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowOffset[k];
      dstVec[row * numPlanes + threadIdx.y] =
          srcVec[indices[row] * numPlanes + threadIdx.y];
    }
  }
}
// Batched row gather with a "missing" sentinel. For each destination row
// `e` < size the source row is looked up as
//   indices[(e / feature_batch_stride) * indice_batch_stride
//           + e % feature_batch_stride]
// and either copied into `buffer` or, when the looked-up value is -1, the
// destination row is filled with T(0).
//
// Every row is bounds-checked against `size`. NumTLP is not used in the body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchGatherGenericKernel(T *buffer, const T *features,
                                         const Index *indices, int size,
                                         int numPlanes, int indice_batch_stride,
                                         int feature_batch_stride) {
  // size: max indice num * kernel volume
  // inds: [volume, num_elems]
  int rowOffset[NumILP]; // row offset of each ILP slot
  Index srcRow[NumILP];  // looked-up source row, or -1
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      if (ix + rowOffset[k] < size) {
        const Index elem = ix + rowOffset[k];
        srcRow[k] =
            indices[(elem / feature_batch_stride) * indice_batch_stride +
                    elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowOffset[k];
        if (row < size) {
          T *dst = buffer + row * numPlanes + iy;
          if (srcRow[k] != -1) {
            *dst = features[srcRow[k] * numPlanes + iy];
          } else {
            // No source row for this slot: emit zeros.
            *dst = T(0);
          }
        }
      }
    }
  }
}
// Vectorized batched row gather with a "missing" sentinel. For each
// destination row `r` < size the lookup position is e = r + feature_offset
// and the source row is
//   indices[(e / feature_batch_stride) * indice_batch_stride
//           + e % feature_batch_stride].
// `buffer` and `features` are accessed as VecType arrays with `numPlanes` as
// the row length in VecType units. A source row of -1 writes an all-zero
// VecType instead.
//
// Every row is bounds-checked against `size`. NumTLP is not used in the body.
// NOTE(review): both pointers are reinterpreted as VecType*, so they are
// presumably required to be aligned for VecType -- confirm at the launch site.
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void
batchGatherVecKernel(T *buffer, const T *features, const Index *indices,
                     int size, int feature_offset, int numPlanes,
                     int indice_batch_stride, int feature_batch_stride) {
  constexpr int kElemsPerVec = sizeof(VecType) / sizeof(T);
  int ILPStrideX[NumILP];
  Index inds[NumILP];
  // The zero vector must be an array of T (not Index): with an Index array
  // the VecType read below runs past the end whenever
  // sizeof(Index) < sizeof(T) (e.g. double features with int indices).
  // alignas keeps the VecType-sized read naturally aligned.
  alignas(VecType) T zero[kElemsPerVec];
#pragma unroll
  for (int i = 0; i < kElemsPerVec; ++i) {
    zero[i] = T(0);
  }
  Index inds_elem;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size) {
        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
        inds[ilp] =
            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
                    inds_elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          if (inds[ilp] != -1) {
            reinterpret_cast<VecType *>(
                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
                reinterpret_cast<const VecType *>(
                    features)[inds[ilp] * numPlanes + iy];
          } else {
            // No source row for this slot: emit one zero vector.
            reinterpret_cast<VecType *>(
                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
                *reinterpret_cast<const VecType *>(zero);
          }
        }
      }
    }
  }
}
// Block-tiled, vectorized batched gather with a "missing" sentinel.
// `features` and `buffer` are first advanced by blockIdx.y * NumTLP elements
// of T, then accessed as VecType arrays with `numPlanes` as the row length in
// VecType units and threadIdx.y as the column. For destination row `e` the
// source row is
//   indices[(e / feature_batch_stride) * indice_batch_stride
//           + e % feature_batch_stride];
// a value of -1 writes an all-zero VecType instead.
//
// Rows are NOT bounds-checked against `size` here.
// NOTE(review): callers presumably pass a `size` that the X loop covers
// exactly with all NumILP slots in range -- confirm at the launch site.
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void
batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices,
                          int size, int numPlanes, int indice_batch_stride,
                          int feature_batch_stride) {
  constexpr int kElemsPerVec = sizeof(VecType) / sizeof(T);
  int ILPStrideX[NumILP];
  Index inds;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  // Select this block's tile along the feature dimension.
  features += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  Index inds_elem;
  // The zero vector must be an array of T (not Index): with an Index array
  // the VecType read below runs past the end whenever
  // sizeof(Index) < sizeof(T) (e.g. double features with int indices).
  // alignas keeps the VecType-sized read naturally aligned.
  alignas(VecType) T zero[kElemsPerVec];
#pragma unroll
  for (int i = 0; i < kElemsPerVec; ++i) {
    zero[i] = T(0);
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      inds_elem = ix + ILPStrideX[ilp];
      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
                     inds_elem % feature_batch_stride];
      if (inds != -1) {
        reinterpret_cast<VecType *>(
            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
            reinterpret_cast<const VecType *>(
                features)[inds * numPlanes + threadIdx.y];
      } else {
        // No source row for this slot: emit one zero vector.
        reinterpret_cast<VecType *>(
            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
            *reinterpret_cast<const VecType *>(zero);
      }
    }
  }
}
// Scatter-add rows of `buffer` into `outFeatures`:
//   outFeatures[indices[row] * numPlanes + iy] += buffer[row * numPlanes + iy]
// for every row < size and every iy produced by the Y loop.
//
// The accumulation is a plain (non-atomic) read-modify-write.
// NOTE(review): this is only race-free if no two rows handled by one launch
// map to the same destination row -- confirm that `indices` guarantees this.
// NumTLP is not used in the body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int rowOffset[NumILP]; // row offset of each ILP slot
  Index dstBase[NumILP]; // first element of the destination row
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowOffset[k];
      if (row < size) {
        dstBase[k] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowOffset[k];
        if (row < size) {
          outFeatures[dstBase[k] + iy] += buffer[row * numPlanes + iy];
        }
      }
    }
  }
}
// Block-tiled, vectorized scatter-add. `outFeatures` and `buffer` are first
// advanced by blockIdx.y * NumTLP elements of T. For each row, one VecType is
// loaded from the destination and one from `buffer` (row length `numPlanes`
// in VecType units, column threadIdx.y), added element-wise, and the sum is
// stored back to the destination.
//
// When T is at::Half and the device architecture is >= 530 the addition uses
// __hadd2 on __half2 pairs; otherwise it is a per-element `+=`.
//
// Rows are NOT bounds-checked against `size`, and the update is a plain
// (non-atomic) read-modify-write.
// NOTE(review): callers presumably guarantee an exactly covered `size` and
// distinct destination rows within one launch -- confirm at the launch site.
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideX[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
  constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  // Select this block's tile along the feature dimension.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  // Staging arrays are read and written through VecType*, so they must carry
  // VecType alignment; a plain T[] only guarantees alignof(T).
  alignas(VecType) T buf[vecloadFactor];
  alignas(VecType) T buf2[vecloadFactor];
  Index idx;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
      // Load the current destination vector and the contribution vector.
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
      if (std::is_same<T, at::Half>::value) {
#if __CUDA_ARCH__ >= 530
        // Packed half2 addition is available from SM 5.3 on.
#pragma unroll
        for (int i = 0; i < vecloadHalf2Factor; i++) {
          reinterpret_cast<__half2 *>(buf)[i] =
              __hadd2(reinterpret_cast<__half2 *>(buf)[i],
                      reinterpret_cast<__half2 *>(buf2)[i]);
        }
#else
#pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          buf[i] += buf2[i];
        }
#endif
      } else {
#pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          buf[i] += buf2[i];
        }
      }
      // Write the summed vector back to the destination.
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}
// Block-tiled scalar scatter-add. `outFeatures` and `buffer` are first
// advanced by blockIdx.y * NumTLP elements, then for each row:
//   outFeatures[indices[row] * numPlanes + threadIdx.y] +=
//       buffer[row * numPlanes + threadIdx.y]
//
// Rows are NOT bounds-checked against `size`, and the update is a plain
// (non-atomic) read-modify-write.
// NOTE(review): callers presumably guarantee an exactly covered `size` and
// distinct destination rows within one launch -- confirm at the launch site.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
                                      const Index *indices, int size,
                                      int numPlanes) {
  int rowOffset[NumILP]; // row offset of each ILP slot
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  // Select this block's tile along the feature dimension.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowOffset[k];
      outFeatures[indices[row] * numPlanes + threadIdx.y] +=
          buffer[row * numPlanes + threadIdx.y];
    }
  }
}
// NOTE(review): this kernel is only declared when __CUDA_ARCH__ >= 530, so it
// is invisible to the host compilation pass (where __CUDA_ARCH__ is
// undefined) and presumably cannot be launched from host code as written --
// confirm whether it is used at all.
#if __CUDA_ARCH__ >= 530
// Block-tiled scatter-add operating on __half2 pairs. `outFeatures` and
// `buffer` are first advanced by blockIdx.y * NumTLP elements of T, then
// accessed as __half2 arrays with `numPlanes` as the row length in __half2
// units and threadIdx.y as the column; sums are formed with __hadd2.
//
// Rows are NOT bounds-checked against `size`, and the update is a plain
// (non-atomic) read-modify-write.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
                                          const Index *indices, int size,
                                          int numPlanes) {
  int ILPStrideX[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  // Select this block's tile along the feature dimension.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  __half2 *outHalf2 = reinterpret_cast<__half2 *>(outFeatures);
  // `buffer` is const T*: the cast target must stay const, because
  // reinterpret_cast is not allowed to cast away constness.
  const __half2 *bufHalf2 = reinterpret_cast<const __half2 *>(buffer);
  Index idx;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
      outHalf2[idx] = __hadd2(
          outHalf2[idx],
          bufHalf2[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
    }
  }
}
#endif
// Batched scatter-add using atomics. For each source row `r` < size the
// lookup position is e = r + feature_offset and the destination row is
//   indices[(e / feature_batch_stride) * indice_batch_stride
//           + e % feature_batch_stride].
// Rows whose destination is -1 are skipped; all others are accumulated into
// `outFeatures` with TH_ATOMIC_ADD. NumTLP is not used in the body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
                                             const Index *indices, int size,
                                             int feature_offset, int numPlanes,
                                             int indice_batch_stride,
                                             int feature_batch_stride) {
  // batch scatter add is greatly slower than native scatter when the number of
  // points is large. this may due to atomicAdd?
  // batch scatter add is greatly faster than native when the number of points
  // is small.
  int rowOffset[NumILP]; // row offset of each ILP slot
  Index dstRow[NumILP];  // looked-up destination row, or -1
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      if (ix + rowOffset[k] < size) {
        const Index elem = ix + rowOffset[k] + feature_offset;
        dstRow[k] =
            indices[(elem / feature_batch_stride) * indice_batch_stride +
                    elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowOffset[k];
        // dstRow[k] is only valid (and only read) for in-range rows.
        if (row < size && dstRow[k] != -1) {
          TH_ATOMIC_ADD(outFeatures + dstRow[k] * numPlanes + iy,
                        buffer[row * numPlanes + iy]);
        }
      }
    }
  }
}
// Block-tiled batched scatter-add using atomics. `outFeatures` and `buffer`
// are first advanced by blockIdx.y * NumTLP elements. For source row `e` the
// destination row is
//   indices[(e / feature_batch_stride) * indice_batch_stride
//           + e % feature_batch_stride];
// a value of -1 skips the row, otherwise column threadIdx.y is accumulated
// with TH_ATOMIC_ADD.
//
// Rows are NOT bounds-checked against `size` here.
// NOTE(review): callers presumably pass a `size` that the X loop covers
// exactly with all NumILP slots in range -- confirm at the launch site.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
                           const Index *indices, int size, int numPlanes,
                           int indice_batch_stride, int feature_batch_stride) {
  int rowOffset[NumILP]; // row offset of each ILP slot
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowOffset[k] = k * gridDim.x * blockDim.x;
  }
  // Select this block's tile along the feature dimension.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const Index elem = ix + rowOffset[k];
      const Index dstRow =
          indices[(elem / feature_batch_stride) * indice_batch_stride +
                  elem % feature_batch_stride];
      if (dstRow != -1) {
        TH_ATOMIC_ADD(outFeatures + dstRow * numPlanes + threadIdx.y,
                      buffer[(ix + rowOffset[k]) * numPlanes + threadIdx.y]);
      }
    }
  }
}
} // namespace spconv
#undef TH_ATOMIC_ADD
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <cuda_runtime_api.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
// Host-side entry points for the sparse gather / scatter-add ("reordering")
// operations. Declarations only; the definitions are not part of this header.
// NOTE(review): which tensor is the destination, the expected shapes/dtypes,
// and the exact meaning of `size` are not visible here -- confirm against the
// implementations.

// Batched CUDA variants.
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
torch::Tensor outFeatures,
torch::Tensor indices, int size);
// CUDA variants without an explicit stream argument.
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size);
// CPU variants.
void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size);
// CUDA overloads that additionally take the stream `s`.
void sparse_gather_cuda(cudaStream_t s, torch::Tensor buffer,
torch::Tensor features, torch::Tensor indices,
int size);
void sparse_scatter_add_cuda(cudaStream_t s, torch::Tensor buffer,
torch::Tensor outFeatures, torch::Tensor indices,
int size);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// Identifiers for the available sparse-convolution algorithms. Values are
// consecutive integers starting at kNative = 0.
enum ConvAlgo {
kNative = 0,
kBatch,
kBatchGemmGather,
kSparseConvNet,
kMinkowskiEngine
};
// Compile-time list holding every ConvAlgo value as an int.
// NOTE(review): presumably used to dispatch on the runtime `algo` argument --
// confirm at the use sites.
using all_conv_algos_t = tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather,
kSparseConvNet, kMinkowskiEngine>;
// torch.jit's documentation says custom ops only support int64 integers, so
// the integer (and flag-like `_subM`, `_transpose`, `_useHash`, `_inverse`)
// parameters below are declared as int64_t and need to be converted to int32
// by the implementation.
// Declarations only; the definitions are not part of this header.
std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash);
// Forward convolution; `algo` is presumably a ConvAlgo value -- TODO confirm.
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
int64_t algo);
// Backward convolution; returns several tensors (presumably the input and
// filter gradients -- TODO confirm order against the implementation).
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo);
} // namespace spconv
#endif
\ No newline at end of file
#pragma once
#include <cutlass/gemm/device/gemm.h>
#include <type_traits>
namespace spconv {
// Accumulator type used for a GEMM over element type T: float when T is
// cutlass::half_t, otherwise T itself.
template <typename T>
using determine_acc_t =
std::conditional_t<std::is_same<T, cutlass::half_t>::value, float, T>;
// Runs a CUTLASS device GEMM on stream `s`, with C used as both the source
// and the destination matrix (in place).
//
// Template parameters:
//   T                      element type of A, B and C; the accumulator type
//                          is determine_acc_t<T>.
//   TransA/TransB/TransC   select the layout of the matching matrix:
//                          true -> column-major, false -> row-major.
// Parameters:
//   M, N, K                GEMM problem dimensions.
//   A, lda / B, ldb / C, ldc   matrix pointers and leading dimensions.
//   alpha, beta            scalars handed to the CUTLASS epilogue.
// Returns cudaSuccess when CUTLASS reports kSuccess and cudaErrorUnknown for
// every other CUTLASS status (the specific status is not propagated).
//
// All other Gemm template arguments are left at the CUTLASS defaults; see
// `cutlass/gemm/device/gemm.h` and
// `cutlass/gemm/device/default_gemm_configuration.h`.
template <typename T, bool TransA, bool TransB, bool TransC>
cudaError_t cutlassGemm(cudaStream_t s, int M, int N, int K, T alpha,
                        T const *A, int lda, T const *B, int ldb, T beta, T *C,
                        int ldc) {
  using Accumulator = determine_acc_t<T>;
  using ColMajor = cutlass::layout::ColumnMajor;
  using RowMajor = cutlass::layout::RowMajor;
  using LayoutA = std::conditional_t<TransA, ColMajor, RowMajor>;
  using LayoutB = std::conditional_t<TransB, ColMajor, RowMajor>;
  using LayoutC = std::conditional_t<TransC, ColMajor, RowMajor>;
  using GemmOp = cutlass::gemm::device::Gemm<T, LayoutA,   // A: type, layout
                                             T, LayoutB,   // B: type, layout
                                             T, LayoutC,   // C: type, layout
                                             Accumulator>; // accumulator type

  // Argument objects are built on the host and passed to the kernel by value.
  typename GemmOp::Arguments args({M, N, K},      // problem size
                                  {A, lda},       // source A
                                  {B, ldb},       // source B
                                  {C, ldc},       // source C
                                  {C, ldc},       // destination D (same as C)
                                  {alpha, beta}); // epilogue scalars

  GemmOp gemm_op;
  // No workspace is supplied (nullptr); the launch is enqueued on stream `s`.
  const cutlass::Status status = gemm_op(args, nullptr, s);
  return status == cutlass::Status::kSuccess ? cudaSuccess : cudaErrorUnknown;
}
} // namespace spconv
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
// Matrix-multiply entry points backed by CUTLASS. Declarations only; the
// definitions are not part of this header.
// NOTE(review): presumably computes c = a * b into the preallocated `c`;
// required dtypes/layouts are not visible here -- confirm against the
// implementation.
void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b);
// Overload that additionally takes the CUDA stream to use.
void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
torch::Tensor b);
} // namespace spconv
\ No newline at end of file
#include <tensorview/tensor.h>
namespace spconv {
// Tags selecting a hash-map implementation.
enum HashTypes { kDenseMap = 0, kCUDPPHash = 1 };
// Primary template is declared but not defined; only explicit
// specializations can be instantiated.
template <int Impl> struct HashMap;
// Specialization for kDenseMap; currently an empty placeholder. No
// specialization for kCUDPPHash is visible in this header.
template <> struct HashMap<kDenseMap> {};
} // namespace spconv
\ No newline at end of file
#include "NvInfer.h"
#include <memory>
#include <tensorview/tensor.h>
#include <unordered_map>
#include <vector>
namespace trt {
// Maps a TensorRT data type to the matching tensorview dtype:
//   kFLOAT -> tv::float32, kHALF -> tv::float16,
//   kINT32 -> tv::int32,   kINT8 -> tv::int8.
// Any other value raises via TV_THROW_INVALID_ARG("unknown trt dtype").
template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
  using nvinfer1::DataType;
  if (trt_dtype == DataType::kFLOAT) {
    return tv::float32;
  }
  if (trt_dtype == DataType::kHALF) {
    return tv::float16;
  }
  if (trt_dtype == DataType::kINT32) {
    return tv::int32;
  }
  if (trt_dtype == DataType::kINT8) {
    return tv::int8;
  }
  TV_THROW_INVALID_ARG("unknown trt dtype");
}
// Deleter for objects that are released through a `destroy()` member
// function instead of `delete`. Null pointers are ignored.
struct InferDeleter {
  template <typename T> void operator()(T *obj) const {
    if (obj == nullptr) {
      return;
    }
    obj->destroy();
  }
};
// std::unique_ptr that releases its object via InferDeleter.
template <typename T> using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
// nvinfer1::ILogger implementation that writes messages to std::cerr,
// prefixed with a severity label.
class Logger : public nvinfer1::ILogger {
public:
  // `severity` is the threshold stored in reportableSeverity; messages whose
  // severity enum value is greater than it are dropped.
  Logger(Severity severity = Severity::kWARNING)
      : reportableSeverity(severity) {}

  // Prints "<LABEL>: <msg>" followed by std::endl, unless the message is
  // filtered out by the threshold. Severities without a dedicated label are
  // printed as "UNKNOWN: ".
  void log(Severity severity, const char *msg) override {
    if (severity > reportableSeverity) {
      return;
    }
    const char *label = "UNKNOWN: ";
    switch (severity) {
    case Severity::kINTERNAL_ERROR:
      label = "INTERNAL_ERROR: ";
      break;
    case Severity::kERROR:
      label = "ERROR: ";
      break;
    case Severity::kWARNING:
      label = "WARNING: ";
      break;
    case Severity::kINFO:
      label = "INFO: ";
      break;
    default:
      break;
    }
    std::cerr << label << msg << std::endl;
  }

  // Threshold used by log(); public and mutable.
  Severity reportableSeverity;
};
class InferenceContext {
public:
explicit InferenceContext(const std::string &engine_bin, int device)
: logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount) {
TV_THROW_INVALID_ARG("you provide device ", device, " but you only have ",
deviceCount, " device.");
}
cudaSetDevice(device);
auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
nvinfer1::createInferRuntime(logger_));
engine_ =
trt_unique_ptr_t<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(
engine_bin.c_str(), engine_bin.size(), nullptr));
ctx_ = trt_unique_ptr_t<nvinfer1::IExecutionContext>(
engine_->createExecutionContext());
max_batch_size_ = engine_->getMaxBatchSize();
for (int i = 0; i < engine_->getNbBindings(); ++i) {
auto dims = engine_->getBindingDimensions(i);
std::vector<int> shape_vec(dims.d, dims.d + dims.nbDims);
shape_vec.insert(shape_vec.begin(), {max_batch_size_});
tv::TensorShape shape(shape_vec);
std::string name = engine_->getBindingName(i);
auto trt_dtype = engine_->getBindingDataType(i);
auto tv_dtype = trt_dtype_to_tv(trt_dtype);
bool isInput = engine_->bindingIsInput(i);
name_to_idx_[name] = i;
idx_to_name_[i] = name;
name_to_host_mem_.insert({name, tv::Tensor(shape, tv_dtype, -1)});
name_to_dev_mem_.insert({name, tv::Tensor(shape, tv_dtype, 0)});
if (isInput)
inp_idxes_.push_back(i);
else
out_idxes_.push_back(i);
bindings_.push_back(name_to_dev_mem_[name].raw_data());
}
checkCudaErrors(cudaStreamCreate(&stream_));
}
std::unordered_map<std::string, tv::Tensor>
operator()(std::vector<tv::Tensor> inputs) {
TV_ASSERT_INVALID_ARG(inputs.size() == inp_idxes_.size(), "must provide",
inp_idxes_.size(), "inputs, but got", inputs.size());
// inference batch size
int bs = inputs[0].dim(0);
for (auto &inp : inputs) {
TV_ASSERT_INVALID_ARG(inp.dim(0) == bs,
"batch sizes of all input must same");
}
TV_ASSERT_INVALID_ARG(bs <= max_batch_size_, "your batchsize too large", bs,
max_batch_size_);
for (int i = 0; i < inputs.size(); ++i) {
auto &dev_mem = name_to_dev_mem_[idx_to_name_[i]];
auto shape_inp = inputs[i].shape().subshape(1);
auto shape_dev = dev_mem.shape().subshape(1);
TV_ASSERT_INVALID_ARG(shape_inp == shape_dev,
"shape except batch must same", shape_inp,
shape_dev);
dev_mem.slice_first_axis(0, bs).copy_(inputs[i].slice_first_axis(0, bs),
stream_);
}
ctx_->enqueue(bs, bindings_.data(), stream_, nullptr);
for (int i : out_idxes_) {
name_to_host_mem_[idx_to_name_[i]].slice_first_axis(0, bs).copy_(
name_to_dev_mem_[idx_to_name_[i]].slice_first_axis(0, bs), stream_);
}
checkCudaErrors(cudaStreamSynchronize(stream_));
std::unordered_map<std::string, tv::Tensor> output_map;
for (int i = 0; i < out_idxes_.size(); ++i) {
auto name = idx_to_name_[out_idxes_[i]];
output_map[name] = name_to_host_mem_[name].slice_first_axis(0, bs);
}
return output_map;
}
std::unordered_map<std::string, tv::Tensor>
operator()(std::unordered_map<std::string, tv::Tensor> inputs) {
std::vector<tv::Tensor> inputs_vec(inp_idxes_.size());
int count = 0;
for (auto &p : inputs) {
auto iter = name_to_idx_.find(p.first);
TV_ASSERT_INVALID_ARG(iter != name_to_idx_.end(), "cant find your name",
p.first);
inputs_vec[name_to_idx_[p.first]] = p.second;
}
TV_ASSERT_INVALID_ARG(count == inp_idxes_.size(), "your inp not enough");
return (*this)(inputs_vec);
}
tv::Tensor operator[](std::string name) {
auto iter = name_to_host_mem_.find(name);
if (iter == name_to_host_mem_.end()) {
TV_THROW_INVALID_ARG(name, "not found.");
}
return iter->second;
}
std::string repr() {
std::stringstream ss;
ss << "InferenceContext[gpu=" << device_ << "]";
ss << "\n Inputs:";
std::string name;
for (auto &i : inp_idxes_) {
name = idx_to_name_[i];
auto &mem = name_to_host_mem_[name];
ss << "\n " << name << "[" << tv::detail::typeString(mem.dtype())
<< "]: " << mem.shape();
}
ss << "\n Outputs:";
for (auto &i : out_idxes_) {
name = idx_to_name_[i];
auto &mem = name_to_host_mem_[name];
ss << "\n " << name << "[" << tv::detail::typeString(mem.dtype())
<< "]: " << mem.shape();
}
return ss.str();
}
private:
Logger logger_;
trt_unique_ptr_t<nvinfer1::ICudaEngine> engine_;
trt_unique_ptr_t<nvinfer1::IExecutionContext> ctx_;
std::unordered_map<std::string, tv::Tensor> name_to_dev_mem_;
std::unordered_map<std::string, tv::Tensor> name_to_host_mem_;
std::unordered_map<std::string, int> name_to_idx_;
std::unordered_map<int, std::string> idx_to_name_;
std::vector<int> inp_idxes_;
std::vector<int> out_idxes_;
std::vector<void *> bindings_;
cudaStream_t stream_;
int max_batch_size_;
int device_;
};
} // namespace trt
/*
From PyTorch:
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
Samy Bengio, Johnny Mariethoz)
From Caffe2:
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further
mark their specific copyright on a particular contribution, they should
indicate their copyright solely in the commit message of the change when it is
committed.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
America and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <type_traits>
#include <utility>
namespace tv {
// void_t<Ts...> is always `void`; it is used for SFINAE-based detection.
// Both branches are variadic so that code using void_t with several
// arguments compiles identically whether or not the standard library
// provides std::void_t.
#ifdef __cpp_lib_void_t
template <class... Ts> using void_t = std::void_t<Ts...>;
#else
// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
// (it takes CWG1558 into account and also works for older compilers)
template <typename... Ts> struct make_void { typedef void type; };
template <typename... Ts> using void_t = typename make_void<Ts...>::type;
#endif
namespace detail {
// Identity helper passed to if_constexpr callbacks.
struct _identity final {
// type_identity<T> is an alias for T itself.
template <class T> using type_identity = T;
// Perfect-forwards the argument back to the caller unchanged.
template <class T> decltype(auto) operator()(T &&arg) {
return std::forward<T>(arg);
}
};
// Trait: true_type when a Callable can be invoked with a single _identity
// argument, false_type otherwise. The primary template is the "no" answer;
// the partial specialization below is selected only when the call expression
// is well-formed.
template <class Callable, class SfinaeSlot = void>
struct function_takes_identity_argument : std::false_type {};
#if defined(_MSC_VER)
// MSVC has been seen to reject the project-local void_t alias in this
// position. The MSVC versions we build with all provide std::void_t, so that
// one is used directly here.
template <class Callable>
struct function_takes_identity_argument<
    Callable,
    std::void_t<decltype(std::declval<Callable>()(_identity()))>>
    : std::true_type {};
#else
template <class Callable>
struct function_takes_identity_argument<
    Callable, void_t<decltype(std::declval<Callable>()(_identity()))>>
    : std::true_type {};
#endif
// C++14 back end of if_constexpr: the specialization for the (compile-time)
// condition picks which callback is invoked; the other callback is only
// accepted and ignored. Each specialization has two overloads, selected by
// whether the chosen callback accepts an _identity argument.
//
// The chosen callback is invoked through std::forward so that its value
// category matches what function_takes_identity_argument tested and what the
// C++17 implementation of if_constexpr does.
template <bool Condition> struct _if_constexpr;

template <> struct _if_constexpr<true> final {
  template <
      class ThenCallback, class ElseCallback,
      std::enable_if_t<function_takes_identity_argument<ThenCallback>::value,
                       void *> = nullptr>
  static decltype(auto) call(ThenCallback &&thenCallback,
                             ElseCallback && /* elseCallback */) {
    // The _identity instance passed in can be used to delay evaluation of an
    // expression, because the compiler can't know that it's just the identity
    // we're passing in.
    return std::forward<ThenCallback>(thenCallback)(_identity());
  }

  template <
      class ThenCallback, class ElseCallback,
      std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value,
                       void *> = nullptr>
  static decltype(auto) call(ThenCallback &&thenCallback,
                             ElseCallback && /* elseCallback */) {
    // Callback takes no arguments.
    return std::forward<ThenCallback>(thenCallback)();
  }
};

template <> struct _if_constexpr<false> final {
  template <
      class ThenCallback, class ElseCallback,
      std::enable_if_t<function_takes_identity_argument<ElseCallback>::value,
                       void *> = nullptr>
  static decltype(auto) call(ThenCallback && /* thenCallback */,
                             ElseCallback &&elseCallback) {
    // The _identity instance passed in can be used to delay evaluation of an
    // expression, because the compiler can't know that it's just the identity
    // we're passing in.
    return std::forward<ElseCallback>(elseCallback)(_identity());
  }

  template <
      class ThenCallback, class ElseCallback,
      std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value,
                       void *> = nullptr>
  static decltype(auto) call(ThenCallback && /* thenCallback */,
                             ElseCallback &&elseCallback) {
    // Callback takes no arguments.
    return std::forward<ElseCallback>(elseCallback)();
  }
};
} // namespace detail
/*
* Get something like C++17 if constexpr in C++14.
*
* Example 1: simple constexpr if/then/else
* template<int arg> int increment_absolute_value() {
* int result = arg;
* if_constexpr<(arg > 0)>(
* [&] { ++result; }, // then-case
* [&] { --result; } // else-case
* );
* return result;
* }
*
* Example 2: without else case (i.e. conditionally prune code from assembly)
* template<int arg> int decrement_if_positive() {
* int result = arg;
* if_constexpr<(arg > 0)>(
* // This decrement operation is only present in the assembly for
* // template instances with arg > 0.
* [&] { --result; }
* );
* return result;
* }
*
* Example 3: branch based on type (i.e. replacement for SFINAE)
* struct MyClass1 {int value;};
* struct MyClass2 {int val;};
* template <class T>
* int func(T t) {
* return if_constexpr<std::is_same<T, MyClass1>::value>(
* [&](auto _) { return _(t).value; }, // this code is invalid for T ==
* MyClass2, so a regular non-constexpr if statement wouldn't compile
* [&](auto _) { return _(t).val; } // this code is invalid for T ==
* MyClass1
* );
* }
*
* Note: The _ argument passed in Example 3 is the identity function, i.e. it
* does nothing. It is used to force the compiler to delay type checking,
* because the compiler doesn't know what kind of _ is passed in. Without it,
* the compiler would fail when you try to access t.value but the member doesn't
* exist.
*
* Note: In Example 3, both branches return int, so func() returns int. This is
* not necessary. If func() had a return type of "auto", then both branches
* could return different types, say func<MyClass1>() could return int and
* func<MyClass2>() could return string.
*/
template <bool Condition, class ThenCallback, class ElseCallback>
decltype(auto) if_constexpr(ThenCallback &&thenCallback,
ElseCallback &&elseCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping
// it. This will give us better error messages.
if constexpr (Condition) {
if constexpr (detail::function_takes_identity_argument<
ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
}
} else {
if constexpr (detail::function_takes_identity_argument<
ElseCallback>::value) {
return std::forward<ElseCallback>(elseCallback)(detail::_identity());
} else {
return std::forward<ElseCallback>(elseCallback)();
}
}
#else
// C++14 implementation of if constexpr
return detail::_if_constexpr<Condition>::call(
std::forward<ThenCallback>(thenCallback),
std::forward<ElseCallback>(elseCallback));
#endif
}
template <bool Condition, class ThenCallback>
decltype(auto) if_constexpr(ThenCallback &&thenCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping
// it. This will give us better error messages.
if constexpr (Condition) {
if constexpr (detail::function_takes_identity_argument<
ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
}
}
#else
// C++14 implementation of if constexpr
return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback),
[](auto) {});
#endif
}
} // namespace tv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <sstream>
#include <stdexcept>
// Optional stack-trace support: when TV_USE_STACKTRACE is defined, select a
// Boost.Stacktrace backend macro and then include the library header.
#ifdef TV_USE_STACKTRACE
// NOTE(review): && binds tighter than ||, so !defined(__CYGWIN__) below only
// qualifies defined(__WIN32), not the other two tests. Confirm that this is
// the intended grouping.
#if defined(WIN32) || defined(_WIN32) || \
defined(__WIN32) && !defined(__CYGWIN__)
#define BOOST_STACKTRACE_USE_WINDBG
#else
// require linking with -ldl and -lbacktrace in linux
#define BOOST_STACKTRACE_USE_BACKTRACE
#endif
#include <boost/stacktrace.hpp>
#endif
namespace tv {
// Streams a single value into ss. This is the terminating case of the
// variadic overload below.
template <class SStream, class Last>
void sstream_print(SStream &ss, Last val) {
  ss << val;
}

// Streams val followed by one space, then recurses on the remaining
// arguments, so the values end up separated by single spaces with no
// trailing space.
template <class SStream, class First, class... Rest>
void sstream_print(SStream &ss, First val, Rest... args) {
  ss << val << " ";
  sstream_print(ss, args...);
}

// Formats all arguments space-separated into a temporary string stream and
// writes the result plus std::endl to std::cout.
template <class... TArgs> void ssprint(TArgs... args) {
  std::stringstream buffer;
  sstream_print(buffer, args...);
  std::cout << buffer.str() << std::endl;
}
// TV_BACKTRACE_PRINT(ss): when TV_USE_STACKTRACE is defined, streams a
// newline followed by the current boost::stacktrace::stacktrace() into ss;
// otherwise it expands to nothing.
#ifdef TV_USE_STACKTRACE
#define TV_BACKTRACE_PRINT(ss) \
ss << std::endl << boost::stacktrace::stacktrace();
#else
#define TV_BACKTRACE_PRINT(ss)
#endif
// TV_THROW_RT_ERR(args...): builds a message consisting of __FILE__, a space,
// __LINE__, a newline and then the arguments separated by single spaces
// (plus a stack trace when TV_USE_STACKTRACE is defined) and throws it as
// std::runtime_error.
// The expansion is a bare { ... } block, and __VA_ARGS__ is pasted after a
// comma, so at least one argument has to be supplied.
#define TV_THROW_RT_ERR(...) \
{ \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
}
// TV_THROW_INVALID_ARG(args...): same message layout as TV_THROW_RT_ERR, but
// throws std::invalid_argument.
#define TV_THROW_INVALID_ARG(...) \
{ \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::invalid_argument(__macro_s.str()); \
}
// TV_ASSERT_RT_ERR(expr, args...): evaluates expr once; when it is false,
// throws std::runtime_error whose message contains __FILE__, __LINE__, the
// stringified expression and the space-separated args (plus a stack trace
// when TV_USE_STACKTRACE is defined). The message is only built on failure.
// The expansion is a bare { ... } block, and __VA_ARGS__ is pasted after a
// comma, so at least one message argument has to be supplied.
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
} \
}
// TV_ASSERT_INVALID_ARG(expr, args...): same check and message layout as
// TV_ASSERT_RT_ERR, but throws std::invalid_argument on failure.
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
} // namespace tv
\ No newline at end of file
#pragma once
// from pytorch.aten
#include "tensorview.h"
#include <type_traits>
namespace tv {
namespace cuda {
// Integer division of a by b that rounds up for positive operands. The
// quotient is converted to int on return.
template <typename T1, typename T2> inline int DivUp(const T1 a, const T2 b) {
  const auto biased_numerator = a + b - 1;
  return biased_numerator / b;
}
// Upper bound on threads per block used by the launch helpers.
// Use 1024 threads per block, which requires cuda sm_2x or above
constexpr int CUDA_NUM_THREADS = 1024;

// CUDA: number of threads per block for N work items. For positive N this is
// N rounded up to a multiple of 32, capped at CUDA_NUM_THREADS.
inline int getNumThreads(const int N) {
  const bool fits_in_one_block = N <= CUDA_NUM_THREADS;
  return fits_in_one_block ? DivUp(N, 32) * 32 : CUDA_NUM_THREADS;
}
// CUDA: number of blocks needed to cover N work items when each block runs
// getNumThreads(N) threads. Throws std::runtime_error (through
// TV_ASSERT_RT_ERR) when N is not positive.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=", N);
  const int threads_per_block = getNumThreads(N);
  return DivUp(N, threads_per_block);
}
} // namespace cuda
} // namespace tv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensor.h"
#include "tensorview.h"
#include <eigen3/Eigen/Dense>
namespace tv {
// Wraps the memory of a 1-D or 2-D TensorView in a row-major Eigen::Map
// without copying.
//
// A 2-D view of shape (r, c) maps to an r x c matrix. A 1-D view of length n
// maps to a 1 x n matrix: its only dimension is the column count. (The
// previous code read view.dim(1) in that case, which does not exist for a
// 1-D view, and compared Row against the element count.)
//
// Row / Col: optional compile-time extents; when not Eigen::Dynamic they are
// checked against the runtime shape.
// Throws std::invalid_argument (through TV_ASSERT_INVALID_ARG) when the view
// is not 1-D or 2-D, or when a fixed extent does not match.
template <typename T, int Row = Eigen::Dynamic, int Col = Eigen::Dynamic>
Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>
tv2eigen(TensorView<T> view) {
  TV_ASSERT_INVALID_ARG(view.ndim() <= 2 && view.ndim() > 0, "error");
  const bool is_matrix = view.ndim() == 2;
  // For a 1-D view, treat the data as a single row.
  const int row = is_matrix ? static_cast<int>(view.dim(0)) : 1;
  const int col = is_matrix ? static_cast<int>(view.dim(1))
                            : static_cast<int>(view.dim(0));
  if (Row != Eigen::Dynamic) {
    TV_ASSERT_INVALID_ARG(row == Row, "error");
  }
  if (Col != Eigen::Dynamic) {
    TV_ASSERT_INVALID_ARG(col == Col, "error");
  }
  Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>> eigen_map(
      view.data(), row, col);
  return eigen_map;
}
} // namespace tv
#pragma once
// from tensorflow
namespace tv {
namespace detail {
// Range object for device-side range-based for loops. Iteration starts at
// `begin`, advances by `delta` on every step and stops once the index is no
// longer below `end`.
template <typename T> class KernelLoop {
  // Iterator over the index sequence. An iterator whose step is zero acts as
  // the end sentinel.
  struct Iterator {
    __forceinline__ __device__ Iterator(T index, T delta)
        : pos_(index), step_(delta) {}

    __forceinline__ __device__ T operator*() const { return pos_; }

    __forceinline__ __device__ Iterator &operator++() {
      pos_ += step_;
      return *this;
    }

    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      const bool above = pos_ > other.pos_;
      const bool below = pos_ < other.pos_;
      // Anything past an end iterator (step == 0) is equal.
      // In range-based for loops, this optimizes to 'return below'.
      return !other.step_ ? below : (!step_ ? above : (below || above));
    }

  private:
    T pos_;
    const T step_;
  };

public:
  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
      : first_(begin), stride_(delta), limit_(end) {}

  // Iterator positioned at the first index, advancing by the loop stride.
  __forceinline__ __device__ Iterator begin() const {
    return Iterator(first_, stride_);
  }

  // End sentinel: positioned at the limit with a zero step.
  __forceinline__ __device__ Iterator end() const {
    return Iterator(limit_, 0);
  }

private:
  T first_;
  T stride_;
  T limit_;
};
} // namespace detail
// Helper to visit indices below count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }
// Each thread starts at its global x index and advances by the total number
// of threads along x multiplied by NumILP.
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
  // Convert to T before multiplying: the built-in index variables are 32-bit
  // unsigned, so for a wider T the products would otherwise wrap before
  // being widened.
  const T first = static_cast<T>(blockIdx.x) * blockDim.x + threadIdx.x;
  const T stride = static_cast<T>(gridDim.x) * blockDim.x * NumILP;
  return detail::KernelLoop<T>(first, stride, count);
}
// Helper to visit indices below count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
// Each thread starts at its global y index and advances by the total number
// of threads along y multiplied by NumILP.
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
  // Convert to T before multiplying: the built-in index variables are 32-bit
  // unsigned, so for a wider T the products would otherwise wrap before
  // being widened.
  const T first = static_cast<T>(blockIdx.y) * blockDim.y + threadIdx.y;
  const T stride = static_cast<T>(gridDim.y) * blockDim.y * NumILP;
  return detail::KernelLoop<T>(first, stride, count);
}
// Helper to visit indices below count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
// Each thread starts at its global z index and advances by the total number
// of threads along z multiplied by NumILP.
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
  // Convert to T before multiplying: the built-in index variables are 32-bit
  // unsigned, so for a wider T the products would otherwise wrap before
  // being widened.
  const T first = static_cast<T>(blockIdx.z) * blockDim.z + threadIdx.z;
  const T stride = static_cast<T>(gridDim.z) * blockDim.z * NumILP;
  return detail::KernelLoop<T>(first, stride, count);
}
} // namespace tv
\ No newline at end of file
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>
namespace tv {

// Plain compile-time list of types.
template <class... Elems> struct mp_list {};

// mp_list holding std::integral_constant<T, V> for every value V.
template <class T, T... Values>
using mp_list_c = mp_list<std::integral_constant<T, Values>...>;

// mp_list_c specialised for int values.
template <int... Values>
using mp_list_int_c = mp_list<std::integral_constant<int, Values>...>;

namespace detail {

// Calls f once for each element type, in list order, passing a
// value-initialized object of that type, then returns f.
template <class... Ts, class F>
constexpr F mp_for_each_impl(mp_list<Ts...>, F &&f) {
  return static_cast<void>(std::initializer_list<int>{(f(Ts()), 0)...}),
         std::forward<F>(f);
}

// Empty list: nothing to call, just hand f back.
template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
  return std::forward<F>(f);
}

// Primary template, intentionally without a member `type`.
template <class From, template <class...> class To> struct mp_rename_impl {
  // An error "no type named 'type'" here means that the first argument to
  // mp_rename is not a list
};

// From<T...> -> To<T...>: keeps the arguments, swaps the outer template.
template <template <class...> class From, class... T,
          template <class...> class To>
struct mp_rename_impl<From<T...>, To> {
  using type = To<T...>;
};

} // namespace detail

// Number of types in the pack, as a std::integral_constant.
template <class... T>
using mp_length = std::integral_constant<std::size_t, sizeof...(T)>;

// Re-instantiates template B with the template arguments of list type A.
template <class A, template <class...> class B>
using mp_rename = typename detail::mp_rename_impl<A, B>::type;

// Number of elements of list type L.
template <class L> using mp_size = mp_rename<L, mp_length>;

// Invokes f with a value-initialized object of every element type of L, in
// order, and returns f.
template <class L, class F> constexpr F mp_for_each(F &&f) {
  using as_mp_list = mp_rename<L, mp_list>;
  return detail::mp_for_each_impl(as_mp_list(), std::forward<F>(f));
}

} // namespace tv
#endif
\ No newline at end of file
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {

// Result types for the sizeof-based SFINAE probes below: the two types have
// different sizes, so sizeof() on a probe call tells which overload won.
struct sfinae_base {
  typedef char yes;
  typedef yes no[2];
};

// value is true when T::const_iterator names a type.
template <typename T> struct has_const_iterator : private sfinae_base {
private:
  template <typename U> static yes &probe(typename U::const_iterator *);
  template <typename U> static no &probe(...);

public:
  static const bool value = sizeof(yes) == sizeof(probe<T>(nullptr));
  using type = T;
};

// beg_value / end_value are true when T has a const member begin() / end()
// whose return type is exactly T::const_iterator.
template <typename T> struct has_begin_end : private sfinae_base {
private:
  template <typename U>
  static yes &check_begin(
      typename std::enable_if<
          std::is_same<decltype(static_cast<typename U::const_iterator (U::*)()
                                                const>(&U::begin)),
                       typename U::const_iterator (U::*)() const>::value,
          void>::type *);
  template <typename U> static no &check_begin(...);

  template <typename U>
  static yes &check_end(
      typename std::enable_if<
          std::is_same<decltype(static_cast<typename U::const_iterator (U::*)()
                                                const>(&U::end)),
                       typename U::const_iterator (U::*)() const>::value,
          void>::type *);
  template <typename U> static no &check_end(...);

public:
  static bool const beg_value = sizeof(yes) == sizeof(check_begin<T>(nullptr));
  static bool const end_value = sizeof(yes) == sizeof(check_end<T>(nullptr));
};

} // namespace detail
// Holds the three delimiter strings (opening text, element separator,
// closing text) for one character type.
template <typename TChar> struct delimiters_values {
  typedef TChar char_type;
  const TChar *prefix;
  const TChar *delimiter;
  const TChar *postfix;
};

// Associates a container type T and a character type TChar with their
// delimiter strings. `values` is only declared here; definitions are
// supplied per specialization.
template <typename T, typename TChar> struct delimiters {
  typedef delimiters_values<TChar> type;
  static const type values;
};
// Functor that prints a container: prefix, the elements separated by the
// delimiter, then postfix. Use it directly when you want to specify a
// non-default delimiters type. The printing logic can be customized by
// specializing the nested `printer` template.
// Note: only a reference to the container is stored.
template <typename T, typename TChar = char,
          typename TCharTraits = ::std::char_traits<TChar>,
          typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
  using delimiters_type = TDelimiters;
  using ostream_type = std::basic_ostream<TChar, TCharTraits>;

  // Default body printer: walks begin(c)..end(c) and writes the delimiter
  // (when it is non-null) between consecutive elements.
  template <typename U> struct printer {
    static void print_body(const U &c, ostream_type &stream) {
      using std::begin;
      using std::end;
      auto cur = begin(c);
      const auto last = end(c);
      if (cur == last) {
        return; // empty range: nothing between prefix and postfix
      }
      stream << *cur;
      while (!(++cur == last)) {
        if (delimiters_type::values.delimiter != nullptr) {
          stream << delimiters_type::values.delimiter;
        }
        stream << *cur;
      }
    }
  };

  print_container_helper(const T &container) : container_(container) {}

  // Writes prefix, body and postfix to stream; null prefix/postfix strings
  // are skipped.
  inline void operator()(ostream_type &stream) const {
    if (delimiters_type::values.prefix != nullptr) {
      stream << delimiters_type::values.prefix;
    }
    printer<T>::print_body(container_, stream);
    if (delimiters_type::values.postfix != nullptr) {
      stream << delimiters_type::values.postfix;
    }
  }

private:
  const T &container_;
};
// Specialization for pairs: prints first, the delimiter (when non-null),
// then second.
template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
                              TDelimiters>::printer<std::pair<T1, T2>> {
  using ostream_type =
      typename print_container_helper<T, TChar, TCharTraits,
                                      TDelimiters>::ostream_type;

  static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
    using delims =
        typename print_container_helper<T, TChar, TCharTraits,
                                        TDelimiters>::delimiters_type;
    stream << c.first;
    if (delims::values.delimiter != NULL) {
      stream << delims::values.delimiter;
    }
    stream << c.second;
  }
};
// Specialization for tuples: prints the elements in index order with the
// delimiter (when non-null) before every element except the first. The
// recursion is driven by overloads of tuple_print on the tag type Int<I>.
template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
                              TDelimiters>::printer<std::tuple<Args...>> {
  using ostream_type =
      typename print_container_helper<T, TChar, TCharTraits,
                                      TDelimiters>::ostream_type;
  using element_type = std::tuple<Args...>;

  // Tag carrying the index of the next element to print.
  template <std::size_t I> struct Int {};

  static void print_body(const element_type &c, ostream_type &stream) {
    tuple_print(c, stream, Int<0>());
  }

  // End of recursion: the index equals the number of elements.
  static void tuple_print(const element_type &, ostream_type &,
                          Int<sizeof...(Args)>) {}

  // First element: printed without a leading delimiter. For an empty tuple
  // the tag parameter becomes std::nullptr_t, so this overload does not
  // clash with the terminating overload above.
  static void
  tuple_print(const element_type &c, ostream_type &stream,
              typename std::conditional<sizeof...(Args) != 0, Int<0>,
                                        std::nullptr_t>::type) {
    stream << std::get<0>(c);
    tuple_print(c, stream, Int<1>());
  }

  // Remaining elements: delimiter first, then element N, then recurse.
  template <std::size_t N>
  static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
    using delims =
        typename print_container_helper<T, TChar, TCharTraits,
                                        TDelimiters>::delimiters_type;
    if (delims::values.delimiter != NULL) {
      stream << delims::values.delimiter;
    }
    stream << std::get<N>(c);
    tuple_print(c, stream, Int<N + 1>());
  }
};
// Stream insertion for print_container_helper: runs the helper on the stream
// and returns the stream so insertions can be chained.
template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
    std::basic_ostream<TChar, TCharTraits> &stream,
    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
  helper(stream);
  return stream;
}
// Basic is_container template: true when the type has a const_iterator
// member type plus matching const begin() and end() members. Specialize it
// to derive from std::true_type for any further type that should be printed
// as a container.
template <typename Candidate>
struct is_container
    : public std::integral_constant<
          bool, detail::has_const_iterator<Candidate>::value &&
                    detail::has_begin_end<Candidate>::beg_value &&
                    detail::has_begin_end<Candidate>::end_value> {};

// Built-in arrays are containers ...
template <typename Elem, std::size_t N>
struct is_container<Elem[N]> : std::true_type {};

// ... except char arrays, which are excluded.
template <std::size_t N> struct is_container<char[N]> : std::false_type {};

// std::valarray, std::pair and std::tuple are opted in explicitly.
template <typename Elem>
struct is_container<std::valarray<Elem>> : std::true_type {};

template <typename First, typename Second>
struct is_container<std::pair<First, Second>> : std::true_type {};

template <typename... Elems>
struct is_container<std::tuple<Elems...>> : std::true_type {};
// Default delimiters: "[", ", ", "]" for both narrow and wide streams.
template <typename Container> struct delimiters<Container, char> {
  static const delimiters_values<char> values;
};

template <typename Container>
const delimiters_values<char> delimiters<Container, char>::values = {"[", ", ",
                                                                     "]"};

template <typename Container> struct delimiters<Container, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename Container>
const delimiters_values<wchar_t> delimiters<Container, wchar_t>::values = {
    L"[", L", ", L"]"};
// Delimiters for (multi)set and unordered_(multi)set: "{", ", ", "}" for
// both narrow and wide streams.

// std::set
template <typename Key, typename Compare, typename Alloc>
struct delimiters<::std::set<Key, Compare, Alloc>, char> {
  static const delimiters_values<char> values;
};

template <typename Key, typename Compare, typename Alloc>
const delimiters_values<char>
    delimiters<::std::set<Key, Compare, Alloc>, char>::values = {"{", ", ",
                                                                 "}"};

template <typename Key, typename Compare, typename Alloc>
struct delimiters<::std::set<Key, Compare, Alloc>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename Key, typename Compare, typename Alloc>
const delimiters_values<wchar_t>
    delimiters<::std::set<Key, Compare, Alloc>, wchar_t>::values = {
        L"{", L", ", L"}"};

// std::multiset
template <typename Key, typename Compare, typename Alloc>
struct delimiters<::std::multiset<Key, Compare, Alloc>, char> {
  static const delimiters_values<char> values;
};

template <typename Key, typename Compare, typename Alloc>
const delimiters_values<char>
    delimiters<::std::multiset<Key, Compare, Alloc>, char>::values = {
        "{", ", ", "}"};

template <typename Key, typename Compare, typename Alloc>
struct delimiters<::std::multiset<Key, Compare, Alloc>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename Key, typename Compare, typename Alloc>
const delimiters_values<wchar_t>
    delimiters<::std::multiset<Key, Compare, Alloc>, wchar_t>::values = {
        L"{", L", ", L"}"};

// std::unordered_set
template <typename Key, typename Hash, typename KeyEq, typename Alloc>
struct delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>, char> {
  static const delimiters_values<char> values;
};

template <typename Key, typename Hash, typename KeyEq, typename Alloc>
const delimiters_values<char>
    delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>, char>::values = {
        "{", ", ", "}"};

template <typename Key, typename Hash, typename KeyEq, typename Alloc>
struct delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename Key, typename Hash, typename KeyEq, typename Alloc>
const delimiters_values<wchar_t>
    delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>,
               wchar_t>::values = {L"{", L", ", L"}"};

// std::unordered_multiset
template <typename Key, typename Hash, typename KeyEq, typename Alloc>
struct delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>, char> {
  static const delimiters_values<char> values;
};

template <typename Key, typename Hash, typename KeyEq, typename Alloc>
const delimiters_values<char>
    delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>,
               char>::values = {"{", ", ", "}"};

template <typename Key, typename Hash, typename KeyEq, typename Alloc>
struct delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>,
                  wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename Key, typename Hash, typename KeyEq, typename Alloc>
const delimiters_values<wchar_t>
    delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>,
               wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple: "(", ", ", ")" for both narrow and wide
// streams.

// std::pair
template <typename First, typename Second>
struct delimiters<std::pair<First, Second>, char> {
  static const delimiters_values<char> values;
};

template <typename First, typename Second>
const delimiters_values<char>
    delimiters<std::pair<First, Second>, char>::values = {"(", ", ", ")"};

template <typename First, typename Second>
struct delimiters<::std::pair<First, Second>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename First, typename Second>
const delimiters_values<wchar_t>
    delimiters<::std::pair<First, Second>, wchar_t>::values = {L"(", L", ",
                                                               L")"};

// std::tuple
template <typename... Elems> struct delimiters<std::tuple<Elems...>, char> {
  static const delimiters_values<char> values;
};

template <typename... Elems>
const delimiters_values<char> delimiters<std::tuple<Elems...>, char>::values =
    {"(", ", ", ")"};

template <typename... Elems>
struct delimiters<::std::tuple<Elems...>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename... Elems>
const delimiters_values<wchar_t>
    delimiters<::std::tuple<Elems...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar.
// Usage: "cout << pretty_print::custom_delims<MyDelims>(x)".
//
// This base class is the type-erased interface: one printing entry point for
// narrow streams and one for wide streams.
struct custom_delims_base {
  virtual ~custom_delims_base() {}

  // Print to a narrow stream and return it.
  virtual std::ostream &stream(::std::ostream &) = 0;

  // Print to a wide stream and return it.
  virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims> struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &
operator<<(std::basic_ostream<TChar, TCharTraits> &s,
const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size. It exposes
// const_iterator, begin() and end(), which is what is_container looks for.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T> struct array_wrapper_n {
  using const_iterator = const T *;
  using value_type = T;

  array_wrapper_n(const T *const a, size_t n) : data_(a), count_(n) {}

  // Pointer to the first element.
  inline const_iterator begin() const { return data_; }

  // Pointer one past the last element.
  inline const_iterator end() const { return data_ + count_; }

private:
  const T *const data_;
  size_t count_;
};
// A wrapper for hash-table based containers that offer local iterators to
// each bucket. It exposes one bucket as a range.
// Usage: std::cout << bucket_print(m, 4) << std::endl;
// (Prints the bucket with index 4 of container m.)
// Only a reference to the container is stored.
template <typename T> struct bucket_print_wrapper {
  using const_iterator = typename T::const_local_iterator;
  using size_type = typename T::size_type;

  bucket_print_wrapper(const T &m, size_type bucket)
      : m_map(m), n(bucket) {}

  // Local iterator to the start of the selected bucket.
  const_iterator begin() const { return m_map.cbegin(n); }

  // Local iterator past the end of the selected bucket.
  const_iterator end() const { return m_map.cend(n); }

private:
  const T &m_map;
  const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers

// Wraps the n elements starting at a so they can be streamed as a container.
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
                                                           size_t n) {
  using wrapper = pretty_print::array_wrapper_n<T>;
  return wrapper(a, n);
}

// Wraps bucket n of the hash container m so it can be streamed as a
// container.
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
                                                   typename T::size_type n) {
  using wrapper = pretty_print::bucket_print_wrapper<T>;
  return wrapper(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters. Participates in
// overload resolution only for types where pretty_print::is_container is
// true.
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
                          basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
  using default_printer =
      ::pretty_print::print_container_helper<T, TChar, TCharTraits>;
  return stream << default_printer(container);
}
} // namespace std
#endif // H_PRETTY_PRINT
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensor.h"
#include "tensorview.h"
#include <algorithm>
#include <array>
#include <iostream>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
namespace py = pybind11;
namespace tv {
template <typename Tarr> bool is_c_style(const Tarr &arr) {
return bool(arr.flags() & py::array::c_style);
}
// Builds a mutable TensorView<T, Rank> over the buffer of a typed numpy
// array; the view is constructed from arr.mutable_data() and the array's
// shape.
// Throws std::invalid_argument (through TV_ASSERT_INVALID_ARG) when the
// array does not have the c_style flag, or when Rank >= 0 and the number of
// dimensions differs from Rank.
template <typename T, int Rank = -1>
TensorView<T, Rank> arrayt2tv(py::array_t<T> arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  Shape shape;
  int axis = 0;
  while (axis < arr.ndim()) {
    shape.push_back(arr.shape(axis));
    ++axis;
  }
  if (Rank >= 0) {
    TV_ASSERT_INVALID_ARG(shape.ndim() == Rank, "error");
  }
  return TensorView<T, Rank>(arr.mutable_data(), shape);
}
// Read-only counterpart of arrayt2tv: builds a TensorView<const T, Rank>
// over the buffer of a typed numpy array, constructed from arr.data() and
// the array's shape.
// The return type carries Rank, matching both arrayt2tv and the object
// constructed in the return statement. It previously dropped Rank.
// Throws std::invalid_argument (through TV_ASSERT_INVALID_ARG) when the
// array does not have the c_style flag, or when Rank >= 0 and the number of
// dimensions differs from Rank.
template <typename T, int Rank = -1>
TensorView<const T, Rank> carrayt2tv(py::array_t<T> arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  Shape shape;
  for (int i = 0; i < arr.ndim(); ++i) {
    shape.push_back(arr.shape(i));
  }
  if (Rank >= 0) {
    TV_ASSERT_INVALID_ARG(shape.ndim() == Rank, "error");
  }
  return TensorView<const T, Rank>(arr.data(), shape);
}
// Maps a numpy array's dtype (kind character plus item size in bytes) to the
// matching tv::DType.
// Handled kinds: 'b' (bool), 'i' (signed integer, 1/2/4/8 bytes),
// 'u' (unsigned integer, 1/2/4/8 bytes) and 'f' (floating point, 2/4/8
// bytes).
// Throws std::runtime_error (through TV_THROW_RT_ERR) for every other
// kind/size combination.
template <typename Tarr> tv::DType get_array_tv_dtype(const Tarr &arr) {
  switch (arr.dtype().kind()) {
  case 'b':
    return tv::bool_;
  case 'i': {
    switch (arr.itemsize()) {
    case 1:
      return tv::int8;
    case 2:
      return tv::int16;
    case 4:
      return tv::int32;
    case 8:
      return tv::int64;
    default:
      break;
    }
    // Unsupported width: go straight to the error below instead of falling
    // through into the next kind's cases.
    break;
  }
  case 'u': {
    switch (arr.itemsize()) {
    case 1:
      return tv::uint8;
    case 2:
      return tv::uint16;
    case 4:
      return tv::uint32;
    case 8:
      return tv::uint64;
    default:
      break;
    }
    break;
  }
  case 'f': {
    switch (arr.itemsize()) {
    case 2:
      return tv::float16;
    case 4:
      return tv::float32;
    case 8:
      return tv::float64;
    default:
      break;
    }
    break;
  }
  default:
    break;
  }
  TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
}
// Creates a tv::Tensor through tv::from_blob from the array's mutable
// buffer, using the array's shape, the dtype reported by
// get_array_tv_dtype, and device -1.
// Throws std::invalid_argument (through TV_ASSERT_INVALID_ARG) when the
// array does not have the c_style flag.
template <typename Tarr> Tensor array2tensor(Tarr &arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  TensorShape shape;
  int axis = 0;
  while (axis < arr.ndim()) {
    shape.push_back(arr.shape(axis));
    ++axis;
  }
  return tv::from_blob(arr.mutable_data(), shape, get_array_tv_dtype(arr), -1);
}
// Typed variant of array2tensor: the dtype comes from the element type T
// (tv::type_v<T>) rather than from the array's runtime dtype. The tensor is
// created through tv::from_blob with device -1.
// Throws std::invalid_argument (through TV_ASSERT_INVALID_ARG) when the
// array does not have the c_style flag.
template <typename T> Tensor arrayt2tensor(py::array_t<T> &arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  TensorShape shape;
  int axis = 0;
  while (axis < arr.ndim()) {
    shape.push_back(arr.shape(axis));
    ++axis;
  }
  return tv::from_blob(arr.mutable_data(), shape, tv::type_v<T>, -1);
}
// Converts a tv dtype enumerator to the py::dtype constructed from the
// corresponding numpy type name.
// Throws std::invalid_argument (through TV_THROW_INVALID_ARG) for a value
// that is not one of the handled enumerators.
template <typename TDType> py::dtype tv_dtype_to_py(TDType d) {
  // Name handed to py::dtype; stays null for an unhandled value.
  const char *numpy_name = nullptr;
  switch (d) {
  case float32:
    numpy_name = "float32";
    break;
  case float64:
    numpy_name = "float64";
    break;
  case float16:
    numpy_name = "float16";
    break;
  case int32:
    numpy_name = "int32";
    break;
  case int16:
    numpy_name = "int16";
    break;
  case int8:
    numpy_name = "int8";
    break;
  case int64:
    numpy_name = "int64";
    break;
  case uint32:
    numpy_name = "uint32";
    break;
  case uint16:
    numpy_name = "uint16";
    break;
  case uint8:
    numpy_name = "uint8";
    break;
  case uint64:
    numpy_name = "uint64";
    break;
  case bool_:
    numpy_name = "bool_";
    break;
  default:
    break;
  }
  if (numpy_name != nullptr) {
    return py::dtype(numpy_name);
  }
  TV_THROW_INVALID_ARG("unknown dtype", d);
}
// Converts a CPU tensor to a numpy array.
// This is a template only so that the function can be defined in a header.
// Throws std::invalid_argument (through TV_ASSERT_INVALID_ARG) when
// tensor.device() is not -1.
template <typename Ttensor> py::array tensor2array(Ttensor &tensor) {
  // Do not call this function while the GIL is released.
  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
  auto dims = tensor.shape();
  std::vector<int> extents(dims.begin(), dims.end());
  auto numpy_dtype = tv_dtype_to_py(tensor.dtype());
  // Constructing py::array this way copies the content behind the pointer.
  // That is intended: ownership cannot be transferred from the C++
  // tv::Tensor to the numpy array, because the C++ object may be deleted.
  return py::array(numpy_dtype, extents, {}, tensor.raw_data());
}
} // namespace tv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
tv::Tensor is a lightweight header-only tensor container
without template and annoying dependencies. no algorithm is implemented.
it should only be used when you want a no-template simple container but
dont want to link with libtorch.
If you can use libtorch, dont use tv::Tensor.
*/
#pragma once
#include "cc17.h"
#include "mp_helper.h"
#include "tensorview.h"
#include <cstring>
#include <iomanip>
#include <memory>
#include <type_traits>
#ifdef TV_CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif
namespace tv {
// Runtime element-type tag stored by tv::Tensor.
// The enum is unscoped, so the enumerators are used as tv::float32 etc.
// Values follow declaration order (float32 == 0 ... uint64 == 11), so
// reordering the list changes every numeric value.
enum DType {
float32,
int32,
int16,
int8,
float64,
bool_,
uint8,
float16,
int64,
uint16,
uint32,
uint64
};
namespace detail {
// Every DType enumerator as a compile-time list of ints.
using dtype_collection_t =
tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8,
float16, int64, uint16, uint32, uint64>;
// C++ element types that the Dispatch<> helpers iterate over by default.
// NOTE(review): the TV_CUDA and non-CUDA branches are currently identical, and
// neither contains a half-precision type, so a float16 tensor has no match in
// this list.
#ifdef TV_CUDA
using all_tensor_types_t =
std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
uint16_t, uint32_t, uint64_t, bool>;
#else
using all_tensor_types_t =
std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
uint16_t, uint32_t, uint64_t, bool>;
#endif
// Buffer of `size` elements of T used as the backing store of tv::Tensor.
//
// Two modes:
//  * owning   - built from (size, device, managed, pinned); the memory is
//               allocated here and released in the destructor.
//  * borrowed - built from (ptr, size, device); the pointer is only stored
//               and is never freed by this class (from_blob_ == true).
//
// device == -1 means host memory; any other value means CUDA memory and
// requires TV_CUDA. The CUDA device itself is NOT selected here; the caller
// must have made the wanted device current beforehand.
//
// NOTE: the implicitly generated copy operations copy the raw pointer, so
// copying an owning instance leads to a double free. Hold instances through
// std::shared_ptr, as tv::Tensor does.
template <typename T> class TensorStorage {
public:
  // Allocates `size` elements.
  //   device  : -1 for host, otherwise a CUDA allocation.
  //   managed : device allocations use cudaMallocManaged instead of
  //             cudaMalloc.
  //   pinned  : host allocations use cudaMallocHost instead of new[].
  // Raises through TV_THROW_INVALID_ARG when CUDA memory (device or pinned)
  // is requested in a build without TV_CUDA.
  TensorStorage(size_t size, int device = -1, bool managed = false,
                bool pinned = false)
      : mSize(size), device_(device), managed_(managed), pinned_(pinned) {
    if (size == 0) {
      mPtr = nullptr;
    } else {
      if (device == -1) {
        if (pinned_) {
#ifdef TV_CUDA
          checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
#else
          TV_THROW_INVALID_ARG("you need to define TV_CUDA to use pinned");
#endif
        } else {
          mPtr = new T[size];
        }
      } else {
#ifdef TV_CUDA
        // Device selection is intentionally left to the caller.
        if (managed) {
          checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
        } else {
          checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
        }
#else
        TV_THROW_INVALID_ARG("don't compiled with cuda");
#endif
      }
    }
  }
  // Borrows an existing buffer of `size` elements; the memory is not freed.
  TensorStorage(T *ptr, size_t size, int device)
      : mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}
  // Releases owned memory with the deallocator matching the allocator used in
  // the constructor. CUDA free errors are ignored because a destructor must
  // not throw.
  virtual ~TensorStorage() {
    if (empty()) {
      return;
    }
    if (from_blob_) {
      return;
    }
    if (device_ == -1) {
      if (pinned_) {
#ifdef TV_CUDA
        cudaFreeHost(mPtr);
#endif
      } else {
        delete[] mPtr;
      }
    } else {
#ifdef TV_CUDA
      cudaFree(mPtr);
#endif
    }
  }
  // Number of elements (not bytes).
  inline size_t size() const { return mSize; }
  T *data() { return mPtr; }
  const T *data() const { return mPtr; }
  bool empty() const { return mPtr == nullptr || mSize == 0; }
  bool managed() const { return managed_; }
  bool pinned() const { return pinned_; }
  int device() const { return device_; }
  // Sets every byte of the buffer to zero.
  // Raises through TV_THROW_INVALID_ARG for device storage in a build
  // without TV_CUDA.
  void zero_() {
    // mSize counts elements, while memset/cudaMemset take a byte count.
    const size_t num_bytes = mSize * sizeof(T);
    if (device_ == -1) {
      if (!empty()) {
        std::memset(data(), 0, num_bytes);
      }
    } else {
#ifdef TV_CUDA
      if (!empty()) {
        checkCudaErrors(cudaMemset(data(), 0, num_bytes));
      }
#else
      TV_THROW_INVALID_ARG("don't compiled with cuda");
#endif
    }
  }

private:
  size_t mSize = 0;
  T *mPtr = nullptr;
  // True when mPtr is borrowed and must not be freed.
  bool from_blob_ = false;
  int device_ = -1;
  bool managed_ = false;
  bool pinned_ = false;
};
// Size in bytes of one element of the given dtype tag.
// float16 is hard-coded to 2 so that no half type has to be visible here.
// Any tag that is not listed is reported through TV_THROW_RT_ERR.
template <typename T> size_t sizeof_dtype(T dtype) {
  size_t num_bytes = 0;
  switch (dtype) {
  case bool_:
    num_bytes = sizeof(bool);
    break;
  case int8:
    num_bytes = sizeof(int8_t);
    break;
  case int16:
    num_bytes = sizeof(int16_t);
    break;
  case int32:
    num_bytes = sizeof(int32_t);
    break;
  case int64:
    num_bytes = sizeof(int64_t);
    break;
  case uint8:
    num_bytes = sizeof(uint8_t);
    break;
  case uint16:
    num_bytes = sizeof(uint16_t);
    break;
  case uint32:
    num_bytes = sizeof(uint32_t);
    break;
  case uint64:
    num_bytes = sizeof(uint64_t);
    break;
  case float16:
    num_bytes = 2;
    break;
  case float32:
    num_bytes = sizeof(float);
    break;
  case float64:
    num_bytes = sizeof(double);
    break;
  default:
    TV_THROW_RT_ERR("unsupported dtype");
  }
  return num_bytes;
}
// Human-readable name of a dtype tag, used in error messages.
// float16 is spelled "half"; a tag that is not listed yields an empty string.
template <typename T> std::string typeString(T t) {
  const char *name = "";
  switch (t) {
  case DType::bool_:
    name = "bool";
    break;
  case DType::int8:
    name = "int8";
    break;
  case DType::int16:
    name = "int16";
    break;
  case DType::int32:
    name = "int32";
    break;
  case DType::int64:
    name = "int64";
    break;
  case DType::uint8:
    name = "uint8";
    break;
  case DType::uint16:
    name = "uint16";
    break;
  case DType::uint32:
    name = "uint32";
    break;
  case DType::uint64:
    name = "uint64";
    break;
  case DType::float16:
    name = "half";
    break;
  case DType::float32:
    name = "float32";
    break;
  case DType::float64:
    name = "float64";
    break;
  default:
    break;
  }
  return name;
}
// Compile-time map from a C++ element type to its DType tag, exposed as the
// static member `dtype`. The primary template is left undefined, so using an
// unmapped type is a compile error.
template <typename T> struct TypeToDtypeTraits;
// A const-qualified element type maps to the same tag as the plain type.
template <typename T>
struct TypeToDtypeTraits<const T> : TypeToDtypeTraits<T> {};
// Boolean.
template <> struct TypeToDtypeTraits<bool> {
  static constexpr DType dtype = bool_;
};
// Signed integers.
template <> struct TypeToDtypeTraits<int8_t> {
  static constexpr DType dtype = int8;
};
template <> struct TypeToDtypeTraits<int16_t> {
  static constexpr DType dtype = int16;
};
template <> struct TypeToDtypeTraits<int32_t> {
  static constexpr DType dtype = int32;
};
template <> struct TypeToDtypeTraits<int64_t> {
  static constexpr DType dtype = int64;
};
// Unsigned integers.
template <> struct TypeToDtypeTraits<uint8_t> {
  static constexpr DType dtype = uint8;
};
template <> struct TypeToDtypeTraits<uint16_t> {
  static constexpr DType dtype = uint16;
};
template <> struct TypeToDtypeTraits<uint32_t> {
  static constexpr DType dtype = uint32;
};
template <> struct TypeToDtypeTraits<uint64_t> {
  static constexpr DType dtype = uint64;
};
// Floating point. The half-precision mapping exists only in TV_CUDA builds,
// where __half is available.
#ifdef TV_CUDA
template <> struct TypeToDtypeTraits<__half> {
  static constexpr DType dtype = float16;
};
#endif
template <> struct TypeToDtypeTraits<float> {
  static constexpr DType dtype = float32;
};
template <> struct TypeToDtypeTraits<double> {
  static constexpr DType dtype = float64;
};
} // namespace detail
// Shorthand for the DType tag mapped to element type T, e.g. type_v<float>
// is float32. Only types with a detail::TypeToDtypeTraits specialization are
// accepted.
template <class T> constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
// Calls f with a value-initialized object of the first type in Ts... whose
// dtype tag equals t, so that f can recover the static type via decltype.
// Returns true when a type matched and f was called, false otherwise. It does
// not throw on a miss; exceptions raised by f still propagate.
template <class... Ts, typename F> bool dispatch_noexcept(DType t, F &&f) {
  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
  bool handled = false;
  mp_for_each<mp_list<Ts...>>([=, &handled, &f](auto candidate) {
    using candidate_t = decltype(candidate);
    // Only the first matching candidate is dispatched.
    if (handled || type_v<candidate_t> != t) {
      return;
    }
    std::forward<F>(f)(candidate_t());
    handled = true;
  });
  return handled;
}
// Throwing variant of dispatch_noexcept. When no type in Ts... corresponds to
// t, it raises through TV_THROW_RT_ERR with the offending dtype name and the
// list of accepted type names.
template <class... Ts, typename F> void dispatch(DType t, F &&f) {
  const bool handled = dispatch_noexcept<Ts...>(t, std::forward<F>(f));
  if (handled) {
    return;
  }
  std::stringstream accepted;
  mp_for_each<mp_list<Ts...>>([=, &accepted](auto candidate) {
    accepted << detail::TypeToString<decltype(candidate)>::value << " ";
  });
  TV_THROW_RT_ERR("unknown type", detail::typeString(t),
                  ", available:", accepted.str());
}
// Turns the runtime value idx into a compile-time constant: f is called with
// the integral-constant object for the first candidate in Is... that equals
// idx. When no candidate equals idx, it raises through TV_THROW_RT_ERR and
// lists the candidates.
template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
  static_assert(sizeof...(Is) > 0,
                "you need to provide at least one candidate");
  bool handled = false;
  mp_for_each<mp_list_c<T, Is...>>([=, &handled, &f](auto candidate) {
    if (handled || !(T(candidate) == idx)) {
      return;
    }
    std::forward<F>(f)(candidate);
    handled = true;
  });
  if (handled) {
    return;
  }
  std::stringstream accepted;
  mp_for_each<mp_list_c<T, Is...>>(
      [=, &accepted](auto candidate) { accepted << T(candidate) << " "; });
  TV_THROW_RT_ERR("unknown value", idx, ", available:", accepted.str());
}
// Calls f with the integral-constant object of the first candidate in Is...
// whose value equals idx. Returns whether such a candidate existed; it does
// not throw on a miss.
template <int... Is, typename F> bool dispatch_int_noexcept(int idx, F &&f) {
  static_assert(sizeof...(Is) > 0,
                "you need to provide at least one candidate");
  bool handled = false;
  mp_for_each<mp_list_c<int, Is...>>([=, &handled, &f](auto candidate) {
    if (handled || decltype(candidate)::value != idx) {
      return;
    }
    std::forward<F>(f)(candidate);
    handled = true;
  });
  return handled;
}
// Predicate-based variant: calls f with the integral-constant object of the
// first candidate in Is... for which p(idx, candidate) is true. Returns
// whether such a candidate existed; it does not throw on a miss.
// p is captured by value in a non-mutable lambda, so it is invoked as const.
template <int... Is, typename F, class BinaryPredicate>
bool dispatch_int_noexcept(int idx, BinaryPredicate p, F &&f) {
  static_assert(sizeof...(Is) > 0,
                "you need to provide at least one candidate");
  bool handled = false;
  mp_for_each<mp_list_c<int, Is...>>([=, &handled, &f](auto candidate) {
    // The predicate is evaluated for every candidate, including after a hit.
    const bool accepted = p(idx, decltype(candidate)::value);
    if (!accepted || handled) {
      return;
    }
    std::forward<F>(f)(candidate);
    handled = true;
  });
  return handled;
}
// Throwing variant of dispatch_int_noexcept(idx, f). When idx equals none of
// Is..., it raises through TV_THROW_RT_ERR and lists the candidates.
template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
  const bool handled = dispatch_int_noexcept<Is...>(idx, std::forward<F>(f));
  if (handled) {
    return;
  }
  std::stringstream accepted;
  mp_for_each<mp_list_c<int, Is...>>([=, &accepted](auto candidate) {
    accepted << decltype(candidate)::value << " ";
  });
  TV_THROW_RT_ERR("unknown value", idx, ", available:", accepted.str());
}
// Throwing variant of the predicate-based dispatch_int_noexcept.
// The predicate is called as p(idx, candidate). When it accepts none of
// Is..., this raises through TV_THROW_RT_ERR and lists the candidates.
template <int... Is, typename F, class BinaryPredicate>
void dispatch_int(int idx, BinaryPredicate p, F &&f) {
  const bool handled =
      dispatch_int_noexcept<Is...>(idx, p, std::forward<F>(f));
  if (handled) {
    return;
  }
  std::stringstream accepted;
  mp_for_each<mp_list_c<int, Is...>>([=, &accepted](auto candidate) {
    accepted << decltype(candidate)::value << " ";
  });
  TV_THROW_RT_ERR("unknown value", idx, ", available:", accepted.str());
}
// Matches the runtime sequence [begin, end) against compile-time value lists.
// Each element of Ts... is an mp_list_c. f is called with the object of the
// first list that has the same length as the sequence and the same values in
// the same order. Returns whether a list matched; it does not throw on a miss.
template <class... Ts, typename Iterator, typename F>
bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
  static_assert(sizeof...(Ts) > 0,
                "you need to provide at least one candidate");
  bool handled = false;
  mp_for_each<mp_list<Ts...>>([=, &handled, &f](auto candidate) {
    using candidate_t = decltype(candidate);
    auto expected_len = mp_size<candidate_t>::value;
    bool same = true;
    std::size_t consumed = 0;
    auto cursor = begin;
    mp_for_each<candidate_t>([&](auto element) {
      // Stop comparing after a mismatch or once the runtime range is used up.
      if (!same || cursor == end) {
        return;
      }
      if (consumed >= expected_len) {
        same = false;
        return;
      }
      constexpr auto expected = decltype(element)::value;
      if (expected != *cursor) {
        same = false;
      }
      ++consumed;
      std::advance(cursor, 1);
    });
    // Both sequences must have been consumed completely.
    if (consumed != expected_len || cursor != end) {
      same = false;
    }
    if (same && !handled) {
      std::forward<F>(f)(candidate);
      handled = true;
    }
  });
  return handled;
}
// Throwing variant of dispatch_container_noexcept. When no list in Ts...
// equals [begin, end), it raises through TV_THROW_RT_ERR with a message that
// prints the runtime values followed by every accepted list.
template <class... Ts, typename Iterator, typename F>
void dispatch_container(Iterator begin, Iterator end, F &&f) {
  const bool handled =
      dispatch_container_noexcept<Ts...>(begin, end, std::forward<F>(f));
  if (handled) {
    return;
  }
  std::stringstream msg;
  msg << "unknown value [";
  auto cursor = begin;
  while (cursor != end) {
    msg << *cursor << ",";
    std::advance(cursor, 1);
  }
  msg << "], available: ";
  mp_for_each<mp_list<Ts...>>([=, &msg](auto candidate) {
    msg << "[";
    mp_for_each<decltype(candidate)>(
        [=, &msg](auto element) { msg << decltype(element)::value << ","; });
    msg << "]";
  });
  TV_THROW_RT_ERR(msg.str());
}
/*
template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
return dispatch_scalar<int, Is...>(idx, f);
}
*/
// Functor form of dispatch(): the candidate types are taken from the template
// arguments of a type list such as std::tuple<float, double>.
// Usage: Dispatch<std::tuple<float, double>>()(dtype, f).
template <class TypeList> struct Dispatch;
template <template <class...> class List, class... Types>
struct Dispatch<List<Types...>> {
  // Forwards to dispatch<Types...>, which throws when dtype is not covered.
  template <typename F> inline void operator()(DType dtype, F &&f) {
    dispatch<Types...>(dtype, std::forward<F>(f));
  }
};
// Functor form of dispatch_container(): the candidate value lists are the
// template arguments of the wrapping type list.
template <class TypeList> struct DispatchContainer;
template <template <class...> class List, class... ValueLists>
struct DispatchContainer<List<ValueLists...>> {
  // Forwards to dispatch_container<ValueLists...>, which throws on a miss.
  template <typename Iterator, typename F>
  inline void operator()(Iterator begin, Iterator end, F &&f) {
    dispatch_container<ValueLists...>(begin, end, std::forward<F>(f));
  }
};
// Functor form of dispatch_container_noexcept(): the candidate value lists
// are the template arguments of the wrapping type list.
template <class TypeList> struct DispatchContainerNoexcept;
template <template <class...> class List, class... ValueLists>
struct DispatchContainerNoexcept<List<ValueLists...>> {
  // Returns true when one of the lists matched [begin, end).
  template <typename Iterator, typename F>
  inline bool operator()(Iterator begin, Iterator end, F &&f) {
    const bool handled = dispatch_container_noexcept<ValueLists...>(
        begin, end, std::forward<F>(f));
    return handled;
  }
};
// Functor form of dispatch_int(). The template argument must be a type list
// whose elements each expose a static integer `value`, i.e.
// type_container<std::integral_constant<int, value>...>; tv::mp_list_c works.
template <class TypeList> struct DispatchInt;
template <template <class...> class List, class... Constants>
struct DispatchInt<List<Constants...>> {
  // Exact-match dispatch; throws when idx equals none of the constants.
  template <typename F> inline void operator()(int idx, F &&f) {
    dispatch_int<Constants::value...>(idx, std::forward<F>(f));
  }
  // Predicate dispatch, p(idx, constant); throws when p accepts none of them.
  template <typename F, typename BinaryPredicate>
  inline void operator()(int idx, BinaryPredicate p, F &&f) {
    dispatch_int<Constants::value...>(idx, p, std::forward<F>(f));
  }
};
// Functor form of dispatch_int_noexcept(). The template argument is a type
// list whose elements each expose a static integer `value`.
template <class TypeList> struct DispatchIntNoexcept;
template <template <class...> class List, class... Constants>
struct DispatchIntNoexcept<List<Constants...>> {
  // Exact-match dispatch; returns whether idx equalled one of the constants.
  template <typename F> inline bool operator()(int idx, F &&f) {
    const bool handled =
        dispatch_int_noexcept<Constants::value...>(idx, std::forward<F>(f));
    return handled;
  }
  // Predicate dispatch, p(idx, constant); returns whether p accepted one.
  template <typename F, typename BinaryPredicate>
  inline bool operator()(int idx, BinaryPredicate p, F &&f) {
    const bool handled =
        dispatch_int_noexcept<Constants::value...>(idx, p, std::forward<F>(f));
    return handled;
  }
};
// Maximum number of dimensions of a tv::Tensor shape or stride.
constexpr size_t kTensorMaxDim = 10;
// Shape/stride container used by tv::Tensor: up to kTensorMaxDim int64_t
// extents.
using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
struct Tensor {
Tensor() {}
Tensor(TensorShape shape, TensorShape stride, DType dtype, int device = -1,
bool pinned = false, bool managed = false)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
shape_ = shape;
stride_ = stride;
}
Tensor(TensorShape shape, DType dtype, int device = -1, bool pinned = false,
bool managed = false)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
shape_ = shape;
stride_ = shape.stride_rowmajor();
}
Tensor(void *ptr, TensorShape shape, TensorShape stride, DType dtype,
int device = -1)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(ptr),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = stride;
}
Tensor(void *ptr, TensorShape shape, DType dtype, int device = -1)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(ptr),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = shape.stride_rowmajor();
}
Tensor(const void *ptr, TensorShape shape, TensorShape stride, DType dtype,
int device = -1)
: dtype_(dtype), writeable_(false) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(const_cast<void *>(ptr)),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = stride;
}
Tensor(const void *ptr, TensorShape shape, DType dtype, int device = -1)
: dtype_(dtype), writeable_(false) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(const_cast<void *>(ptr)),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = shape.stride_rowmajor();
}
Tensor(std::initializer_list<int32_t> init)
: Tensor({int(init.size())}, tv::int32) {
std::copy(init.begin(), init.end(), data<int32_t>());
}
Tensor(std::initializer_list<int64_t> init)
: Tensor({int(init.size())}, tv::int64) {
std::copy(init.begin(), init.end(), data<int64_t>());
}
Tensor(std::initializer_list<float> init)
: Tensor({int(init.size())}, tv::float32) {
std::copy(init.begin(), init.end(), data<float>());
}
Tensor(std::initializer_list<double> init)
: Tensor({int(init.size())}, tv::float64) {
std::copy(init.begin(), init.end(), data<double>());
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<(Rank > 0), int>::type = 0>
TensorView<T, Rank, PtrTraits, Tindex> tview() {
using tv_shape_t =
typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
writable_check();
static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
tv_shape_t shape(Rank), stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<T, Rank, PtrTraits, Tindex>(
reinterpret_cast<T *>(data<T>()), shape, stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return if_constexpr<(Rank > 0)>(
[&](auto _) {
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank),
stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
shape, stride);
},
[&](auto _) {
ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim());
for (int i = 0; i < int(ndim()); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
shape, stride);
});
}
template <class... Inds> Tensor view(Inds... newShapes) const {
static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
TensorShape shape{int(newShapes)...};
bool found_minus_1 = false;
for (size_t i = 0; i < shape.ndim(); ++i) {
if (!found_minus_1) {
if (shape[i] == -1) {
shape[i] = 1;
shape[i] = size() / shape.size();
found_minus_1 = true;
} else {
TV_ASSERT_INVALID_ARG(shape[i] > 0,
"shape except -1 must larger than 0");
}
} else {
TV_ASSERT_INVALID_ARG(shape[i] > 0, "multiple -1 in your argument.");
}
}
TV_ASSERT_RT_ERR(shape.size() == size(), "error");
Tensor res(*this);
res.shape_ = shape;
res.stride_ = shape.stride_rowmajor();
return res;
}
Tensor view(TensorShape shape) const {
TV_ASSERT_RT_ERR(shape.size() == size(), "error");
Tensor res(*this);
res.shape_ = shape;
res.stride_ = shape.stride_rowmajor();
return res;
}
Tensor operator[](int64_t index) {
TV_ASSERT_INVALID_ARG(ndim() > 1, "error");
if (index < 0) {
index += dim(0);
}
TV_ASSERT_INVALID_ARG(index < dim(0), "error");
Tensor res = Tensor();
res.storage_ = storage_;
res.shape_ = shape_.subshape(1);
res.offset_ = offset_ + index * stride_[0];
res.stride_ = stride_.subshape(1);
res.writeable_ = writeable_;
return res;
}
Tensor squeeze() const { return view(shape_.squeeze()); }
Tensor squeeze(int axis) const {
if (axis < 0) {
axis = ndim() + axis;
}
return view(shape_.squeeze(axis));
}
Tensor unsqueeze(int axis) const {
if (axis < 0) {
axis = ndim() + axis;
}
return view(shape_.unsqueeze(axis));
}
bool pinned() const { return storage_->pinned(); }
Tensor slice_first_axis(int start, int end) const {
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
if (start < 0) {
start = shape_[0] + start;
}
if (end < 0) {
end = shape_[0] + end;
}
TV_ASSERT_INVALID_ARG(start < shape_[0], "start must small than dim 0");
TV_ASSERT_INVALID_ARG(start < end, "start must small than end");
size_t new_offset = start * shape_.prod(1) * itemsize();
Tensor res(*this);
TensorShape newshape(shape_);
newshape[0] = end - start;
res.shape_ = newshape;
res.stride_ = stride_;
res.offset_ = new_offset;
return res;
}
bool empty() const { return storage_->empty(); }
DType dtype() const { return dtype_; }
int device() const { return storage_->device(); }
size_t ndim() const { return shape_.ndim(); }
const TensorShape &shape() const { return shape_; }
const TensorShape &sizes() const { return shape_; }
const TensorShape &stride() const { return stride_; }
int dim(int idx) const {
if (idx < 0) {
TV_ASSERT_RT_ERR(shape_.ndim() + idx < shape_.ndim(), idx, shape_);
return shape_[shape_.ndim() + idx];
} else {
TV_ASSERT_RT_ERR(idx < int(shape_.ndim()), idx, shape_);
return shape_[idx];
}
}
const uint8_t *raw_data() const { return storage_->data() + offset_; }
size_t raw_size() const { return size() * itemsize(); }
size_t size() const { return shape_.size(); }
size_t size(int64_t idx) const { return dim(idx); }
size_t itemsize() const { return detail::sizeof_dtype(dtype_); }
Tensor &zero_() {
writable_check();
storage_->zero_();
return *this;
}
uint8_t *raw_data() {
writable_check();
return storage_->data() + offset_;
}
template <typename T> Tensor &fill_(T value) {
writable_check();
TV_ASSERT_RT_ERR(device() == -1, "error");
Dispatch<detail::all_tensor_types_t>()(dtype_, [&](auto I) {
using Treal = decltype(I);
if (std::is_convertible<T, Treal>::value) {
auto ptr = reinterpret_cast<Treal *>(raw_data());
std::fill(ptr, ptr + size(), Treal(value));
} else {
TV_THROW_INVALID_ARG("not convertable from", type_s<T>, "to",
type_s<Treal>);
}
});
return *this;
}
template <typename T> T *data() {
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
writable_check();
return reinterpret_cast<T *>(raw_data());
}
template <typename T> const T *data() const {
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return reinterpret_cast<const T *>(raw_data());
}
template <typename T> T *data_ptr() { return data<T>(); }
template <typename T> const T *data_ptr() const { return data<T>(); }
void *data_ptr() { return reinterpret_cast<void *>(raw_data()); }
const void *data_ptr() const {
return reinterpret_cast<const void *>(raw_data());
}
void copy_(const Tensor &tensor) {
writable_check();
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
detail::typeString(dtype()),
detail::typeString(tensor.dtype()));
if (device() == -1 && tensor.device() == -1) {
#ifdef TV_CUDA
host2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
#else
std::copy(tensor.raw_data(),
tensor.raw_data() + size() * detail::sizeof_dtype(dtype_),
storage_->data());
#endif
}
#ifdef TV_CUDA
else if (device() >= 0 && tensor.device() == -1) {
host2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
} else if (device() == -1 && tensor.device() >= 0) {
dev2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
} else if (device() >= 0 && tensor.device() >= 0) {
dev2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
#endif
else {
TV_THROW_RT_ERR("only support cpu tensor");
}
}
#ifdef TV_CUDA
void copy_(const Tensor &tensor, cudaStream_t stream) {
writable_check();
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
detail::typeString(dtype()),
detail::typeString(tensor.dtype()));
if (device() == -1 && tensor.device() == -1) {
host2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else if (device() >= 0 && tensor.device() == -1) {
host2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else if (device() == -1 && tensor.device() >= 0) {
dev2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else if (device() >= 0 && tensor.device() >= 0) {
dev2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else {
TV_THROW_RT_ERR("only support cpu tensor");
}
}
#endif
Tensor cpu() const {
if (storage_->device() == -1) {
// cpu() should always copy tensor.
return clone();
}
Tensor res(shape_, stride_, dtype_, -1, storage_->managed());
res.copy_(*this);
return res;
}
template <typename T> void copy_(const TensorView<T> &tensor, int device) {
writable_check();
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
Tensor src = from_blob(tensor, device);
return copy_(src);
}
Tensor &operator=(const Tensor &tensor) {
dtype_ = tensor.dtype_;
storage_ = tensor.storage_;
shape_ = tensor.shape_;
writeable_ = tensor.writeable_;
offset_ = tensor.offset_;
stride_ = tensor.stride_;
return *this;
}
Tensor(const Tensor &tensor) {
dtype_ = tensor.dtype_;
storage_ = tensor.storage_;
shape_ = tensor.shape_;
writeable_ = tensor.writeable_;
offset_ = tensor.offset_;
stride_ = tensor.stride_;
}
Tensor clone(bool pinned = false) const {
TV_ASSERT_RT_ERR(!empty(), "clone a empty tensor");
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
Tensor newtensor(shape_, stride_, dtype_, device(), pinned,
storage_->managed());
newtensor.copy_(*this);
return newtensor;
}
Tensor astype(DType dtype) {
if (dtype == dtype_) {
return clone();
}
TV_ASSERT_INVALID_ARG(device() == -1, "only support cpu tensor");
TV_ASSERT_INVALID_ARG(!empty(), "can't be used in empty tensor");
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
auto tensor = Tensor();
Dispatch<detail::all_tensor_types_t>()(dtype, [&](auto Idst) {
using Tdst = decltype(Idst);
Dispatch<detail::all_tensor_types_t>()(this->dtype_, [&](auto Icur) {
using Tcur = decltype(Icur);
if (std::is_convertible<Tcur, Tdst>::value) {
auto ptr = this->data<Tcur>();
tensor = Tensor(this->shape_, this->stride_, dtype, this->device(),
this->pinned(), this->storage_->managed());
std::copy(ptr, ptr + this->size(), tensor.data<Tdst>());
} else {
TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to",
type_s<Tdst>);
}
});
});
return tensor;
}
template <class... Ts, typename F> inline void dispatch(F &&f) {
return tv::dispatch<Ts...>(dtype_, std::forward<F>(f));
}
protected:
inline void writable_check() {
TV_ASSERT_RT_ERR(writeable_,
"you cant do non-const operation when not writable");
}
DType dtype_;
std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
TensorShape shape_;
size_t offset_ = 0;
TensorShape stride_;
private:
bool writeable_ = true;
bool contiguous_ = true;
};
// Streams a printable representation of a CPU tensor.
// The element type is resolved at run time; float and double tensors are
// formatted with a precision of 4.
// Raises through TV_ASSERT_INVALID_ARG for a non-CPU tensor.
template <typename Os> Os &operator<<(Os &os, const Tensor &tensor) {
  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
  Dispatch<detail::all_tensor_types_t>()(tensor.dtype(), [&](auto type_tag) {
    using elem_t = decltype(type_tag);
    const bool is_real = std::is_same<elem_t, float>::value ||
                         std::is_same<elem_t, double>::value;
    std::stringstream fmt;
    if (is_real) {
      fmt << std::setprecision(4);
    }
    os << tensor.tview<elem_t, -1, DefaultPtrTraits, int64_t>().repr(fmt);
  });
  return os;
}
// Wraps existing writable memory as a Tensor with row-major strides.
// The memory stays owned by the caller and is not freed by the tensor.
inline Tensor from_blob(void *ptr, TensorShape shape, DType dtype, int device) {
  Tensor wrapped(ptr, shape, dtype, device);
  return wrapped;
}
// Wraps existing read-only memory as a non-writable Tensor with row-major
// strides. The memory stays owned by the caller and is not freed by the
// tensor.
inline Tensor from_blob(const void *ptr, TensorShape shape, DType dtype,
                        int device) {
  Tensor wrapped(ptr, shape, dtype, device);
  return wrapped;
}
} // namespace tv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "common.h"
#include "prettyprint.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>
#ifdef TV_CUDA
#include <cuda_runtime_api.h>
#endif
namespace tv {
// Function-qualifier and assert macros, selected per compiler:
//  * first branch  - clang in CUDA mode or nvcc: host+device qualifiers.
//  * second branch - __CUDACC_RTC__ is defined: TV_HOST_DEVICE_INLINE is
//                    device-only here.
//  * fallback      - the qualifiers collapse to plain `inline` / nothing.
// TV_ASSERT maps to assert() in every branch.
// NOTE(review): TV_DEVICE_INLINE is not defined in the fallback branch.
#if (defined(__clang__) && defined(__CUDA__)) || defined(__NVCC__)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
// TV_REQUIRE(expr, fmt, ...): when expr is false, prints the printf-style
// message and then trips assert(expr). With NDEBUG defined the assert is a
// no-op, so only the message is printed.
// The macro expands to a bare `{ ... }` block rather than do/while(0), so
// `if (c) TV_REQUIRE(...); else ...` does not compile; wrap it in braces there.
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
// TV_CHECK_CUDA_ERR(): reads the last CUDA runtime error with
// cudaGetLastError(). On failure it throws std::runtime_error carrying the
// file, the line and the numeric error code. TV_BACKTRACE_PRINT is handed the
// message stream before the throw.
// Only the numeric code is reported; TV_CHECK_CUDA_ERR_V2 also adds the
// cudaGetErrorString() text.
#define TV_CHECK_CUDA_ERR() \
{ \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
} \
}
// TV_CHECK_CUDA_ERR_V2(args...): like TV_CHECK_CUDA_ERR, but the exception
// message also contains the cudaGetErrorString() text and the extra arguments,
// which are appended through tv::sstream_print.
#define TV_CHECK_CUDA_ERR_V2(...) \
{ \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \
__macro_s << " " << cudaGetErrorString(__macro_err) << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#ifdef TV_CUDA
// Tag type for CUDA code paths; it carries the stream work should be issued
// on. The default stream value is 0.
struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  // getStream() is virtual, so the type can be used as a polymorphic base.
  // The virtual destructor makes deleting a derived object through a GPU
  // pointer well defined.
  virtual ~GPU() = default;
  // Stream associated with this context; derived types may override it.
  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
};
#endif
// Empty tag type for host code paths (counterpart of GPU).
struct CPU {};
// Default capacity of the fixed-size shape containers in this header
// (SimpleVector's MaxDim). Define TV_MAX_DIM before including this header to
// override it.
#ifndef TV_MAX_DIM
#define TV_MAX_DIM 6
#endif
// Pointer policy that yields a plain `T *`.
template <typename T> struct DefaultPtrTraits {
  using type = T *;
};
#if defined(__CUDACC__) || defined(__HIPCC__)
// Pointer policy that yields a `__restrict__`-qualified `T *`.
template <typename T> struct RestrictPtrTraits {
  using type = T *__restrict__;
};
#endif
/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
{
if (ndim * sizeof(T) == 1)
return 1;
else if (ndim * sizeof(T) == 2)
return 2;
else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
return 4;
else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
return 8;
else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
return 16;
else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
return 32;
else
return 64;
}
*/
namespace detail {
// SFINAE helper: names `void` when Iter's iterator_category converts to
// std::input_iterator_tag, and is ill-formed otherwise. It is used as a
// defaulted template argument to restrict iterator-pair constructors.
template <typename Iter>
using _RequireInputIter = typename std::enable_if<
    std::is_convertible<typename std::iterator_traits<Iter>::iterator_category,
                        std::input_iterator_tag>::value>::type;
} // namespace detail
// Fixed-capacity vector: stores at most MaxDim elements of type T in an
// in-object array together with the current element count.
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public:
  // Creates an empty vector.
  TV_HOST_DEVICE_INLINE SimpleVector() {}
  // Creates `count` elements, each assigned from `init`.
  TV_HOST_DEVICE_INLINE SimpleVector(size_t count, T init = T())
      : size_(count) {
    // Same capacity guard as the other constructors; without it a count
    // larger than MaxDim writes past the end of array_.
    TV_ASSERT(count <= MaxDim);
    for (size_t i = 0; i < count; ++i) {
      array_[i] = init;
    }
  }
  // Copies the range [first, last); reports an invalid argument through
  // TV_THROW_INVALID_ARG when the range holds more than MaxDim elements.
  template <typename Iterator, typename = detail::_RequireInputIter<Iterator>>
  SimpleVector(Iterator first, Iterator last) {
    size_ = 0;
    for (; first != last; ++first) {
      if (size_ >= MaxDim) {
        TV_THROW_INVALID_ARG("iterator too long");
      }
      array_[size_++] = *first;
    }
  }
  // Copies the elements of a braced initializer list.
  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
    TV_ASSERT(q.size() <= MaxDim);
    size_ = 0;
    for (T s : q) {
      array_[size_++] = s;
    }
  }
  // Copies the elements of a std::vector.
  SimpleVector(const std::vector<T> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      array_[i] = arr[i];
    }
    size_ = arr.size();
  }
  // Copy constructor: copies only the first arr.size() elements.
  TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      array_[i] = arr[i];
    }
    size_ = arr.size();
  }
  // Element access; the index is range-checked only when TV_DEBUG is defined.
  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < size_);
#endif
    return array_[idx];
  }
  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < size_);
#endif
    return array_[idx];
  }
  // Appends `s`; capacity is checked only when TV_DEBUG is defined.
  TV_HOST_DEVICE_INLINE void push_back(T s) {
#ifdef TV_DEBUG
    TV_ASSERT(size_ < MaxDim);
#endif
    array_[size_] = s;
    size_++;
  }
  // Drops the last element; emptiness is checked only when TV_DEBUG is
  // defined.
  TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
    TV_ASSERT(size_ > 0);
#endif
    size_--;
  }
  // Number of stored elements.
  TV_HOST_DEVICE_INLINE size_t size() const { return size_; }
  TV_HOST_DEVICE_INLINE const T *data() const { return array_; }
  TV_HOST_DEVICE_INLINE T *data() { return array_; }
  // Nonzero when no element is stored (return type kept as size_t for
  // existing callers).
  TV_HOST_DEVICE_INLINE size_t empty() const { return size_ == 0; }
  typedef size_t size_type;
  // Forward iterator over mutable elements; wraps a raw pointer.
  class iterator {
  public:
    typedef iterator self_type;
    typedef T value_type;
    typedef T &reference;
    typedef T *pointer;
    typedef std::forward_iterator_tag iterator_category;
    typedef std::ptrdiff_t difference_type;
    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the iterator value from before the step.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    // Pre-increment: returns a reference to this iterator, as the standard
    // iterator requirements expect, instead of a copy.
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

  private:
    pointer ptr_;
  };
  // Forward iterator over read-only elements; wraps a raw pointer.
  class const_iterator {
  public:
    typedef const_iterator self_type;
    typedef T value_type;
    typedef const T &reference;
    typedef const T *pointer;
    typedef std::ptrdiff_t difference_type;
    typedef std::forward_iterator_tag iterator_category;
    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the iterator value from before the step.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    // Pre-increment: returns a reference to this iterator.
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    // Dereference is const-qualified so it also works on a const iterator.
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

  private:
    pointer ptr_;
  };
  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(array_); }
  TV_HOST_DEVICE_INLINE iterator end() { return iterator(array_ + size_); }
  TV_HOST_DEVICE_INLINE const_iterator begin() const {
    return const_iterator(array_);
  }
  TV_HOST_DEVICE_INLINE const_iterator end() const {
    return const_iterator(array_ + size_);
  }
  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
    return const_iterator(array_);
  }
  TV_HOST_DEVICE_INLINE const_iterator cend() const {
    return const_iterator(array_ + size_);
  }

protected:
  T array_[MaxDim]; // inline storage; only the first size_ slots are in use
  size_t size_ = 0; // number of valid elements in array_
};
// Equality for SimpleVector: true when both hold the same number of elements
// and no pair of corresponding elements compares unequal.
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  const size_t count = lfs.size();
  if (count != rfs.size()) {
    return false;
  }
  size_t pos = 0;
  // Advance while the elements match; stop at the first mismatch.
  while (pos < count && !(lfs[pos] != rfs[pos])) {
    ++pos;
  }
  return pos == count;
}
// Inequality for SimpleVector, defined as the negation of operator==.
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  const bool same = (lfs == rfs);
  return !same;
}
// Holds three integers; every slot that is not supplied at construction is
// set to -1.
struct Slice {
  // Builds a slice from up to three integer-like arguments.
  template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    SimpleVector<int, 3> given{int(ints)...};
    for (int k = 0; k < 3; ++k) {
      slices_[k] = -1;
    }
    // Overwrite the leading slots with the supplied values.
    for (size_t k = 0; k < given.size(); ++k) {
      slices_[k] = given[k];
    }
  }
  // Builds a slice with all three slots set to -1.
  TV_HOST_DEVICE_INLINE Slice() {
    slices_[0] = slices_[1] = slices_[2] = -1;
  }
  // Builds a slice from an initializer list of at most three values.
  template <typename T>
  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
    slices_[0] = slices_[1] = slices_[2] = -1;
    TV_ASSERT(slice.size() <= 3);
    int *dst = slices_;
    for (T s : slice) {
      *dst = int(s);
      ++dst;
    }
  }
  // Slot access; the index is range-checked only when TV_DEBUG is defined.
  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return slices_[idx];
  }
  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return slices_[idx];
  }

protected:
  int slices_[3];
};
// Tensor shape: a SimpleVector of per-dimension extents with helpers for
// element counts, sub-shapes, squeezing and row-major strides.
// Note that size() here is the PRODUCT of the extents; use ndim() for the
// number of dimensions.
template <size_t MaxDim = TV_MAX_DIM, typename Tindex = int>
struct ShapeBase : public SimpleVector<Tindex, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<Tindex, MaxDim>() {}
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<Tindex> shape)
      : SimpleVector<Tindex, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<Tindex, MaxDim> vec)
      : SimpleVector<Tindex, MaxDim>(vec) {}
  template <typename T, template <class...> class Container>
  ShapeBase(Container<T> shape) : SimpleVector<Tindex, MaxDim>(shape) {}
  // Copy constructor. The parameter names Tindex explicitly: the shorthand
  // ShapeBase<MaxDim> means ShapeBase<MaxDim, int>, which is the wrong type
  // whenever Tindex is not int.
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim, Tindex> &shape)
      : SimpleVector<Tindex, MaxDim>(shape) {}
  ShapeBase(const std::vector<Tindex> &arr)
      : SimpleVector<Tindex, MaxDim>(arr) {}
  ShapeBase<MaxDim, Tindex> &
  operator=(const ShapeBase<MaxDim, Tindex> &shape) = default;
  // Returns the extents in [start, end).
  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start,
                                                    Tindex end) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && end <= this->size_ && end > start);
#endif
    ShapeBase<MaxDim, Tindex> shape;
    for (Tindex i = start; i < end; ++i) {
      shape.push_back(this->array_[i]);
    }
    return shape;
  }
  // Returns the extents from `start` to the last dimension.
  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && start <= this->size_);
#endif
    ShapeBase<MaxDim, Tindex> shape;
    for (size_t i = start; i < this->size_; ++i) {
      shape.push_back(this->array_[i]);
    }
    return shape;
  }
  // Total element count: product of all extents, or 0 for a shape with no
  // dimensions. Hides SimpleVector::size().
  TV_HOST_DEVICE size_t size() const {
    if (this->size_ == 0)
      return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->size_); ++i) {
      s *= this->array_[i];
    }
    return s;
  }
  // Number of dimensions.
  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->size_; }
  // Removes every extent equal to 1; yields {1} if nothing would remain.
  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> squeeze() const {
    ShapeBase<MaxDim, Tindex> shape;
    for (size_t i = 0; i < this->size_; ++i) {
      if (this->array_[i] != 1)
        shape.push_back(this->array_[i]);
    }
    if (shape.empty()) {
      // dont support empty shape for now
      shape.push_back(1);
    }
    return shape;
  }
  // Removes dimension `dim` when its extent is 1; otherwise returns a copy.
  template <size_t MaxDim2 = MaxDim>
  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> squeeze(int dim) const {
    static_assert(MaxDim2 >= MaxDim - 1, "error");
    ShapeBase<MaxDim2, Tindex> shape;
    for (size_t i = 0; i < this->size_; ++i) {
      if (i != size_t(dim) || this->array_[i] != 1)
        shape.push_back(this->array_[i]);
    }
    return shape;
  }
  // Inserts an extent of 1 in front of dimension `dim`; dim == ndim()
  // appends it after the last dimension. Any other out-of-range `dim`
  // returns an unchanged copy.
  template <size_t MaxDim2 = MaxDim>
  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> unsqueeze(int dim) const {
    static_assert(MaxDim2 >= MaxDim - 1, "error");
    ShapeBase<MaxDim2, Tindex> shape;
    for (size_t i = 0; i < this->size_; ++i) {
      if (i == size_t(dim))
        shape.push_back(1);
      shape.push_back(this->array_[i]);
    }
    // The loop never reaches index size_, so a trailing insertion has to be
    // handled here.
    if (size_t(dim) == this->size_)
      shape.push_back(1);
    return shape;
  }
  // Product of the extents from `start` to the last dimension (1 if that
  // range is empty).
  TV_HOST_DEVICE size_t prod(Tindex start = 0) const {
    size_t res = 1;
    for (size_t i = start; i < this->size_; ++i) {
      res *= this->array_[i];
    }
    return res;
  }
  // Row-major strides in elements: the last dimension has stride 1 and each
  // earlier stride is the product of all later extents.
  template <size_t MaxDim2 = MaxDim>
  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> stride_rowmajor() const {
    static_assert(MaxDim2 >= MaxDim, "error");
    Tindex p = Tindex(1);
    ShapeBase<MaxDim2, Tindex> res(this->size_);
    for (Tindex i = this->size_ - 1; i >= 0; --i) {
      res[i] = p;
      p *= this->array_[i];
    }
    return res;
  }
};
// Default shape type: at most TV_MAX_DIM dimensions, int extents.
using Shape = ShapeBase<TV_MAX_DIM, int>;
// Row-major flat offset of a variadic index list into `shape`: the last index
// has weight 1 and each earlier index is weighted by the product of the
// extents that follow it.
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           Inds... indexes) {
  int idx_list[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
  TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
  unsigned flat = 0;
  unsigned weight = 1;
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int d = sizeof...(indexes) - 1; d >= 0; --d) {
    flat = flat + weight * idx_list[d];
    weight = weight * shape[d];
  }
  return flat;
}
// Row-major flat offset of `indexes_vec` into `shape`; one index is read per
// entry of `shape`. Evaluated front-to-back (Horner form), which gives the
// same unsigned result as accumulating weights from the last dimension.
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           std::vector<int> &indexes_vec) {
  const int ndim = shape.size();
  unsigned flat = 0;
  for (int d = 0; d < ndim; ++d) {
    flat = flat * shape[d] + indexes_vec[d];
  }
  return flat;
}
// Row-major flat offset of a variadic index list into a Shape: the last index
// has weight 1 and each earlier index is weighted by the product of the
// extents that follow it.
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           Inds... indexes) {
  int idx_list[sizeof...(indexes)] = {indexes...};
  unsigned flat = 0;
  unsigned weight = 1;
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int d = sizeof...(indexes) - 1; d >= 0; --d) {
    flat = flat + weight * idx_list[d];
    weight = weight * shape[d];
  }
  return flat;
}
// Row-major flat offset of the multi-index `indexes_vec` into `shape`; the
// number of dimensions is taken from `indexes_vec`. Evaluated front-to-back
// (Horner form), which gives the same unsigned result as accumulating weights
// from the last dimension.
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           const Shape &indexes_vec) {
  const int ndim = indexes_vec.ndim();
  unsigned flat = 0;
  for (int d = 0; d < ndim; ++d) {
    flat = flat * shape[d] + indexes_vec[d];
  }
  return flat;
}
// Row-major flat offset for raw arrays: `indexes` and `shape` are each read
// at positions 0 .. NDim-1, with the last index weighted by 1.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
                                           const Index *shape) {
  unsigned flat = 0;
  unsigned weight = 1;
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int d = NDim - 1; d >= 0; --d) {
    flat += weight * indexes[d];
    weight *= shape[d];
  }
  return flat;
}
// Inverse of the row-major flattening for a compile-time rank: decomposes
// `index` into NDim per-dimension coordinates written to `output`, last
// dimension first. Returns what is left of `index` after dividing by every
// extent.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
                                           const Index *shape) {
// Guarded like the sibling rowArrayIdx loops so host compilers that do not
// know `#pragma unroll` are not handed an unknown pragma.
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int i = NDim - 1; i >= 0; --i) {
    output[i] = index % shape[i];
    index -= output[i];
    index /= shape[i];
  }
  return index;
}
// Inverse of the row-major flattening for a runtime rank: decomposes `index`
// into `ndim` per-dimension coordinates written to `output`, last dimension
// first. Returns what is left of `index` after dividing by every extent.
template <typename Index>
TV_HOST_DEVICE Index rowArrayIdxInv(Index index, Index *output,
                                    const Index *shape, int ndim) {
  int axis = ndim - 1;
  while (axis >= 0) {
    output[axis] = index % shape[axis];
    index = (index - output[axis]) / shape[axis];
    --axis;
  }
  return index;
}
// Recursive offset computation that takes its indices in reverse order: the
// first index is added unscaled and the offset of the remaining indices is
// multiplied by shape[N - 1].
template <int N> struct ArrayIndexRowMajorReverse {
  // Pointer-shape variant.
  template <typename TShape, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T index,
                                            Ts... inds) {
    return index +
           shape[N - 1] * ArrayIndexRowMajorReverse<N - 1>::run(shape, inds...);
  }
  // Shape-object variant. It must recurse through runShape: run() expects a
  // pointer and cannot accept a `const Shape &`.
  template <typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T index,
                                                 Ts... inds) {
    return index +
           shape[N - 1] *
               ArrayIndexRowMajorReverse<N - 1>::runShape(shape, inds...);
  }
};
// Recursion terminator: with a single index left, the offset is that index.
// The shape argument is accepted only to match the general template.
template <> struct ArrayIndexRowMajorReverse<1> {
  template <typename TShape, typename T>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T idx) {
    unsigned last = idx;
    return last;
  }
  template <typename T>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T idx) {
    unsigned last = idx;
    return last;
  }
};
// Recursive row-major offset computation. N is the number of indices still
// to consume and Ndim the total rank; each step adds the current index to the
// running value and scales the sum by the extent of the next dimension.
template <int N, int Ndim> struct ArrayIndexRowMajor {
  // this array index provide almost same compiled code. compile it in
  // https://godbolt.org/ for more details.
  // Pointer-shape variant with the indices passed as arguments.
  template <typename TShape, typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
                                            T index, Ts... inds) {
    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
    return Next::run(shape, (index + start) * shape[Ndim - N + 1], inds...);
  }
  // Shape-object variant with the indices passed as arguments.
  template <typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned
  runShape(const Shape &shape, Tinit start, T index, Ts... inds) {
    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
    return Next::runShape(shape, (index + start) * shape[Ndim - N + 1],
                          inds...);
  }
  // Variant that reads the indices from an array instead of arguments.
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
    return Next::runPtrs(indexes, shape,
                         (indexes[Ndim - N] + start) * shape[Ndim - N + 1]);
  }
};
// Recursion terminator: the last index is added to the running value without
// any further scaling.
template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
  template <typename TShape, typename Tinit, typename T>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
                                            T idx) {
    unsigned total = start + idx;
    return total;
  }
  template <typename Tinit, typename T>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
                                                 Tinit start, T idx) {
    unsigned total = start + idx;
    return total;
  }
  // Array variant: the last index is taken from indexes[Ndim - 1].
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    unsigned total = start + indexes[Ndim - 1];
    return total;
  }
};
// Rank-zero case: there are no indices, so every entry point yields offset 0
// and ignores its arguments.
template <> struct ArrayIndexRowMajor<0, 0> {
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start) {
    return 0u;
  }
  template <typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
                                                 Tinit start) {
    return 0u;
  }
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    return 0u;
  }
};
// Recursive strided offset computation: accumulates index[k] * stride[k] over
// all Ndim dimensions. N is the number of indices still to consume, so the
// index handled at this level belongs to dimension Ndim - N.
template <int N, int Ndim> struct ArrayIndexStride {
  // this array index provide almost same compiled code. compile it in
  // https://godbolt.org/ for more details.
  template <typename TShape, typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
                                            T index, Ts... inds) {
    // Dimension Ndim - N pairs with stride[Ndim - N], matching the N == 1
    // terminator, which uses stride[Ndim - 1]. Using Ndim - N + 1 here would
    // skip stride[0] and apply every stride one dimension too late.
    return ArrayIndexStride<N - 1, Ndim>::run(
        stride, start + index * stride[Ndim - N], inds...);
  }
};
// Recursion terminator: adds the last index scaled by stride[Ndim - 1] to the
// running value.
template <int Ndim> struct ArrayIndexStride<1, Ndim> {
  template <typename TShape, typename Tinit, typename T>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
                                            T idx) {
    unsigned total = start + idx * stride[Ndim - 1];
    return total;
  }
};
#if __cplusplus >= 201703L
// C++17 fold-expression variant: sums stride[N] * (the N-th argument of
// `ids`) over every N in the explicitly supplied index pack.
template <size_t... N, class T, class... Ts>
TV_HOST_DEVICE_INLINE T array_index_stride(const T *stride, Ts... ids) {
  // Bundle the arguments once so each term can select its index by position.
  auto packed = std::forward_as_tuple(ids...);
  return ((stride[N] * std::get<N>(packed)) + ...);
}
#endif
namespace detail {
// Maps an element type to a short printable name, exposed as the static
// member `value`. The primary template is only declared, so asking for an
// unlisted type is a compile-time error.
template <typename T> struct TypeToString;
// A const-qualified type shares the name of its unqualified type. This
// replaces one hand-written `const` specialization per type.
template <typename T> struct TypeToString<const T> : TypeToString<T> {};
template <> struct TypeToString<bool> {
  static constexpr const char *value = "bool";
};
template <> struct TypeToString<int32_t> {
  static constexpr const char *value = "int32";
};
template <> struct TypeToString<float> {
  static constexpr const char *value = "float";
};
template <> struct TypeToString<double> {
  static constexpr const char *value = "double";
};
template <> struct TypeToString<int16_t> {
  static constexpr const char *value = "int16";
};
template <> struct TypeToString<int8_t> {
  static constexpr const char *value = "int8";
};
template <> struct TypeToString<int64_t> {
  static constexpr const char *value = "int64";
};
template <> struct TypeToString<uint8_t> {
  static constexpr const char *value = "uint8";
};
template <> struct TypeToString<uint16_t> {
  static constexpr const char *value = "uint16";
};
template <> struct TypeToString<uint32_t> {
  static constexpr const char *value = "uint32";
};
template <> struct TypeToString<uint64_t> {
  static constexpr const char *value = "uint64";
};
} // namespace detail
// Variable template shorthand for detail::TypeToString<T>::value, the
// printable name of element type T.
template <typename T>
constexpr const char *type_s = detail::TypeToString<T>::value;
namespace detail {
// Common base of the TensorAccesser types: holds a data pointer and a pointer
// to a stride array, and resolves a full Rank-sized index list to an element
// through ArrayIndexStride.
template <typename T, int Rank,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int>
struct TensorAccesserBase {
  static constexpr int rank_value = Rank;
  using ptr_t = typename PtrTraits<T>::type;
  static_assert(Rank > 0, "error");
  explicit TV_HOST_DEVICE_INLINE TensorAccesserBase(ptr_t ptr,
                                                    const Tindex *stride_ptr)
      : ptr_(ptr), stride_ptr_(stride_ptr) {}
  TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }
  TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }
  // Element access with exactly Rank indices.
  template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
    static_assert(sizeof...(inds) == Rank, "error");
    const unsigned offset =
        ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...);
    return ptr_[offset];
  }
  template <class... Inds>
  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
    static_assert(sizeof...(inds) == Rank, "error");
    const unsigned offset =
        ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...);
    return ptr_[offset];
  }

protected:
  ptr_t ptr_;
  const Tindex *stride_ptr_;
};
} // namespace detail
// Accessor for Rank > 1: operator[] fixes the leading index and returns an
// accessor of rank Rank - 1 over the remaining strides.
template <typename T, int Rank,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int>
struct TensorAccesser
    : public detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex> {
  using ptr_t = typename PtrTraits<T>::type;
  static_assert(Rank > 0, "error");
  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
                                                const Tindex *stride_ptr)
      : detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex>(ptr,
                                                               stride_ptr) {}
  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
  operator[](int i) {
    using Sub = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
    // Step i slices along the leading stride, then drop that stride.
    return Sub(this->ptr_ + this->stride_ptr_[0] * i, this->stride_ptr_ + 1);
  }
  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
  operator[](int i) const {
    using Sub = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
    return Sub(this->ptr_ + this->stride_ptr_[0] * i, this->stride_ptr_ + 1);
  }
};
// Rank-1 accessor: operator[] returns the element at i steps of the single
// remaining stride.
template <typename T, template <class> class PtrTraits, typename Tindex>
struct TensorAccesser<T, 1, PtrTraits, Tindex>
    : public detail::TensorAccesserBase<T, 1, PtrTraits, Tindex> {
  using ptr_t = typename PtrTraits<T>::type;
  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
                                                const Tindex *stride_ptr)
      : detail::TensorAccesserBase<T, 1, PtrTraits, Tindex>(ptr, stride_ptr) {}
  TV_HOST_DEVICE_INLINE T &operator[](int i) {
    return *(this->ptr_ + this->stride_ptr_[0] * i);
  }
  TV_HOST_DEVICE_INLINE T &operator[](int i) const {
    return *(this->ptr_ + this->stride_ptr_[0] * i);
  }
};
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int>
struct TensorView {
static constexpr int rank_value = Rank;
using ptr_t = typename PtrTraits<T>::type;
using tv_shape_t = ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex>;
using no_cv_type = typename std::remove_cv<T>::type;
static_assert(Rank == -1 || Rank > 0, "error");
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape)
: ptr_(ptr), shape_(shape), stride_(shape.stride_rowmajor()) {}
explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape,
tv_shape_t stride)
: ptr_(ptr), shape_(shape), stride_(stride) {}
operator TensorView<const no_cv_type, Rank, PtrTraits, Tindex>() {
return TensorView<const no_cv_type, Rank, PtrTraits, Tindex>(ptr_, shape_);
} // conversion function
template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
#if defined TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
shape_.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
shape_[i]);
}
#endif
constexpr int Ndim = sizeof...(Inds);
return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
#if defined TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
shape_.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
shape_[i]);
}
#endif
constexpr int Ndim = sizeof...(Inds);
return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
}
TV_HOST_DEVICE_INLINE T &operator()() {
static_assert(Rank == -1 || 0 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
shape_.ndim());
#endif
return ptr_[0];
}
TV_HOST_DEVICE_INLINE const T &operator()() const {
static_assert(Rank == -1 || 0 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
shape_.ndim());
#endif
return ptr_[0];
}
template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
static_assert(Rank == -1 || 1 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, shape_[0]);
#endif
return ptr_[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
static_assert(Rank == -1 || 2 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
#endif
return ptr_[i1 * shape_[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
static_assert(Rank == -1 || 3 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
#endif
return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
static_assert(Rank == -1 || 4 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
#endif
return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
}
template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
static_assert(Rank == -1 || 1 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
#endif
return ptr_[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
static_assert(Rank == -1 || 2 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
#endif
return ptr_[i1 * shape_[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
static_assert(Rank == -1 || 3 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
#endif
return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
static_assert(Rank == -1 || 4 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
#endif
return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
}
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return ptr_[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return ptr_[idx];
}
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
accessor(Tindex idx) {
static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(
ptr_ + stride_[0] * idx, stride_.data() + 1);
}
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() {
static_assert(Rank > 0, "rank must higher than zero");
return TensorAccesser<T, Rank, PtrTraits, Tindex>(ptr_, stride_.data());
}
TV_HOST_DEVICE_INLINE
TensorAccesser<T, Rank - 1, PtrTraits, Tindex> accessor(Tindex idx) const {
static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(
ptr_ + stride_[0] * idx, stride_.data() + 1);
}
TV_HOST_DEVICE_INLINE
TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() const {
static_assert(Rank > 0, "error");
return TensorAccesser<T, Rank, PtrTraits, Tindex>(
ptr_, stride_.data(), "rank must higher than zero");
}
TV_HOST_DEVICE_INLINE bool empty() const { return ptr_ == nullptr; }
TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }
TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }
TV_HOST_DEVICE_INLINE const tv_shape_t &shape() const { return shape_; }
TV_HOST_DEVICE_INLINE const tv_shape_t &stride() const { return stride_; }
TV_HOST_DEVICE_INLINE int dim(int idx) const { return shape_[idx]; }
TV_HOST_DEVICE_INLINE int ndim() const { return shape_.ndim(); }
template <class... Inds>
TV_HOST_DEVICE_INLINE
TensorView<T, Rank == -1 ? -1 : sizeof...(Inds), PtrTraits, Tindex>
view(Inds... newShapes) const {
ShapeBase<Rank == -1 ? TV_MAX_DIM : sizeof...(Inds), Tindex> shapes{
int(newShapes)...};
for (size_t i = 0; i < sizeof...(newShapes); ++i) {
if (shapes[i] == -1) {
shapes[i] = 1;
shapes[i] = size() / shapes.size();
break;
}
}
TV_ASSERT(shapes.size() == size());
return TensorView < T, Rank == -1 ? -1 : sizeof...(Inds), PtrTraits,
Tindex > (ptr_, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<T, -1, PtrTraits, Tindex>(ptr_, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex> squeeze() const {
return TensorView<T, -1, PtrTraits, Tindex>(ptr_, shape_.squeeze());
}
TV_HOST_DEVICE_INLINE
TensorView<T, Rank == -1 ? -1 : Rank - 1, PtrTraits, Tindex>
squeeze(int dim) const {
return TensorView < T, Rank == -1 ? -1 : Rank - 1, PtrTraits,
Tindex > (ptr_, shape_.squeeze < Rank == -1 ? TV_MAX_DIM
: Rank - 1 > (dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return shape_.size(); }
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(int id, Integers... ints) {
tv_shape_t start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start),
shape_.subshape(sizeof...(ints) + 1));
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(int id, Integers... ints) const {
tv_shape_t start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start),
shape_.subshape(sizeof...(ints) + 1));
}
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(SimpleVector<int> ids) const {
Shape start = ids;
for (int i = ids.size(); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
}
template <typename Os> std::string repr(Os &ss) const {
if (empty())
return "";
if (shape_.ndim() == 0) {
ss << "Tensor[" << type_s<T> << "]" << std::endl;
ss << *ptr_;
return ss.str();
}
SimpleVector<int64_t, TV_MAX_DIM> prev(ndim(), -1);
SimpleVector<int64_t, TV_MAX_DIM> nd_index(ndim());
SimpleVector<int64_t, TV_MAX_DIM> _shape;
for (auto s : shape()) {
_shape.push_back(s);
}
ss << "Tensor[" << type_s<T> << "]: shape=" << shape()
<< ", stride=" << stride() << std::endl;
auto ndimValue = ndim();
for (int64_t i = 0; i < int64_t(size()); ++i) {
rowArrayIdxInv(i, nd_index.data(), _shape.data(), ndimValue);
bool newline = false;
int end_count = 0;
for (int j = 0; j < ndimValue; ++j) {
if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
prev[j] != -1) {
ss << "]";
++end_count;
newline = true;
}
}
if (prev[0] == -1) {
end_count = ndimValue;
}
if (newline) {
ss << "\n";
}
int starts_count = 0;
for (int j = 0; j < ndimValue; ++j) {
if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
++starts_count;
}
}
if (starts_count > 0) {
for (int j = 0; j < ndimValue - end_count; ++j) {
ss << " ";
}
for (int j = 0; j < starts_count; ++j) {
ss << "[";
}
}
if (std::is_same<T, uint8_t>::value ||
std::is_same<T, const uint8_t>::value) {
ss << unsigned((*this)[i]);
} else {
ss << (*this)[i];
}
if (nd_index[ndimValue - 1] != _shape[ndimValue - 1] - 1) {
ss << ",";
}
for (int j = 0; j < ndimValue; ++j) {
prev[j] = nd_index[j];
}
}
for (int j = 0; j < ndimValue; ++j) {
ss << "]";
}
return ss.str();
}
std::string repr() const {
std::ostringstream ss;
return repr(ss);
}
protected:
template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
ptr_t ptr_ = nullptr;
tv_shape_t shape_;
tv_shape_t stride_;
};
// Wraps a std::vector as a one-dimensional TensorView over its storage; the
// vector is not copied.
template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
  // The default shape holds int extents, so convert explicitly: a bare
  // size_t inside braces is a narrowing conversion.
  return TensorView<T>(arr.data(), {static_cast<int>(arr.size())});
}
// Wraps a std::vector as a TensorView with the given shape; the product of
// the extents must equal the vector's length.
template <typename T>
TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
  TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
  T *const storage = arr.data();
  return TensorView<T>(storage, shape);
}
// Wraps a const std::vector as a one-dimensional read-only TensorView over
// its storage; the vector is not copied.
template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
  // The default shape holds int extents, so convert explicitly: a bare
  // size_t inside braces is a narrowing conversion.
  return TensorView<const T>(arr.data(), {static_cast<int>(arr.size())});
}
// Streams the textual representation produced by TensorView::repr().
template <typename Os, typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
Os &operator<<(Os &os, const TensorView<T, Rank, PtrTraits, Tindex> &dt) {
  const std::string text = dt.repr();
  os << text;
  return os;
}
// Overload for views over const elements; streams the textual representation
// produced by TensorView::repr().
template <typename Os, typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
Os &operator<<(Os &os, const TensorView<const T, Rank, PtrTraits, Tindex> &dt) {
  const std::string text = dt.repr();
  os << text;
  return os;
}
namespace detail {
// Maps an element type to the printf conversion used to print one value of
// it. The primary template is only declared, so an unlisted type is a
// compile-time error.
template <typename T> struct TypePrintfFormat;

// Floating point: two digits after the decimal point.
template <> struct TypePrintfFormat<float> {
  static constexpr const char *value = "%.2f";
};
template <> struct TypePrintfFormat<double> {
  static constexpr const char *value = "%.2f";
};

// bool and signed integers up to 32 bits.
template <> struct TypePrintfFormat<bool> {
  static constexpr const char *value = "%d";
};
template <> struct TypePrintfFormat<int8_t> {
  static constexpr const char *value = "%d";
};
template <> struct TypePrintfFormat<int16_t> {
  static constexpr const char *value = "%d";
};
template <> struct TypePrintfFormat<int32_t> {
  static constexpr const char *value = "%d";
};

// Unsigned integers up to 32 bits.
template <> struct TypePrintfFormat<uint8_t> {
  static constexpr const char *value = "%u";
};
template <> struct TypePrintfFormat<uint16_t> {
  static constexpr const char *value = "%u";
};
template <> struct TypePrintfFormat<uint32_t> {
  static constexpr const char *value = "%u";
};

// 64-bit integers.
template <> struct TypePrintfFormat<int64_t> {
  static constexpr const char *value = "%ld";
};
template <> struct TypePrintfFormat<uint64_t> {
  static constexpr const char *value = "%lu";
};

// Variable template shorthand for TypePrintfFormat<T>::value.
template <typename T>
constexpr const char *type_printf_format_v = TypePrintfFormat<T>::value;
} // namespace detail
// Prints `tensor` with printf, formatting every element with `format`.
// Usable from host and device code (intended for debugging inside kernels).
//
// - An empty tensor prints nothing.
// - A zero-dimensional tensor prints its single value followed by a newline.
// - Otherwise the elements are visited in flat order 0..size()-1; each flat
//   index is unravelled with rowArrayIdxInv and nested brackets are emitted
//   whenever a dimension index wraps around to 0, e.g. a 2x2 tensor prints as
//     [[a,b]
//      [c,d]]
//
// `format` must be a printf conversion matching the (promoted) element type.
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
TV_HOST_DEVICE void
printTensorView(const TensorView<T, Rank, PtrTraits, Tindex> &tensor,
                const char *format) {
  // used to print tensor in cuda kernel.
  if (tensor.empty())
    return;
  if (tensor.ndim() == 0) {
    printf(format, tensor());
    printf("\n");
    return;
  }
  // prev holds the multi-dimensional index of the previously printed element;
  // -1 marks "nothing printed yet".
  SimpleVector<int64_t, TV_MAX_DIM> prev(tensor.ndim(), -1);
  SimpleVector<int64_t, TV_MAX_DIM> nd_index(tensor.ndim());
  SimpleVector<int64_t, TV_MAX_DIM> shape(tensor.shape());
  auto ndim = tensor.ndim();
  for (int64_t i = 0; i < tensor.size(); ++i) {
    rowArrayIdxInv(i, nd_index.data(), shape.data(), ndim);
    // Close one bracket for every dimension whose index just wrapped to 0.
    bool newline = false;
    int end_count = 0;
    for (int j = 0; j < ndim; ++j) {
      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
          prev[j] != -1) {
        printf("]");
        ++end_count;
        newline = true;
      }
    }
    // First element: no indentation is wanted before the opening brackets.
    if (prev[0] == -1) {
      end_count = ndim;
    }
    if (newline) {
      printf("\n");
    }
    // Count the dimensions that (re)start at index 0 with this element.
    int starts_count = 0;
    for (int j = 0; j < ndim; ++j) {
      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
        ++starts_count;
      }
    }
    if (starts_count > 0) {
      // Indent so the new opening brackets line up under the enclosing ones.
      for (int j = 0; j < ndim - end_count; ++j) {
        printf(" ");
      }
      // Open one bracket per restarting dimension. The original printed "]"
      // here, which produced unbalanced output; repr() uses "[" for the
      // equivalent step.
      for (int j = 0; j < starts_count; ++j) {
        printf("[");
      }
    }
    printf(format, tensor[i]);
    // Separate elements within the innermost dimension.
    if (nd_index[ndim - 1] != shape[ndim - 1] - 1) {
      printf(",");
    }
    for (int j = 0; j < ndim; ++j) {
      prev[j] = nd_index[j];
    }
  }
  // Close every dimension that is still open after the last element.
  for (int j = 0; j < ndim; ++j) {
    printf("]");
  }
  printf("\n");
}
// Prints `tensor` using the default printf format registered for its element
// type (const qualification is stripped before the lookup in
// detail::type_printf_format_v).
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
TV_HOST_DEVICE void
printTensorView(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  typedef typename std::remove_const<T>::type ElemType;
  printTensorView(tensor, detail::type_printf_format_v<ElemType>);
}
// Prints the elements starting at `ptr`, interpreted with `shape`, using the
// default printf format registered for T in detail::type_printf_format_v.
// The data is wrapped in a read-only TensorView; nothing is copied.
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
  typedef typename std::remove_const<T>::type ElemType;
  TensorView<const T> view(ptr, shape);
  printTensorView(view, detail::type_printf_format_v<ElemType>);
}
// Prints the elements starting at `ptr`, interpreted with `shape`, formatting
// each one with the caller-supplied printf `format`.
// The data is wrapped in a read-only TensorView; nothing is copied.
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
                                    const char *format) {
  TensorView<const T> view(ptr, shape);
  printTensorView(view, format);
}
#ifdef TV_CUDA
// DEVICE_RESET is expanded by check() right before the process exits.
// When __DRIVER_TYPES_H__ is defined it expands to a cudaDeviceReset() call;
// otherwise it expands to nothing. An existing definition of DEVICE_RESET is
// never overridden.
// NOTE(review): __DRIVER_TYPES_H__ is presumably the include guard of the CUDA
// driver_types.h header - confirm.
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
// Terminates the process when `result` is a non-zero (failing) status code.
//
//   result - status value returned by the checked call; zero means success.
//   func   - source text of the checked expression (for the diagnostic).
//   file   - source file of the call site.
//   line   - source line of the call site.
//
// On failure a diagnostic is written to stderr, DEVICE_RESET is expanded and
// the process exits with EXIT_FAILURE. On success the function does nothing.
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    // Cast to int so the argument type matches the %d conversion (the
    // original passed an unsigned int to %d).
    fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
            static_cast<int>(result), func);
    // Make sure we call CUDA Device Reset before exiting
    DEVICE_RESET
    exit(EXIT_FAILURE);
  }
}
// Evaluates `val` once and passes its status to tv::check together with the
// stringified expression and the call site's file and line.
#define checkCudaErrors(val) tv::check((val), #val, __FILE__, __LINE__)
// Issues a cudaMemcpyAsync with kind cudaMemcpyHostToDevice that copies
// `size` elements of T (size * sizeof(T) bytes) from `src` to `dst` on
// stream `s` (default 0).
// The returned status goes through checkCudaErrors, which prints a diagnostic
// and exits the process on failure.
template <typename T>
void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
}
// TensorView front end for the pointer-based host2dev: copies
// min(dst.size(), src.size()) elements from the read-only view `src` into
// `dst` on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2dev(dst.data(), src.data(), count, s);
}
// Same as the const-source overload, for a source view over mutable
// elements: copies min(dst.size(), src.size()) elements from `src` into `dst`
// on stream `s` via the pointer-based host2dev.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2dev(dst.data(), src.data(), count, s);
}
// Copies `size` elements of T (size * sizeof(T) bytes) from `src` to `dst`
// with cudaMemcpy and kind cudaMemcpyHostToDevice. Unlike host2dev this uses
// the non-Async call and takes no stream.
// The returned status goes through checkCudaErrors, which prints a diagnostic
// and exits the process on failure.
template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
checkCudaErrors(
cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
}
// TensorView front end for the pointer-based host2dev_sync: copies
// min(dst.size(), src.size()) elements from the read-only view `src` into
// `dst`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
                   const TensorView<const T, Rank, PtrTraits2, Tindex2> src) {
  const auto count = std::min(dst.size(), src.size());
  host2dev_sync(dst.data(), src.data(), count);
}
// Same as the const-source overload, for a source view over mutable
// elements: copies min(dst.size(), src.size()) elements from `src` into `dst`
// via the pointer-based host2dev_sync.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
                   const TensorView<T, Rank, PtrTraits2, Tindex2> src) {
  const auto count = std::min(dst.size(), src.size());
  host2dev_sync(dst.data(), src.data(), count);
}
// Issues a cudaMemcpyAsync with kind cudaMemcpyDeviceToHost that copies
// `size` elements of T (size * sizeof(T) bytes) from `src` to `dst` on
// stream `s` (default 0).
// The returned status goes through checkCudaErrors, which prints a diagnostic
// and exits the process on failure.
template <typename T>
void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
}
// TensorView front end for the pointer-based dev2host: copies
// min(dst.size(), src.size()) elements from the read-only view `src` into
// `dst` on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2host(dst.data(), src.data(), count, s);
}
// Same as the const-source overload, for a source view over mutable
// elements: copies min(dst.size(), src.size()) elements from `src` into `dst`
// on stream `s` via the pointer-based dev2host.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2host(dst.data(), src.data(), count, s);
}
// Issues a cudaMemcpyAsync with kind cudaMemcpyDeviceToDevice that copies
// `size` elements of T (size * sizeof(T) bytes) from `src` to `dst` on
// stream `s` (default 0).
// The returned status goes through checkCudaErrors, which prints a diagnostic
// and exits the process on failure.
template <typename T>
void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
}
// TensorView front end for the pointer-based dev2dev: copies
// min(dst.size(), src.size()) elements from the read-only view `src` into
// `dst` on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
             const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
             cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2dev(dst.data(), src.data(), count, s);
}
// Same as the const-source overload, for a source view over mutable
// elements: copies min(dst.size(), src.size()) elements from `src` into `dst`
// on stream `s` via the pointer-based dev2dev.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
             const TensorView<T, Rank, PtrTraits2, Tindex2> src,
             cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2dev(dst.data(), src.data(), count, s);
}
// Issues a cudaMemcpyAsync with kind cudaMemcpyHostToHost that copies
// `size` elements of T (size * sizeof(T) bytes) from `src` to `dst` on
// stream `s` (default 0).
// The returned status goes through checkCudaErrors, which prints a diagnostic
// and exits the process on failure.
template <typename T>
void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
}
// TensorView front end for the pointer-based host2host: copies
// min(dst.size(), src.size()) elements from the read-only view `src` into
// `dst` on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
               const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
               cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2host(dst.data(), src.data(), count, s);
}
// Same as the const-source overload, for a source view over mutable
// elements: copies min(dst.size(), src.size()) elements from `src` into `dst`
// on stream `s` via the pointer-based host2host.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
               const TensorView<T, Rank, PtrTraits2, Tindex2> src,
               cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2host(dst.data(), src.data(), count, s);
}
// Sets every byte of the tensor's storage to 0 with cudaMemset:
// tensor.size() * sizeof(T) bytes starting at tensor.data().
// The returned status goes through checkCudaErrors, which prints a diagnostic
// and exits the process on failure.
template <typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
}
// Stream variant of zero_dev: sets tensor.size() * sizeof(T) bytes starting
// at tensor.data() to 0 with cudaMemsetAsync on stream `s`.
// The returned status goes through checkCudaErrors, which prints a diagnostic
// and exits the process on failure.
template <typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor, cudaStream_t s) {
checkCudaErrors(
cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
}
// Assigns 0 to each of the tensor.size() elements starting at tensor.data(),
// using std::fill on the calling thread.
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
void zero_host(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  auto first = tensor.data();
  auto last = first + tensor.size();
  std::fill(first, last, 0);
}
#endif
} // namespace tv
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment