Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
01ed382c
Commit
01ed382c
authored
Oct 18, 2021
by
yan.yan
Browse files
working on tensor core test
parent
3517290c
Changes
159
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
4632 deletions
+0
-4632
include/spconv/point2voxel_ops.h
include/spconv/point2voxel_ops.h
+0
-30
include/spconv/points2voxels.h
include/spconv/points2voxels.h
+0
-22
include/spconv/pool_ops.h
include/spconv/pool_ops.h
+0
-35
include/spconv/reordering.cu.h
include/spconv/reordering.cu.h
+0
-432
include/spconv/reordering.h
include/spconv/reordering.h
+0
-47
include/spconv/spconv_ops.h
include/spconv/spconv_ops.h
+0
-58
include/spgemm/gemm.h
include/spgemm/gemm.h
+0
-81
include/spgemm/gemm_th.h
include/spgemm/gemm_th.h
+0
-11
include/sphash/hashmap.h
include/sphash/hashmap.h
+0
-11
include/tensorrt/inference.h
include/tensorrt/inference.h
+0
-207
include/tensorview/cc17.h
include/tensorview/cc17.h
+0
-264
include/tensorview/common.h
include/tensorview/common.h
+0
-94
include/tensorview/cuda_utils.h
include/tensorview/cuda_utils.h
+0
-31
include/tensorview/eigen_utils.h
include/tensorview/eigen_utils.h
+0
-41
include/tensorview/kernel_utils.h
include/tensorview/kernel_utils.h
+0
-72
include/tensorview/mp_helper.h
include/tensorview/mp_helper.h
+0
-56
include/tensorview/prettyprint.h
include/tensorview/prettyprint.h
+0
-475
include/tensorview/pybind_utils.h
include/tensorview/pybind_utils.h
+0
-170
include/tensorview/tensor.h
include/tensorview/tensor.h
+0
-992
include/tensorview/tensorview.h
include/tensorview/tensorview.h
+0
-1503
No files found.
include/spconv/point2voxel_ops.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2020 xmyqsh
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <spconv/points2voxels.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {

/// Declaration only; the definition is not part of this header.
///
/// Takes seven torch tensors, the grid shape and the dimensionality, and
/// returns a 64-bit integer.
/// NOTE(review): presumably the return value is the number of voxels that
/// were produced -- confirm against the definition.
int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
                      torch::Tensor pointIndex, torch::Tensor grids,
                      torch::Tensor numPointsPerGrid, torch::Tensor voxels,
                      torch::Tensor coors, std::vector<int64_t> gridShape,
                      const int64_t ndim);

} // namespace spconv
include/spconv/points2voxels.h
deleted
100644 → 0
View file @
3517290c
#pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {

/// Declaration only; the definition is not part of this header.
/// NOTE(review): judging by the name this is the CUDA "scatter" half of the
/// point-to-voxel conversion -- confirm against the implementation.
void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
                                torch::Tensor grids,
                                torch::Tensor numPointsPerGrid,
                                torch::Tensor pointIndex,
                                std::vector<int64_t> gridShape,
                                const int ndim);

/// Declaration only; the definition is not part of this header.
/// NOTE(review): judging by the name this is the CUDA "gather" half that
/// fills `voxels` / `coors` -- confirm against the implementation.
void gather_point_from_grid_cuda(torch::Tensor grids,
                                 torch::Tensor numPointsPerGrid,
                                 torch::Tensor pointIndex,
                                 torch::Tensor pointIndexUnique,
                                 torch::Tensor voxels, torch::Tensor coors,
                                 std::vector<int64_t> gridShape,
                                 const int ndim);

} // namespace spconv
include/spconv/pool_ops.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_
#include <spconv/maxpool.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {

/// Forward max-pool entry point (declaration only).
/// Takes the input features, the indice-pair tensor, the per-offset pair
/// counts and `numAct`, and returns a tensor.
/// NOTE(review): `numAct` is presumably the number of output rows -- confirm.
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
                            torch::Tensor indiceNum, int64_t numAct);

/// Backward max-pool entry point (declaration only).
/// Takes the forward input and output features, the output gradient and the
/// same indice-pair tensors as the forward pass, and returns a tensor.
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum);

} // namespace spconv
#endif
\ No newline at end of file
include/spconv/reordering.cu.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <THC/THCNumerics.cuh>
#include <cuda_fp16.h>
#include <tensorview/kernel_utils.h>
#if PYTORCH_VERSION < 10500
#define TH_ATOMIC_ADD atomicAdd
#else
#define TH_ATOMIC_ADD gpuAtomicAdd
#endif
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace
spconv
{
/// Gathers rows of `features` into consecutive rows of `buffer`.
///
/// For every row position `p < size` handled by this thread, the row of
/// `features` selected by `indices[p]` (numPlanes elements wide) is copied to
/// row `p` of `buffer`. Each outer iteration covers up to NumILP positions
/// that are `gridDim.x * blockDim.x` apart; positions at or beyond `size`
/// are skipped. NumTLP is not referenced in the body.
///
/// NOTE(review): tv::KernelLoopX / tv::KernelLoopY are defined elsewhere;
/// they presumably enumerate the x (row) and y (plane) work items of the
/// calling thread -- confirm in tensorview/kernel_utils.h.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int rowStep[NumILP];   // row offset of each ILP slot
  Index srcBase[NumILP]; // element offset of the source row per slot
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
    // Resolve the source rows once, before walking the planes.
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowStep[k];
      if (row < size) {
        srcBase[k] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowStep[k];
        if (row < size) {
          buffer[row * numPlanes + iy] = features[srcBase[k] + iy];
        }
      }
    }
  }
}
/// Vectorised variant of the generic gather: identical control flow, but
/// every load/store moves one `VecType` instead of one `T`.
///
/// `buffer` and `features` are reinterpreted as arrays of `VecType`, so
/// `numPlanes` and all offsets are counted in `VecType` units here.
/// NOTE(review): callers must guarantee both pointers are suitably aligned
/// for `VecType` -- this is not checked in the kernel.
/// NumTLP is not referenced in the body.
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void gatherVecKernel(T *buffer, const T *features,
                                const Index *indices, int size,
                                int numPlanes) {
  VecType *dst = reinterpret_cast<VecType *>(buffer);
  const VecType *src = reinterpret_cast<const VecType *>(features);
  int rowStep[NumILP];   // row offset of each ILP slot
  Index srcBase[NumILP]; // VecType offset of the source row per slot
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowStep[k];
      if (row < size) {
        srcBase[k] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowStep[k];
        if (row < size) {
          dst[row * numPlanes + iy] = src[srcBase[k] + iy];
        }
      }
    }
  }
}
/// Block-tiled vector gather.
///
/// `features` and `buffer` are first advanced by `blockIdx.y * NumTLP`
/// elements of `T`; each thread then copies the `VecType` at column
/// `threadIdx.y` of the selected source row to the same column of the
/// destination row.
///
/// Unlike the generic kernels there is NO `row < size` guard.
/// NOTE(review): this presumably relies on the launcher passing a `size`
/// that is an exact multiple of the rows covered per iteration -- confirm.
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int rowStep[NumILP];
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  // Select the column tile owned by this block before reinterpreting.
  features += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  VecType *dst = reinterpret_cast<VecType *>(buffer);
  const VecType *src = reinterpret_cast<const VecType *>(features);

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowStep[k];
      dst[row * numPlanes + threadIdx.y] =
          src[indices[row] * numPlanes + threadIdx.y];
    }
  }
}
/// Batched gather with zero fill.
///
/// Row position `p` is mapped to the index entry
/// `indices[(p / feature_batch_stride) * indice_batch_stride
///          + p % feature_batch_stride]`.
/// An entry of -1 writes `T(0)` to the whole destination row; any other
/// value selects the source row of `features` to copy.
/// NumTLP is not referenced in the body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchGatherGenericKernel(T *buffer, const T *features,
                                         const Index *indices, int size,
                                         int numPlanes,
                                         int indice_batch_stride,
                                         int feature_batch_stride) {
  // size: max indice num * kernel volume
  // inds: [volume, num_elems]
  int rowStep[NumILP];
  Index srcRow[NumILP];
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      if (ix + rowStep[k] < size) {
        const Index elem = ix + rowStep[k];
        srcRow[k] = indices[(elem / feature_batch_stride) * indice_batch_stride +
                            elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowStep[k];
        if (row >= size) {
          continue;
        }
        if (srcRow[k] == -1) {
          // No source row for this slot: emit zeros.
          buffer[row * numPlanes + iy] = T(0);
        } else {
          buffer[row * numPlanes + iy] = features[srcRow[k] * numPlanes + iy];
        }
      }
    }
  }
}
/// Vectorised batched gather with zero fill.
///
/// Row position `p` is shifted by `feature_offset` and mapped to the index
/// entry `indices[(q / feature_batch_stride) * indice_batch_stride
///                + q % feature_batch_stride]` with `q = p + feature_offset`.
/// An entry of -1 stores an all-zero `VecType`; any other value selects the
/// source row. `buffer` / `features` are accessed as arrays of `VecType`, so
/// `numPlanes` is counted in `VecType` units. The destination row is `p`
/// (the offset is applied to the index lookup only).
/// NumTLP is not referenced in the body.
///
/// Fix: the zero scratch array used to be declared with element type `Index`
/// although it is filled with `T(0)` and read back as one `VecType`. It was
/// therefore too small whenever sizeof(T) > sizeof(Index) and carried no
/// alignment guarantee for the vector read. It is now an array of `T`
/// aligned to `VecType`.
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void batchGatherVecKernel(T *buffer, const T *features,
                                     const Index *indices, int size,
                                     int feature_offset, int numPlanes,
                                     int indice_batch_stride,
                                     int feature_batch_stride) {
  static_assert(sizeof(VecType) % sizeof(T) == 0,
                "VecType must hold a whole number of T elements");
  constexpr int kVecElems = sizeof(VecType) / sizeof(T);

  int ILPStrideX[NumILP];
  Index inds[NumILP];
  // One VecType worth of zeros, readable through a VecType pointer.
  alignas(VecType) T zero[kVecElems];
#pragma unroll
  for (int i = 0; i < kVecElems; ++i) {
    zero[i] = T(0);
  }
  Index inds_elem;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size) {
        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
        inds[ilp] =
            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
                    inds_elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          if (inds[ilp] != -1) {
            reinterpret_cast<VecType *>(
                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
                reinterpret_cast<const VecType *>(
                    features)[inds[ilp] * numPlanes + iy];
          } else {
            reinterpret_cast<VecType *>(
                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
                reinterpret_cast<const VecType *>(zero)[0];
          }
        }
      }
    }
  }
}
/// Block-tiled, vectorised batched gather with zero fill.
///
/// `features` and `buffer` are first advanced by `blockIdx.y * NumTLP`
/// elements of `T`. For each row position `p` the index entry
/// `indices[(p / feature_batch_stride) * indice_batch_stride
///          + p % feature_batch_stride]` is looked up; -1 stores an all-zero
/// `VecType` at column `threadIdx.y`, anything else copies the `VecType` at
/// that column of the selected source row.
///
/// There is NO `row < size` guard in this kernel.
/// NOTE(review): presumably the launcher only uses it for a `size` that is an
/// exact multiple of the rows covered per iteration -- confirm.
///
/// Fix: the zero scratch array used to be declared with element type `Index`
/// although it is filled with `T(0)` and read back as one `VecType`; it is
/// now an array of `T` aligned to `VecType`, so it is always large enough
/// and correctly aligned for the vector read.
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void batchGatherVecBlockKernel(T *buffer, const T *features,
                                          const Index *indices, int size,
                                          int numPlanes,
                                          int indice_batch_stride,
                                          int feature_batch_stride) {
  static_assert(sizeof(VecType) % sizeof(T) == 0,
                "VecType must hold a whole number of T elements");
  constexpr int kVecElems = sizeof(VecType) / sizeof(T);

  int ILPStrideX[NumILP];
  Index inds;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  // Select the column tile owned by this block.
  features += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  Index inds_elem;
  // One VecType worth of zeros, readable through a VecType pointer.
  alignas(VecType) T zero[kVecElems];
#pragma unroll
  for (int i = 0; i < kVecElems; ++i) {
    zero[i] = T(0);
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      inds_elem = ix + ILPStrideX[ilp];
      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
                     inds_elem % feature_batch_stride];
      if (inds != -1) {
        reinterpret_cast<VecType *>(
            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
            reinterpret_cast<const VecType *>(
                features)[inds * numPlanes + threadIdx.y];
      } else {
        reinterpret_cast<VecType *>(
            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
            reinterpret_cast<const VecType *>(zero)[0];
      }
    }
  }
}
/// Adds consecutive rows of `buffer` onto selected rows of `outFeatures`.
///
/// For every row position `p < size` handled by this thread, row `p` of
/// `buffer` is accumulated with a plain `+=` into the row of `outFeatures`
/// selected by `indices[p]`.
/// NOTE(review): the accumulation is not atomic, so this presumably relies
/// on `indices` holding no duplicates within one launch -- confirm.
/// NumTLP is not referenced in the body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int rowStep[NumILP];   // row offset of each ILP slot
  Index dstBase[NumILP]; // element offset of the destination row per slot
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowStep[k];
      if (row < size) {
        dstBase[k] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowStep[k];
        if (row < size) {
          outFeatures[dstBase[k] + iy] += buffer[row * numPlanes + iy];
        }
      }
    }
  }
}
/// Block-tiled scatter-add that moves one `VecType` per load/store.
///
/// `outFeatures` and `buffer` are first advanced by `blockIdx.y * NumTLP`
/// elements of `T`. For each row position the destination vector and the
/// source vector are loaded into local arrays of `T`, added element-wise and
/// the sum is stored back. When `T` is `at::Half` and the device
/// architecture is >= 530 the addition uses `__hadd2` on `__half2` pairs;
/// otherwise it is a scalar `+=` per element.
///
/// There is NO `row < size` guard and the read-modify-write is not atomic.
/// NOTE(review): presumably the launcher guarantees an exact tiling of
/// `size` and duplicate-free `indices` -- confirm.
///
/// Fix: the two local scratch arrays are accessed through `VecType*`, which
/// requires `VecType` alignment; they were plain `T[]` (alignment of `T`
/// only). They are now declared `alignas(VecType)`.
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideX[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
  constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  // Select the column tile owned by this block.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  alignas(VecType) T buf[vecloadFactor];  // current destination values
  alignas(VecType) T buf2[vecloadFactor]; // values to add
  Index idx;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] =
          reinterpret_cast<const VecType *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
      if (std::is_same<T, at::Half>::value) {
#if __CUDA_ARCH__ >= 530
        // Packed half2 adds are only available from sm_53 upwards.
#pragma unroll
        for (int i = 0; i < vecloadHalf2Factor; i++) {
          reinterpret_cast<__half2 *>(buf)[i] =
              __hadd2(reinterpret_cast<__half2 *>(buf)[i],
                      reinterpret_cast<__half2 *>(buf2)[i]);
        }
#else
#pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          buf[i] += buf2[i];
        }
#endif
      } else {
#pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          buf[i] += buf2[i];
        }
      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}
/// Block-tiled scalar scatter-add.
///
/// `outFeatures` and `buffer` are first advanced by `blockIdx.y * NumTLP`
/// elements; each thread then adds the element at column `threadIdx.y` of
/// source row `p` onto the same column of the destination row selected by
/// `indices[p]`.
///
/// There is NO `row < size` guard and the `+=` is not atomic.
/// NOTE(review): presumably the launcher guarantees an exact tiling of
/// `size` and duplicate-free `indices` -- confirm.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
                                      const Index *indices, int size,
                                      int numPlanes) {
  int rowStep[NumILP];
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  // Select the column tile owned by this block.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int row = ix + rowStep[k];
      outFeatures[indices[row] * numPlanes + threadIdx.y] +=
          buffer[row * numPlanes + threadIdx.y];
    }
  }
}
#if __CUDA_ARCH__ >= 530
/// Block-tiled scatter-add that treats both arrays as `__half2` and adds
/// with `__hadd2`.
///
/// `outFeatures` and `buffer` are first advanced by `blockIdx.y * NumTLP`
/// elements of `T`; afterwards all offsets (`numPlanes`, `threadIdx.y`) are
/// counted in `__half2` units. There is NO `row < size` guard and the
/// read-modify-write is not atomic.
///
/// Fix: `buffer` is a `const T*`; the original cast it to a non-const
/// `__half2*`, which casts away constness and is rejected as soon as the
/// template is instantiated. The source is now read through
/// `const __half2*`.
///
/// NOTE(review): this kernel sits inside an `#if __CUDA_ARCH__ >= 530`
/// region, so the host compilation pass never sees it. The CUDA programming
/// guide advises against making the existence of a __global__ function
/// depend on __CUDA_ARCH__ -- consider moving the guard into the body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
                                          const Index *indices, int size,
                                          int numPlanes) {
  int ILPStrideX[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  // Select the column tile owned by this block.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;
  __half2 *out = reinterpret_cast<__half2 *>(outFeatures);
  const __half2 *src = reinterpret_cast<const __half2 *>(buffer);
  Index idx;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
      out[idx] = __hadd2(
          out[idx], src[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
    }
  }
}
#endif
/// Batched scatter-add using atomic accumulation.
///
/// Row position `p` is shifted by `feature_offset` and mapped to the index
/// entry `indices[(q / feature_batch_stride) * indice_batch_stride
///                + q % feature_batch_stride]` with `q = p + feature_offset`.
/// Entries equal to -1 are skipped; for all others row `p` of `buffer` is
/// added onto the selected row of `outFeatures` through TH_ATOMIC_ADD.
/// NumTLP is not referenced in the body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
                                             const Index *indices, int size,
                                             int feature_offset, int numPlanes,
                                             int indice_batch_stride,
                                             int feature_batch_stride) {
  // Observed by the original author: the batched scatter-add is much slower
  // than the native scatter when there are many points (possibly because of
  // the atomic adds) and much faster when there are few.
  int rowStep[NumILP];
  Index dstRow[NumILP];
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      if (ix + rowStep[k] < size) {
        const Index elem = ix + rowStep[k] + feature_offset;
        dstRow[k] = indices[(elem / feature_batch_stride) * indice_batch_stride +
                            elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const int row = ix + rowStep[k];
        // The range test must come first: dstRow[k] is only set for rows
        // inside [0, size).
        if (row < size && dstRow[k] != -1) {
          TH_ATOMIC_ADD(outFeatures + dstRow[k] * numPlanes + iy,
                        buffer[row * numPlanes + iy]);
        }
      }
    }
  }
}
/// Block-tiled batched scatter-add using atomic accumulation.
///
/// `outFeatures` and `buffer` are first advanced by `blockIdx.y * NumTLP`
/// elements. For each row position `p` the index entry
/// `indices[(p / feature_batch_stride) * indice_batch_stride
///          + p % feature_batch_stride]` is looked up; -1 is skipped,
/// otherwise the element at column `threadIdx.y` of source row `p` is added
/// onto the same column of the selected destination row via TH_ATOMIC_ADD.
///
/// There is NO `row < size` guard in this kernel.
/// NOTE(review): presumably the launcher guarantees an exact tiling of
/// `size` -- confirm.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
                                           const Index *indices, int size,
                                           int numPlanes,
                                           int indice_batch_stride,
                                           int feature_batch_stride) {
  int rowStep[NumILP];
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    rowStep[k] = k * gridDim.x * blockDim.x;
  }
  // Select the column tile owned by this block.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const Index elem = ix + rowStep[k];
      const Index dstRow =
          indices[(elem / feature_batch_stride) * indice_batch_stride +
                  elem % feature_batch_stride];
      if (dstRow == -1) {
        continue;
      }
      TH_ATOMIC_ADD(outFeatures + dstRow * numPlanes + threadIdx.y,
                    buffer[(ix + rowStep[k]) * numPlanes + threadIdx.y]);
    }
  }
}
}
// namespace spconv
#undef TH_ATOMIC_ADD
#endif
\ No newline at end of file
include/spconv/reordering.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <cuda_runtime_api.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {

// All functions below are declarations only; their definitions are not part
// of this header. NOTE(review): `size` is presumably the number of rows to
// process -- confirm against the implementations.

/// Batched gather on CUDA.
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                              torch::Tensor indices, int size);

/// Batched scatter-add on CUDA.
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
                                   torch::Tensor outFeatures,
                                   torch::Tensor indices, int size);

/// Gather on CUDA (no explicit stream argument).
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                        torch::Tensor indices, int size);

/// Scatter-add on CUDA (no explicit stream argument).
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
                             torch::Tensor indices, int size);

/// Gather on the CPU.
void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
                       torch::Tensor indices, int size);

/// Scatter-add on the CPU.
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
                            torch::Tensor indices, int size);

/// Gather on CUDA; overload that takes the stream `s` explicitly.
void sparse_gather_cuda(cudaStream_t s, torch::Tensor buffer,
                        torch::Tensor features, torch::Tensor indices,
                        int size);

/// Scatter-add on CUDA; overload that takes the stream `s` explicitly.
void sparse_scatter_add_cuda(cudaStream_t s, torch::Tensor buffer,
                             torch::Tensor outFeatures, torch::Tensor indices,
                             int size);

} // namespace spconv
#endif
\ No newline at end of file
include/spconv/spconv_ops.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace
spconv
{
/// Identifiers of the available sparse-convolution algorithms.
/// The numeric values are consecutive starting at 0; the `algo` arguments of
/// the convolution entry points are plain integers, so keep them stable.
enum ConvAlgo {
  kNative = 0,
  kBatch = 1,
  kBatchGemmGather = 2,
  kSparseConvNet = 3,
  kMinkowskiEngine = 4
};
/// Compile-time list of every ConvAlgo value.
using all_conv_algos_t =
    tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather, kSparseConvNet,
                  kMinkowskiEngine>;

// torch.jit's doc says only support int64, so we need to convert to int32.

/// Declaration only. Returns a list of tensors.
/// NOTE(review): `_subM`, `_transpose` and `_useHash` are presumably boolean
/// flags passed as int64 because of the torch.jit restriction above --
/// confirm against the definition.
std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
               std::vector<int64_t> outSpatialShape,
               std::vector<int64_t> spatialShape,
               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
               std::vector<int64_t> padding, std::vector<int64_t> dilation,
               std::vector<int64_t> outPadding, int64_t _subM,
               int64_t _transpose, int64_t _useHash);

/// Declaration only. Forward sparse convolution; returns a tensor.
/// NOTE(review): `algo` presumably carries a ConvAlgo value -- confirm.
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM,
                         int64_t algo);

/// Declaration only. Backward sparse convolution; returns a list of tensors.
/// NOTE(review): presumably the gradients w.r.t. features and filters --
/// confirm against the definition.
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
                   int64_t algo);
}
// namespace spconv
#endif
\ No newline at end of file
include/spgemm/gemm.h
deleted
100644 → 0
View file @
3517290c
#pragma once
#include <cutlass/gemm/device/gemm.h>
#include <type_traits>
namespace
spconv
{
template
<
typename
T
>
using
determine_acc_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
cutlass
::
half_t
>::
value
,
float
,
T
>
;
/// Runs one CUTLASS device GEMM on stream `s`.
///
/// @tparam T       element type of A, B and C; the accumulator type is
///                 `determine_acc_t<T>`.
/// @tparam TransA  true selects a column-major layout for A, false row-major.
/// @tparam TransB  same for B.
/// @tparam TransC  same for C.
/// @param M,N,K    problem dimensions.
/// @param alpha,beta  epilogue scalars.
/// @param A,lda / B,ldb / C,ldc  operand pointers and leading dimensions.
///                 C is passed both as the source and as the destination
///                 operand.
/// @return `cudaSuccess` when CUTLASS reports `Status::kSuccess`, otherwise
///         `cudaErrorUnknown`.
///
/// NOTE(review): by CUTLASS convention this presumably computes
/// C = alpha * A * B + beta * C -- confirm against the CUTLASS version in use.
template <typename T, bool TransA, bool TransB, bool TransC>
cudaError_t cutlassGemm(cudaStream_t s, int M, int N, int K, T alpha,
                        T const *A, int lda, T const *B, int ldb, T beta, T *C,
                        int ldc) {
  using Accumulator = determine_acc_t<T>;
  using ColMajor = cutlass::layout::ColumnMajor;
  using RowMajor = cutlass::layout::RowMajor;

  // A "transposed" operand is described to CUTLASS as column-major.
  using LayoutA = typename std::conditional<TransA, ColMajor, RowMajor>::type;
  using LayoutB = typename std::conditional<TransB, ColMajor, RowMajor>::type;
  using LayoutC = typename std::conditional<TransC, ColMajor, RowMajor>::type;

  // Element type and layout of A, B and C, followed by the accumulator type.
  // See `cutlass/gemm/device/gemm.h` for the full device API.
  using GemmOp = cutlass::gemm::device::Gemm<T, LayoutA, T, LayoutB, T,
                                             LayoutC, Accumulator>;

  GemmOp gemm_operator;

  // CUTLASS argument objects are built on the host and handed to the kernel
  // by value: problem size, tensor refs for A, B, source C and destination D
  // (here the same memory as C), and the epilogue scalars.
  typename GemmOp::Arguments args({M, N, K}, {A, lda}, {B, ldb}, {C, ldc},
                                  {C, ldc}, {alpha, beta});

  // Launch; no workspace is supplied (nullptr).
  const cutlass::Status status = gemm_operator(args, nullptr, s);

  // Collapse the CUTLASS status into a cudaError_t for the caller.
  return status == cutlass::Status::kSuccess ? cudaSuccess : cudaErrorUnknown;
}
}
// namespace spconv
include/spgemm/gemm_th.h
deleted
100644 → 0
View file @
3517290c
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {

/// Declaration only; writes into `c` using `a` and `b`.
/// NOTE(review): presumably c = a x b via the CUTLASS GEMM -- confirm
/// against the definition.
void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b);

/// Same operation; overload that takes the CUDA stream explicitly.
void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
                    torch::Tensor b);

} // namespace spconv
\ No newline at end of file
include/sphash/hashmap.h
deleted
100644 → 0
View file @
3517290c
#include <tensorview/tensor.h>
namespace spconv {

/// Selects a hash-map implementation at compile time.
enum HashTypes {
  kDenseMap = 0,
  kCUDPPHash = 1
};

/// Primary template: intentionally left undefined, so only the
/// implementations that have a specialisation can be instantiated.
template <int Impl>
struct HashMap;

/// Specialisation for kDenseMap; currently an empty placeholder.
template <>
struct HashMap<kDenseMap> {
};

} // namespace spconv
\ No newline at end of file
include/tensorrt/inference.h
deleted
100644 → 0
View file @
3517290c
#include "NvInfer.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <tensorview/tensor.h>
#include <type_traits>
#include <unordered_map>
#include <vector>
namespace
trt
{
/// Maps a TensorRT data type to the matching tensorview dtype.
///
/// Handles kFLOAT, kHALF, kINT32 and kINT8; every other value raises through
/// TV_THROW_INVALID_ARG("unknown trt dtype").
template <typename T>
tv::DType trt_dtype_to_tv(T trt_dtype) {
  if (trt_dtype == nvinfer1::DataType::kFLOAT) {
    return tv::float32;
  }
  if (trt_dtype == nvinfer1::DataType::kHALF) {
    return tv::float16;
  }
  if (trt_dtype == nvinfer1::DataType::kINT32) {
    return tv::int32;
  }
  if (trt_dtype == nvinfer1::DataType::kINT8) {
    return tv::int8;
  }
  TV_THROW_INVALID_ARG("unknown trt dtype");
}
/// Deleter for TensorRT objects: releases them through their `destroy()`
/// member instead of `delete`. Null pointers are ignored.
struct InferDeleter {
  template <typename T>
  void operator()(T *obj) const {
    if (obj == nullptr) {
      return;
    }
    obj->destroy();
  }
};

/// `std::unique_ptr` that owns a TensorRT object and frees it via
/// InferDeleter.
template <typename T>
using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
/// TensorRT logger that prints to std::cerr.
///
/// Messages whose severity value is greater than `reportableSeverity` are
/// dropped; everything else is written as "<LEVEL>: <msg>" followed by
/// std::endl.
class Logger : public nvinfer1::ILogger {
public:
  /// @param severity  highest severity value that is still printed
  ///                  (defaults to kWARNING).
  Logger(Severity severity = Severity::kWARNING)
      : reportableSeverity(severity) {}

  void log(Severity severity, const char *msg) override {
    // suppress messages with severity enum value greater than the reportable
    if (severity > reportableSeverity) {
      return;
    }
    // Anything that is not one of the four named levels is tagged UNKNOWN.
    const char *prefix = "UNKNOWN: ";
    switch (severity) {
    case Severity::kINTERNAL_ERROR:
      prefix = "INTERNAL_ERROR: ";
      break;
    case Severity::kERROR:
      prefix = "ERROR: ";
      break;
    case Severity::kWARNING:
      prefix = "WARNING: ";
      break;
    case Severity::kINFO:
      prefix = "INFO: ";
      break;
    default:
      break;
    }
    std::cerr << prefix;
    std::cerr << msg << std::endl;
  }

  Severity reportableSeverity;
};
class
InferenceContext
{
public:
explicit
InferenceContext
(
const
std
::
string
&
engine_bin
,
int
device
)
:
logger_
(
nvinfer1
::
ILogger
::
Severity
::
kINFO
),
device_
(
device
)
{
TV_ASSERT_INVALID_ARG
(
device
>=
0
,
"invalid device id"
);
int
deviceCount
;
cudaGetDeviceCount
(
&
deviceCount
);
if
(
device
>=
deviceCount
)
{
TV_THROW_INVALID_ARG
(
"you provide device "
,
device
,
" but you only have "
,
deviceCount
,
" device."
);
}
cudaSetDevice
(
device
);
auto
runtime
=
trt_unique_ptr_t
<
nvinfer1
::
IRuntime
>
(
nvinfer1
::
createInferRuntime
(
logger_
));
engine_
=
trt_unique_ptr_t
<
nvinfer1
::
ICudaEngine
>
(
runtime
->
deserializeCudaEngine
(
engine_bin
.
c_str
(),
engine_bin
.
size
(),
nullptr
));
ctx_
=
trt_unique_ptr_t
<
nvinfer1
::
IExecutionContext
>
(
engine_
->
createExecutionContext
());
max_batch_size_
=
engine_
->
getMaxBatchSize
();
for
(
int
i
=
0
;
i
<
engine_
->
getNbBindings
();
++
i
)
{
auto
dims
=
engine_
->
getBindingDimensions
(
i
);
std
::
vector
<
int
>
shape_vec
(
dims
.
d
,
dims
.
d
+
dims
.
nbDims
);
shape_vec
.
insert
(
shape_vec
.
begin
(),
{
max_batch_size_
});
tv
::
TensorShape
shape
(
shape_vec
);
std
::
string
name
=
engine_
->
getBindingName
(
i
);
auto
trt_dtype
=
engine_
->
getBindingDataType
(
i
);
auto
tv_dtype
=
trt_dtype_to_tv
(
trt_dtype
);
bool
isInput
=
engine_
->
bindingIsInput
(
i
);
name_to_idx_
[
name
]
=
i
;
idx_to_name_
[
i
]
=
name
;
name_to_host_mem_
.
insert
({
name
,
tv
::
Tensor
(
shape
,
tv_dtype
,
-
1
)});
name_to_dev_mem_
.
insert
({
name
,
tv
::
Tensor
(
shape
,
tv_dtype
,
0
)});
if
(
isInput
)
inp_idxes_
.
push_back
(
i
);
else
out_idxes_
.
push_back
(
i
);
bindings_
.
push_back
(
name_to_dev_mem_
[
name
].
raw_data
());
}
checkCudaErrors
(
cudaStreamCreate
(
&
stream_
));
}
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
operator
()(
std
::
vector
<
tv
::
Tensor
>
inputs
)
{
TV_ASSERT_INVALID_ARG
(
inputs
.
size
()
==
inp_idxes_
.
size
(),
"must provide"
,
inp_idxes_
.
size
(),
"inputs, but got"
,
inputs
.
size
());
// inference batch size
int
bs
=
inputs
[
0
].
dim
(
0
);
for
(
auto
&
inp
:
inputs
)
{
TV_ASSERT_INVALID_ARG
(
inp
.
dim
(
0
)
==
bs
,
"batch sizes of all input must same"
);
}
TV_ASSERT_INVALID_ARG
(
bs
<=
max_batch_size_
,
"your batchsize too large"
,
bs
,
max_batch_size_
);
for
(
int
i
=
0
;
i
<
inputs
.
size
();
++
i
)
{
auto
&
dev_mem
=
name_to_dev_mem_
[
idx_to_name_
[
i
]];
auto
shape_inp
=
inputs
[
i
].
shape
().
subshape
(
1
);
auto
shape_dev
=
dev_mem
.
shape
().
subshape
(
1
);
TV_ASSERT_INVALID_ARG
(
shape_inp
==
shape_dev
,
"shape except batch must same"
,
shape_inp
,
shape_dev
);
dev_mem
.
slice_first_axis
(
0
,
bs
).
copy_
(
inputs
[
i
].
slice_first_axis
(
0
,
bs
),
stream_
);
}
ctx_
->
enqueue
(
bs
,
bindings_
.
data
(),
stream_
,
nullptr
);
for
(
int
i
:
out_idxes_
)
{
name_to_host_mem_
[
idx_to_name_
[
i
]].
slice_first_axis
(
0
,
bs
).
copy_
(
name_to_dev_mem_
[
idx_to_name_
[
i
]].
slice_first_axis
(
0
,
bs
),
stream_
);
}
checkCudaErrors
(
cudaStreamSynchronize
(
stream_
));
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
output_map
;
for
(
int
i
=
0
;
i
<
out_idxes_
.
size
();
++
i
)
{
auto
name
=
idx_to_name_
[
out_idxes_
[
i
]];
output_map
[
name
]
=
name_to_host_mem_
[
name
].
slice_first_axis
(
0
,
bs
);
}
return
output_map
;
}
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
operator
()(
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
inputs
)
{
std
::
vector
<
tv
::
Tensor
>
inputs_vec
(
inp_idxes_
.
size
());
int
count
=
0
;
for
(
auto
&
p
:
inputs
)
{
auto
iter
=
name_to_idx_
.
find
(
p
.
first
);
TV_ASSERT_INVALID_ARG
(
iter
!=
name_to_idx_
.
end
(),
"cant find your name"
,
p
.
first
);
inputs_vec
[
name_to_idx_
[
p
.
first
]]
=
p
.
second
;
}
TV_ASSERT_INVALID_ARG
(
count
==
inp_idxes_
.
size
(),
"your inp not enough"
);
return
(
*
this
)(
inputs_vec
);
}
tv
::
Tensor
operator
[](
std
::
string
name
)
{
auto
iter
=
name_to_host_mem_
.
find
(
name
);
if
(
iter
==
name_to_host_mem_
.
end
())
{
TV_THROW_INVALID_ARG
(
name
,
"not found."
);
}
return
iter
->
second
;
}
std
::
string
repr
()
{
std
::
stringstream
ss
;
ss
<<
"InferenceContext[gpu="
<<
device_
<<
"]"
;
ss
<<
"
\n
Inputs:"
;
std
::
string
name
;
for
(
auto
&
i
:
inp_idxes_
)
{
name
=
idx_to_name_
[
i
];
auto
&
mem
=
name_to_host_mem_
[
name
];
ss
<<
"
\n
"
<<
name
<<
"["
<<
tv
::
detail
::
typeString
(
mem
.
dtype
())
<<
"]: "
<<
mem
.
shape
();
}
ss
<<
"
\n
Outputs:"
;
for
(
auto
&
i
:
out_idxes_
)
{
name
=
idx_to_name_
[
i
];
auto
&
mem
=
name_to_host_mem_
[
name
];
ss
<<
"
\n
"
<<
name
<<
"["
<<
tv
::
detail
::
typeString
(
mem
.
dtype
())
<<
"]: "
<<
mem
.
shape
();
}
return
ss
.
str
();
}
private:
Logger
logger_
;
trt_unique_ptr_t
<
nvinfer1
::
ICudaEngine
>
engine_
;
trt_unique_ptr_t
<
nvinfer1
::
IExecutionContext
>
ctx_
;
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
name_to_dev_mem_
;
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
name_to_host_mem_
;
std
::
unordered_map
<
std
::
string
,
int
>
name_to_idx_
;
std
::
unordered_map
<
int
,
std
::
string
>
idx_to_name_
;
std
::
vector
<
int
>
inp_idxes_
;
std
::
vector
<
int
>
out_idxes_
;
std
::
vector
<
void
*>
bindings_
;
cudaStream_t
stream_
;
int
max_batch_size_
;
int
device_
;
};
}
// namespace trt
include/tensorview/cc17.h
deleted
100644 → 0
View file @
3517290c
/*
From PyTorch:
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
Samy Bengio, Johnny Mariethoz)
From Caffe2:
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further
mark their specific copyright on a particular contribution, they should
indicate their copyright solely in the commit message of the change when it is
committed.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
America and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <type_traits>
#include <utility>
namespace tv {

// void_t<Ts...>: maps any list of well-formed types to `void`; used below for
// SFINAE detection. Both branches are variadic so that tv::void_t<A, B> is
// valid whichever one is selected.
#ifdef __cpp_lib_void_t
template <class... Ts> using void_t = std::void_t<Ts...>;
#else
// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
// (it takes CWG1558 into account and also works for older compilers)
template <typename... Ts> struct make_void { typedef void type; };
template <typename... Ts> using void_t = typename make_void<Ts...>::type;
#endif

namespace detail {

// Identity functor handed to callbacks that accept one argument. Routing an
// expression through it makes the expression dependent, which delays its
// type checking until the callback is actually instantiated.
struct _identity final {
  template <class T> using type_identity = T;

  // Perfectly forwards `arg` back to the caller.
  template <class T> decltype(auto) operator()(T &&arg) {
    return std::forward<T>(arg);
  }
};

// Trait: true_type when `Func` can be called with a single `_identity`
// argument, false_type otherwise.
template <class Func, class Enable = void>
struct function_takes_identity_argument : std::false_type {};
#if defined(_MSC_VER)
// For some weird reason, MSVC shows a compiler error when using tv::void_t
// instead of std::void_t. But we're only building on MSVC versions that have
// std::void_t, so let's just use that one.
template <class Func>
struct function_takes_identity_argument<
    Func, std::void_t<decltype(std::declval<Func>()(_identity()))>>
    : std::true_type {};
#else
template <class Func>
struct function_takes_identity_argument<
    Func, void_t<decltype(std::declval<Func>()(_identity()))>>
    : std::true_type {};
#endif

// C++14 dispatcher: the specialization chosen by `Condition` invokes exactly
// one of the two callbacks and ignores the other.
template <bool Condition> struct _if_constexpr;

template <> struct _if_constexpr<true> final {
  // Then-callback accepts the identity argument.
  template <class ThenCallback, class ElseCallback,
            std::enable_if_t<
                function_takes_identity_argument<ThenCallback>::value,
                void *> = nullptr>
  static decltype(auto) call(ThenCallback &&thenCallback,
                             ElseCallback && /* elseCallback */) {
    // The _identity instance passed in can be used to delay evaluation of an
    // expression, because the compiler can't know that it's just the identity
    // we're passing in.
    return thenCallback(_identity());
  }

  // Then-callback takes no arguments.
  template <class ThenCallback, class ElseCallback,
            std::enable_if_t<
                !function_takes_identity_argument<ThenCallback>::value,
                void *> = nullptr>
  static decltype(auto) call(ThenCallback &&thenCallback,
                             ElseCallback && /* elseCallback */) {
    return thenCallback();
  }
};

template <> struct _if_constexpr<false> final {
  // Else-callback accepts the identity argument.
  template <class ThenCallback, class ElseCallback,
            std::enable_if_t<
                function_takes_identity_argument<ElseCallback>::value,
                void *> = nullptr>
  static decltype(auto) call(ThenCallback && /* thenCallback */,
                             ElseCallback &&elseCallback) {
    // The _identity instance passed in can be used to delay evaluation of an
    // expression, because the compiler can't know that it's just the identity
    // we're passing in.
    return elseCallback(_identity());
  }

  // Else-callback takes no arguments.
  template <class ThenCallback, class ElseCallback,
            std::enable_if_t<
                !function_takes_identity_argument<ElseCallback>::value,
                void *> = nullptr>
  static decltype(auto) call(ThenCallback && /* thenCallback */,
                             ElseCallback &&elseCallback) {
    return elseCallback();
  }
};

} // namespace detail

/*
 * Get something like C++17 if constexpr in C++14.
 *
 * Example 1: simple constexpr if/then/else
 *   template<int arg> int increment_absolute_value() {
 *     int result = arg;
 *     if_constexpr<(arg > 0)>(
 *       [&] { ++result; },  // then-case
 *       [&] { --result; }   // else-case
 *     );
 *     return result;
 *   }
 *
 * Example 2: without else case (i.e. conditionally prune code from assembly)
 *   template<int arg> int decrement_if_positive() {
 *     int result = arg;
 *     if_constexpr<(arg > 0)>(
 *       // This decrement operation is only present in the assembly for
 *       // template instances with arg > 0.
 *       [&] { --result; }
 *     );
 *     return result;
 *   }
 *
 * Example 3: branch based on type (i.e. replacement for SFINAE)
 *   struct MyClass1 {int value;};
 *   struct MyClass2 {int val;};
 *   template <class T>
 *   int func(T t) {
 *     return if_constexpr<std::is_same<T, MyClass1>::value>(
 *       // this code is invalid for T == MyClass2, so a regular
 *       // non-constexpr if statement wouldn't compile
 *       [&](auto _) { return _(t).value; },
 *       // this code is invalid for T == MyClass1
 *       [&](auto _) { return _(t).val; }
 *     );
 *   }
 *
 * Note: The _ argument passed in Example 3 is the identity function, i.e. it
 * does nothing. It is used to force the compiler to delay type checking,
 * because the compiler doesn't know what kind of _ is passed in. Without it,
 * the compiler would fail when you try to access t.value but the member doesn't
 * exist.
 *
 * Note: In Example 3, both branches return int, so func() returns int. This is
 * not necessary. If func() had a return type of "auto", then both branches
 * could return different types, say func<MyClass1>() could return int and
 * func<MyClass2>() could return string.
 */
template <bool Condition, class ThenCallback, class ElseCallback>
decltype(auto) if_constexpr(ThenCallback &&thenCallback,
                            ElseCallback &&elseCallback) {
#if defined(__cpp_if_constexpr)
  // If we have C++17, just use its "if constexpr" feature instead of wrapping
  // it. This will give us better error messages.
  // The trait is only queried inside the selected branch, so the callback of
  // the discarded branch is never examined.
  if constexpr (Condition) {
    if constexpr (detail::function_takes_identity_argument<
                      ThenCallback>::value) {
      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
    } else {
      return std::forward<ThenCallback>(thenCallback)();
    }
  } else {
    if constexpr (detail::function_takes_identity_argument<
                      ElseCallback>::value) {
      return std::forward<ElseCallback>(elseCallback)(detail::_identity());
    } else {
      return std::forward<ElseCallback>(elseCallback)();
    }
  }
#else
  // C++14 implementation of if constexpr
  return detail::_if_constexpr<Condition>::call(
      std::forward<ThenCallback>(thenCallback),
      std::forward<ElseCallback>(elseCallback));
#endif
}

// Overload without an else-case: when `Condition` is false nothing is called.
template <bool Condition, class ThenCallback>
decltype(auto) if_constexpr(ThenCallback &&thenCallback) {
#if defined(__cpp_if_constexpr)
  // If we have C++17, just use its "if constexpr" feature instead of wrapping
  // it. This will give us better error messages.
  if constexpr (Condition) {
    if constexpr (detail::function_takes_identity_argument<
                      ThenCallback>::value) {
      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
    } else {
      return std::forward<ThenCallback>(thenCallback)();
    }
  }
#else
  // C++14 implementation of if constexpr
  return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback),
                                 [](auto) {});
#endif
}

} // namespace tv
include/tensorview/common.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <sstream>
#ifdef TV_USE_STACKTRACE
#if defined(WIN32) || defined(_WIN32) || \
defined(__WIN32) && !defined(__CYGWIN__)
#define BOOST_STACKTRACE_USE_WINDBG
#else
// require linking with -ldl and -lbacktrace in linux
#define BOOST_STACKTRACE_USE_BACKTRACE
#endif
#include <boost/stacktrace.hpp>
#endif
namespace tv {

// Terminal overload: streams the last value, with no trailing separator.
template <class SStream, class T> void sstream_print(SStream &ss, T val) {
  ss << val;
}

// Recursive overload: streams `val`, a single space, then the remaining
// arguments.
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
  ss << val;
  ss << " ";
  sstream_print(ss, args...);
}

// Formats all arguments space-separated into a buffer and writes the result
// to std::cout followed by std::endl.
template <class... TArgs> void ssprint(TArgs... args) {
  std::stringstream buffer;
  sstream_print(buffer, args...);
  std::cout << buffer.str() << std::endl;
}

// Appends a boost stacktrace to `ss` when TV_USE_STACKTRACE is defined;
// expands to nothing otherwise.
#ifdef TV_USE_STACKTRACE
#define TV_BACKTRACE_PRINT(ss)                                                 \
  ss << std::endl << boost::stacktrace::stacktrace();
#else
#define TV_BACKTRACE_PRINT(ss)
#endif

// Throws std::runtime_error whose message is "<file> <line>\n" followed by
// the space-separated arguments.
#define TV_THROW_RT_ERR(...)                                                   \
  {                                                                            \
    std::stringstream __macro_s;                                               \
    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
    TV_BACKTRACE_PRINT(__macro_s);                                             \
    throw std::runtime_error(__macro_s.str());                                 \
  }

// Same as TV_THROW_RT_ERR but throws std::invalid_argument.
#define TV_THROW_INVALID_ARG(...)                                              \
  {                                                                            \
    std::stringstream __macro_s;                                               \
    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
    TV_BACKTRACE_PRINT(__macro_s);                                             \
    throw std::invalid_argument(__macro_s.str());                              \
  }

// Throws std::runtime_error when `expr` is false; the message contains the
// source location, the stringified expression and the extra arguments.
#define TV_ASSERT_RT_ERR(expr, ...)                                            \
  {                                                                            \
    if (!(expr)) {                                                             \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
      __macro_s << #expr << " assert faild. ";                                 \
      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
      TV_BACKTRACE_PRINT(__macro_s);                                           \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }

// Same as TV_ASSERT_RT_ERR but throws std::invalid_argument.
#define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
  {                                                                            \
    if (!(expr)) {                                                             \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
      __macro_s << #expr << " assert faild. ";                                 \
      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
      TV_BACKTRACE_PRINT(__macro_s);                                           \
      throw std::invalid_argument(__macro_s.str());                            \
    }                                                                          \
  }

} // namespace tv
\ No newline at end of file
include/tensorview/cuda_utils.h
deleted
100644 → 0
View file @
3517290c
#pragma once
// from pytorch.aten
#include "tensorview.h"
#include <type_traits>
namespace
tv
{
namespace
cuda
{
// Integer ceiling division: (a + b - 1) / b, evaluated in the promoted common
// type of a and b and then converted to int.
template <typename T1, typename T2> inline int DivUp(const T1 a, const T2 b) {
  const auto padded = a + b - 1;
  return padded / b;
}

// Use 1024 threads per block, which requires cuda sm_2x or above
constexpr int CUDA_NUM_THREADS = 1024;

// Threads per block for N work items: N rounded up to a multiple of 32,
// capped at CUDA_NUM_THREADS.
inline int getNumThreads(const int N) {
  return N > CUDA_NUM_THREADS ? CUDA_NUM_THREADS : DivUp(N, 32) * 32;
}
// Number of blocks needed to cover N work items when each block runs
// getNumThreads(N) threads. Throws std::runtime_error (via TV_ASSERT_RT_ERR)
// unless N is positive.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=", N);
  const int threads_per_block = getNumThreads(N);
  return DivUp(N, threads_per_block);
}
}
// namespace cuda
}
// namespace tv
\ No newline at end of file
include/tensorview/eigen_utils.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensor.h"
#include "tensorview.h"
#include <eigen3/Eigen/Dense>
namespace
tv
{
/// Wrap a 1-D or 2-D TensorView in a row-major Eigen::Map without copying.
///
/// A 2-D view of shape (r, c) maps to an r x c matrix. A 1-D view of length n
/// maps to a 1 x n matrix; previously this case read `view.dim(1)`, which
/// does not exist for a 1-D view.
///
/// @tparam T    element type.
/// @tparam Row  compile-time row count, or Eigen::Dynamic.
/// @tparam Col  compile-time column count, or Eigen::Dynamic.
/// @param view  view whose data pointer and shape are used; the map refers to
///              the same memory, so that memory must outlive the map.
/// @throws std::invalid_argument if the view is not 1-D/2-D or if its shape
///         does not match a fixed Row/Col.
template <typename T, int Row = Eigen::Dynamic, int Col = Eigen::Dynamic>
Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>
tv2eigen(TensorView<T> view) {
  TV_ASSERT_INVALID_ARG(view.ndim() <= 2 && view.ndim() > 0, "error");
  // Derive the matrix extents once so that the static-size checks and the map
  // construction agree for both supported ranks.
  const bool is_matrix = view.ndim() == 2;
  const auto rows = is_matrix ? view.dim(0) : 1;
  const auto cols = is_matrix ? view.dim(1) : view.dim(0);
  if (Row != Eigen::Dynamic) {
    TV_ASSERT_INVALID_ARG(rows == Row, "error");
  }
  if (Col != Eigen::Dynamic) {
    TV_ASSERT_INVALID_ARG(cols == Col, "error");
  }
  return Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>(view.data(),
                                                                rows, cols);
}
}
// namespace tv
include/tensorview/kernel_utils.h
deleted
100644 → 0
View file @
3517290c
#pragma once
// from tensorflow
namespace
tv
{
namespace
detail
{
// Range object for range-based for loops in device code: iterating it yields
// begin, begin + delta, begin + 2 * delta, ... while the value is below end.
template <typename T> class KernelLoop {
  struct Iterator {
    __forceinline__ __device__ Iterator(T index, T delta)
        : index_(index), delta_(delta) {}

    __forceinline__ __device__ T operator*() const { return index_; }

    __forceinline__ __device__ Iterator &operator++() {
      index_ += delta_;
      return *this;
    }

    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'index_ < end'.
      if (!other.delta_) {
        return index_ < other.index_;
      }
      if (!delta_) {
        return index_ > other.index_;
      }
      return index_ < other.index_ || index_ > other.index_;
    }

  private:
    T index_;
    const T delta_;
  };

public:
  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  // Iterator positioned at the first index, advancing by delta_.
  __forceinline__ __device__ Iterator begin() const {
    return Iterator{begin_, delta_};
  }

  // End sentinel: carries a zero delta so that operator!= treats every
  // position at or past end_ as "equal to end".
  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }

private:
  T begin_;
  T delta_;
  T end_;
};
}
// namespace detail
// Helper to visit indices below count using the x-coordinate: starts at this
// thread's global x index and advances by gridDim.x * blockDim.x * NumILP.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
  const auto first = blockIdx.x * blockDim.x + threadIdx.x;
  const auto stride = gridDim.x * blockDim.x * NumILP;
  return detail::KernelLoop<T>(first, stride, count);
}
// Helper to visit indices below count using the y-coordinate: starts at this
// thread's global y index and advances by gridDim.y * blockDim.y * NumILP.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
  const auto first = blockIdx.y * blockDim.y + threadIdx.y;
  const auto stride = gridDim.y * blockDim.y * NumILP;
  return detail::KernelLoop<T>(first, stride, count);
}
// Helper to visit indices below count using the z-coordinate: starts at this
// thread's global z index and advances by gridDim.z * blockDim.z * NumILP.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
  const auto first = blockIdx.z * blockDim.z + threadIdx.z;
  const auto stride = gridDim.z * blockDim.z * NumILP;
  return detail::KernelLoop<T>(first, stride, count);
}
}
// namespace tv
\ No newline at end of file
include/tensorview/mp_helper.h
deleted
100644 → 0
View file @
3517290c
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>
namespace tv {

// Compile-time list of types.
template <class... T> struct mp_list {};

// List of std::integral_constant<T, I> for each value I.
template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;

// Shorthand for mp_list_c<int, I...>.
template <int... I>
using mp_list_int_c = mp_list<std::integral_constant<int, I>...>;

namespace detail {

// Calls f(Ts()) for every type in the list, in order, then returns f.
// The call result is cast to void so that a result type with an overloaded
// comma operator cannot hijack the `(..., 0)` expansion.
template <class... Ts, class F>
constexpr F mp_for_each_impl(mp_list<Ts...>, F &&f) {
  return (void)(std::initializer_list<int>{((void)f(Ts()), 0)...}),
         std::forward<F>(f);
}

// Empty list: nothing to call.
template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
  return std::forward<F>(f);
}

} // namespace detail

// Number of types in a parameter pack, as std::integral_constant<size_t, N>.
template <class... T>
using mp_length = std::integral_constant<std::size_t, sizeof...(T)>;

namespace detail {

template <class A, template <class...> class B> struct mp_rename_impl {
  // An error "no type named 'type'" here means that the first argument to
  // mp_rename is not a list
};

template <template <class...> class A, class... T, template <class...> class B>
struct mp_rename_impl<A<T...>, B> {
  using type = B<T...>;
};

} // namespace detail

// Replaces the list template of A with B: mp_rename<L<T...>, B> is B<T...>.
template <class A, template <class...> class B>
using mp_rename = typename detail::mp_rename_impl<A, B>::type;

// Number of elements of list L.
template <class L> using mp_size = mp_rename<L, mp_length>;

// Calls f with a value-initialized object of every element type of L, in
// order, and returns f.
template <class L, class F> constexpr F mp_for_each(F &&f) {
  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
}

} // namespace tv
#endif
\ No newline at end of file
include/tensorview/prettyprint.h
deleted
100644 → 0
View file @
3517290c
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace
pretty_print
{
namespace detail {

// Shared return types for the sizeof-based SFINAE probes below: `yes` and
// `no` have different sizes, so sizeof tells which overload was selected.
struct sfinae_base {
  using yes = char;
  using no = yes[2];
};

// SFINAE type trait to detect whether T::const_iterator exists.
template <typename T> struct has_const_iterator : private sfinae_base {
private:
  template <typename C> static yes &test(typename C::const_iterator *);
  template <typename C> static no &test(...);

public:
  static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
  using type = T;
};

// Detects const member functions begin() and end() that return
// T::const_iterator.
template <typename T> struct has_begin_end : private sfinae_base {
private:
  // Selected only when &C::begin converts to
  // `const_iterator (C::*)() const`.
  template <typename C>
  static yes &probe_begin(
      typename std::enable_if<
          std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
                                                const>(&C::begin)),
                       typename C::const_iterator (C::*)() const>::value>::type
          *);
  template <typename C> static no &probe_begin(...);

  // Same probe for &C::end.
  template <typename C>
  static yes &probe_end(
      typename std::enable_if<
          std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
                                                const>(&C::end)),
                       typename C::const_iterator (C::*)() const>::value,
          void>::type *);
  template <typename C> static no &probe_end(...);

public:
  static bool const beg_value = sizeof(probe_begin<T>(nullptr)) == sizeof(yes);
  static bool const end_value = sizeof(probe_end<T>(nullptr)) == sizeof(yes);
};

} // namespace detail
// Holds the delimiter values for a specific character type: the text printed
// before the elements, between two elements, and after the elements. A null
// pointer means "print nothing" for that position.
template <typename TChar> struct delimiters_values {
  using char_type = TChar;
  const char_type *prefix;
  const char_type *delimiter;
  const char_type *postfix;
};

// Defines the delimiter values for a specific container and character type.
// The primary template only declares `values`; definitions are supplied by
// the specializations.
template <typename T, typename TChar> struct delimiters {
  using type = delimiters_values<TChar>;
  static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
          typename TCharTraits = ::std::char_traits<TChar>,
          typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
  using delimiters_type = TDelimiters;
  using ostream_type = std::basic_ostream<TChar, TCharTraits>;

  // Default element printer: streams every element of `c`, writing the
  // delimiter (when non-null) between consecutive elements.
  template <typename U> struct printer {
    static void print_body(const U &c, ostream_type &stream) {
      using std::begin;
      using std::end;

      auto it = begin(c);
      const auto the_end = end(c);
      if (it == the_end) {
        return;
      }

      stream << *it;
      for (++it; it != the_end; ++it) {
        if (delimiters_type::values.delimiter != nullptr)
          stream << delimiters_type::values.delimiter;
        stream << *it;
      }
    }
  };

  // Keeps a reference to `container`; it must outlive this helper.
  print_container_helper(const T &container) : container_(container) {}

  // Writes prefix, body and postfix to `stream`; null prefix/postfix are
  // skipped.
  inline void operator()(ostream_type &stream) const {
    if (delimiters_type::values.prefix != nullptr)
      stream << delimiters_type::values.prefix;

    printer<T>::print_body(container_, stream);

    if (delimiters_type::values.postfix != nullptr)
      stream << delimiters_type::values.postfix;
  }

private:
  const T &container_;
};
// Specialization for pairs: prints first, the delimiter (when non-null), then
// second.
template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
                              TDelimiters>::printer<std::pair<T1, T2>> {
  using ostream_type =
      typename print_container_helper<T, TChar, TCharTraits,
                                      TDelimiters>::ostream_type;

  static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
    using delims =
        typename print_container_helper<T, TChar, TCharTraits,
                                        TDelimiters>::delimiters_type;

    stream << c.first;
    if (delims::values.delimiter != nullptr)
      stream << delims::values.delimiter;
    stream << c.second;
  }
};
// Specialization for tuples: prints the elements in order, with the delimiter
// (when non-null) before every element except the first.
template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
                              TDelimiters>::printer<std::tuple<Args...>> {
  using ostream_type =
      typename print_container_helper<T, TChar, TCharTraits,
                                      TDelimiters>::ostream_type;
  using element_type = std::tuple<Args...>;

  // Tag type carrying the index of the next element to print.
  template <std::size_t I> struct Int {};

  static void print_body(const element_type &c, ostream_type &stream) {
    tuple_print(c, stream, Int<0>{});
  }

  // Past the last element: recursion stops here.
  static void tuple_print(const element_type &, ostream_type &,
                          Int<sizeof...(Args)>) {}

  // First element: no leading delimiter. For an empty tuple the tag parameter
  // becomes std::nullptr_t, so this overload never matches Int<0>.
  static void
  tuple_print(const element_type &c, ostream_type &stream,
              typename std::conditional<sizeof...(Args) != 0, Int<0>,
                                        std::nullptr_t>::type) {
    stream << std::get<0>(c);
    tuple_print(c, stream, Int<1>{});
  }

  // Every later element: delimiter first, then the element.
  template <std::size_t N>
  static void tuple_print(const element_type &c, ostream_type &stream,
                          Int<N>) {
    using delims =
        typename print_container_helper<T, TChar, TCharTraits,
                                        TDelimiters>::delimiters_type;

    if (delims::values.delimiter != nullptr)
      stream << delims::values.delimiter;
    stream << std::get<N>(c);
    tuple_print(c, stream, Int<N + 1>{});
  }
};
// Prints a print_container_helper to the specified stream by invoking the
// helper on it; returns the stream to allow chaining.
template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
    std::basic_ostream<TChar, TCharTraits> &stream,
    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
  helper(stream);
  return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types. By default a type counts as a container when it has
// a const_iterator member type and const begin()/end() returning it.
template <typename T>
struct is_container
    : public std::integral_constant<bool,
                                    detail::has_const_iterator<T>::value &&
                                        detail::has_begin_end<T>::beg_value &&
                                        detail::has_begin_end<T>::end_value> {
};

// Built-in arrays are containers ...
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};

// ... except arrays of char.
template <std::size_t N> struct is_container<char[N]> : std::false_type {};

template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};

template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};

template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters: "[a, b, c]" for narrow and wide streams.
template <typename T> struct delimiters<T, char> {
  static const delimiters_values<char> values;
};

template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};

template <typename T> struct delimiters<T, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
                                                                   L"]"};
// Delimiters for (multi)set and unordered_(multi)set: "{a, b, c}".

// std::set, narrow characters.
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::set<T, TComp, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
    delimiters< ::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
                                                                   "}"};

// std::set, wide characters.
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::set<T, TComp, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters< ::std::set<T, TComp, TAllocator>, wchar_t>::values = {
        L"{", L", ", L"}"};

// std::multiset, narrow characters.
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::multiset<T, TComp, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
    delimiters< ::std::multiset<T, TComp, TAllocator>, char>::values = {
        "{", ", ", "}"};

// std::multiset, wide characters.
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::multiset<T, TComp, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters< ::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
        L"{", L", ", L"}"};
// Brace delimiters "{a, b, c}" for the unordered set containers.

// std::unordered_set, narrow characters.
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char>
    delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>,
                char>::values = {"{", ", ", "}"};

// std::unordered_set, wide characters.
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>,
                   wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>,
                wchar_t>::values = {L"{", L", ", L"}"};

// std::unordered_multiset, narrow characters.
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                   char> {
  static const delimiters_values<char> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char>
    delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                char>::values = {"{", ", ", "}"};

// std::unordered_multiset, wide characters.
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                   wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple: "(a, b)".

// std::pair, narrow characters.
template <typename T1, typename T2>
struct delimiters< ::std::pair<T1, T2>, char> {
  static const delimiters_values<char> values;
};

template <typename T1, typename T2>
const delimiters_values<char> delimiters< ::std::pair<T1, T2>, char>::values =
    {"(", ", ", ")"};

// std::pair, wide characters.
template <typename T1, typename T2>
struct delimiters< ::std::pair<T1, T2>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T1, typename T2>
const delimiters_values<wchar_t>
    delimiters< ::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};

// std::tuple, narrow characters.
template <typename... Args> struct delimiters< ::std::tuple<Args...>, char> {
  static const delimiters_values<char> values;
};

template <typename... Args>
const delimiters_values<char> delimiters< ::std::tuple<Args...>, char>::values =
    {"(", ", ", ")"};

// std::tuple, wide characters.
template <typename... Args>
struct delimiters< ::std::tuple<Args...>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename... Args>
const delimiters_values<wchar_t>
    delimiters< ::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
//
// Abstract interface: one stream() overload per supported stream type.
struct custom_delims_base {
  virtual ~custom_delims_base() = default;
  virtual std::ostream &stream(::std::ostream &) = 0;
  virtual std::wostream &stream(::std::wostream &) = 0;
};
// Concrete custom_delims_base for a container of type T printed with the
// delimiter set `Delims`. Holds a reference to the container, which must
// therefore outlive the wrapper.
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
  custom_delims_wrapper(const T &t_) : t(t_) {}

  // Both overloads are marked `override` so that any signature drift from
  // custom_delims_base becomes a compile error instead of a silent overload.

  // Prints the container to a narrow stream using Delims.
  std::ostream &stream(std::ostream &s) override {
    return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
               t);
  }

  // Prints the container to a wide stream using Delims.
  std::wostream &stream(std::wostream &s) override {
    return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
                                       Delims>(t);
  }

private:
  const T &t;
};
template
<
typename
Delims
>
struct
custom_delims
{
template
<
typename
Container
>
custom_delims
(
const
Container
&
c
)
:
base
(
new
custom_delims_wrapper
<
Container
,
Delims
>
(
c
))
{}
std
::
unique_ptr
<
custom_delims_base
>
base
;
};
// Stream insertion for custom_delims: forwards to the type-erased wrapper's
// `stream` overload matching the stream type and returns its result.
template <class TChar, class TCharTraits, class Delims>
inline auto operator<<(std::basic_ostream<TChar, TCharTraits> &s,
                       const custom_delims<Delims> &p)
    -> std::basic_ostream<TChar, TCharTraits> & {
  return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
// Non-owning iterable view over `n` consecutive elements starting at `a`.
// Exposes the minimal container interface (value_type, const_iterator,
// begin/end) that the pretty printer relies on.
template <class T>
struct array_wrapper_n {
  using const_iterator = const T *;
  using value_type = T;

  array_wrapper_n(const T *const a, size_t n) : first_(a), count_(n) {}

  // Pointer to the first element.
  inline const_iterator begin() const { return first_; }

  // Pointer one past the last element.
  inline const_iterator end() const { return first_ + count_; }

private:
  const T *const first_;
  size_t count_;
};
// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints the
// bucket with index 4, i.e. the fifth bucket, of container m.)
// Iterable view over a single bucket of an unordered (hash-based) container.
// begin()/end() return the container's local iterators for bucket `n`. Only a
// reference to the container is stored, so it must outlive the wrapper.
template <class T>
struct bucket_print_wrapper {
  using const_iterator = typename T::const_local_iterator;
  using size_type = typename T::size_type;

  bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}

  // First element of the selected bucket.
  const_iterator begin() const { return m_map.cbegin(n); }

  // Past-the-end iterator of the selected bucket.
  const_iterator end() const { return m_map.cend(n); }

private:
  const T &m_map;
  const size_type n;
};
}
// namespace pretty_print
// Global accessor functions for the convenience wrappers
// Convenience factory: wraps the `n` elements starting at `a` so they can be
// streamed like a container, e.g. `std::cout << pretty_print_array(arr, n)`.
template <class T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
                                                           size_t n) {
  typedef pretty_print::array_wrapper_n<T> wrapper_t;
  return wrapper_t(a, n);
}
// Convenience factory: returns a printable view of bucket `n` of the
// hash-based container `m`. `m` is held by reference inside the result.
template <class T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
                                                   typename T::size_type n) {
  typedef pretty_print::bucket_print_wrapper<T> wrapper_t;
  return wrapper_t(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Generic stream insertion for any type that pretty_print::is_container
// recognises. The overload is disabled (SFINAE) for all other types. The
// container is printed with the default delimiters for its type.
template <class T, class TChar, class TCharTraits>
inline typename enable_if< ::pretty_print::is_container<T>::value,
                          basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
  typedef ::pretty_print::print_container_helper<T, TChar, TCharTraits>
      helper_t;
  return stream << helper_t(container);
}
}
// namespace std
#endif // H_PRETTY_PRINT
include/tensorview/pybind_utils.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensor.h"
#include "tensorview.h"
#include <algorithm>
#include <array>
#include <iostream>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
namespace
py
=
pybind11
;
namespace
tv
{
template
<
typename
Tarr
>
bool
is_c_style
(
const
Tarr
&
arr
)
{
return
bool
(
arr
.
flags
()
&
py
::
array
::
c_style
);
}
// Wraps a C-contiguous py::array_t<T> in a mutable TensorView without copying.
// The view points at the array's own buffer (mutable_data()).
// When Rank >= 0 the array must have exactly Rank dimensions.
// Raises an invalid-argument error (via TV_ASSERT_INVALID_ARG) when the array
// is not C-contiguous or the rank does not match.
template <typename T, int Rank = -1>
TensorView<T, Rank> arrayt2tv(py::array_t<T> arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  Shape dims;
  const auto num_axes = arr.ndim();
  for (int axis = 0; axis < num_axes; ++axis) {
    dims.push_back(arr.shape(axis));
  }
  if (Rank >= 0) {
    TV_ASSERT_INVALID_ARG(dims.ndim() == Rank, "error");
  }
  T *buffer = arr.mutable_data();
  return TensorView<T, Rank>(buffer, dims);
}
// Wraps a C-contiguous py::array_t<T> in a read-only TensorView without
// copying. The view points at the array's own buffer (data()).
// When Rank >= 0 the array must have exactly Rank dimensions.
// Raises an invalid-argument error (via TV_ASSERT_INVALID_ARG) when the array
// is not C-contiguous or the rank does not match.
//
// The declared return type now carries Rank, matching both the object that is
// actually constructed below and the mutable sibling arrayt2tv. Previously the
// declaration was TensorView<const T>, which is the same type only for the
// default Rank.
template <typename T, int Rank = -1>
TensorView<const T, Rank> carrayt2tv(py::array_t<T> arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  Shape shape;
  for (int i = 0; i < arr.ndim(); ++i) {
    shape.push_back(arr.shape(i));
  }
  if (Rank >= 0) {
    TV_ASSERT_INVALID_ARG(shape.ndim() == Rank, "error");
  }
  return TensorView<const T, Rank>(arr.data(), shape);
}
// Maps the dtype of a numpy-like array to the matching tv::DType, using the
// dtype kind character ('b' bool, 'i' signed, 'u' unsigned, 'f' float) and the
// item size in bytes.
// Throws (via TV_THROW_RT_ERR) for any kind/size combination not listed.
//
// Each kind now ends with an explicit `break`. Before, an unmatched item size
// fell through from 'i' into 'u' and from 'u' into 'f'; that only happened to
// be harmless because the later size lists are subsets of the earlier ones.
template <typename Tarr> tv::DType get_array_tv_dtype(const Tarr &arr) {
  switch (arr.dtype().kind()) {
  case 'b':
    return tv::bool_;
  case 'i': {
    switch (arr.itemsize()) {
    case 1:
      return tv::int8;
    case 2:
      return tv::int16;
    case 4:
      return tv::int32;
    case 8:
      return tv::int64;
    default:
      break;
    }
    break; // unsupported signed width: report it, do not try other kinds
  }
  case 'u': {
    switch (arr.itemsize()) {
    case 1:
      return tv::uint8;
    case 2:
      return tv::uint16;
    case 4:
      return tv::uint32;
    case 8:
      return tv::uint64;
    default:
      break;
    }
    break; // unsupported unsigned width
  }
  case 'f': {
    switch (arr.itemsize()) {
    case 2:
      return tv::float16;
    case 4:
      return tv::float32;
    case 8:
      return tv::float64;
    default:
      break;
    }
    break; // unsupported floating-point width
  }
  default:
    break;
  }
  TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
}
// Creates a tv::Tensor that aliases the buffer of a C-contiguous numpy-like
// array (no copy; built with tv::from_blob on mutable_data()). The dtype is
// derived at run time from the array and the device is -1.
// Raises an invalid-argument error if the array is not C-contiguous.
template <typename Tarr> Tensor array2tensor(Tarr &arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  TensorShape dims;
  for (int axis = 0; axis < arr.ndim(); ++axis) {
    dims.push_back(arr.shape(axis));
  }
  const auto element_dtype = get_array_tv_dtype(arr);
  return tv::from_blob(arr.mutable_data(), dims, element_dtype, -1);
}
// Typed variant of array2tensor: aliases the buffer of a C-contiguous
// py::array_t<T> as a tv::Tensor (no copy). The dtype comes from the static
// element type T (tv::type_v<T>) and the device is -1.
// Raises an invalid-argument error if the array is not C-contiguous.
template <typename T> Tensor arrayt2tensor(py::array_t<T> &arr) {
  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
  TensorShape dims;
  for (int axis = 0; axis < arr.ndim(); ++axis) {
    dims.push_back(arr.shape(axis));
  }
  return tv::from_blob(arr.mutable_data(), dims, tv::type_v<T>, -1);
}
// Converts a tv dtype enumerator into the corresponding py::dtype, looked up
// by its numpy type name.
// Raises an invalid-argument error for enumerators without a mapping.
template <typename TDType> py::dtype tv_dtype_to_py(TDType d) {
  const char *numpy_name = nullptr;
  switch (d) {
  case float32:
    numpy_name = "float32";
    break;
  case float64:
    numpy_name = "float64";
    break;
  case float16:
    numpy_name = "float16";
    break;
  case int32:
    numpy_name = "int32";
    break;
  case int16:
    numpy_name = "int16";
    break;
  case int8:
    numpy_name = "int8";
    break;
  case int64:
    numpy_name = "int64";
    break;
  case uint32:
    numpy_name = "uint32";
    break;
  case uint16:
    numpy_name = "uint16";
    break;
  case uint8:
    numpy_name = "uint8";
    break;
  case uint64:
    numpy_name = "uint64";
    break;
  case bool_:
    numpy_name = "bool_";
    break;
  default:;
  }
  if (numpy_name != nullptr) {
    return py::dtype(numpy_name);
  }
  TV_THROW_INVALID_ARG("unknown dtype", d);
}
// add template to define function in header
// Copies a CPU tensor into a freshly constructed py::array with the same
// shape and dtype. It is a template only so it can be defined in a header.
// Raises an invalid-argument error when the tensor is not on the CPU
// (device() != -1).
template <typename Ttensor> py::array tensor2array(Ttensor &tensor) {
  // you cant call this function during GIL released.
  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
  auto shape = tensor.shape();
  std::vector<int> shape_vec(shape.begin(), shape.end());
  auto dtype = tv_dtype_to_py(tensor.dtype());
  // This function only reads the tensor, so take the pointer through a const
  // reference. For tv::Tensor the non-const raw_data() runs writable_check()
  // first, which presumably rejects read-only tensors — confirm against
  // writable_check(); the const overload has no such check.
  const Ttensor &readonly = tensor;
  // construct py::array will copy content from ptr.
  // its expected because we can't transfer ownership from
  // c++ tv::Tensor to numpy array when c++ object is deleted.
  return py::array(dtype, shape_vec, {}, readonly.raw_data());
}
}
// namespace tv
include/tensorview/tensor.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
tv::Tensor is a lightweight, header-only tensor container without templates
or heavy dependencies. No algorithms are implemented.
It should only be used when you want a simple non-template container but
don't want to link against libtorch.
If you can use libtorch, don't use tv::Tensor.
*/
#pragma once
#include "cc17.h"
#include "mp_helper.h"
#include "tensorview.h"
#include <cstring>
#include <iomanip>
#include <memory>
#include <type_traits>
#ifdef TV_CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif
namespace
tv
{
// Element types a tv::Tensor can hold. The numeric values are spelled out to
// make the (declaration-order) numbering explicit; they are unchanged.
enum DType {
  float32 = 0,
  int32 = 1,
  int16 = 2,
  int8 = 3,
  float64 = 4,
  bool_ = 5,
  uint8 = 6,
  float16 = 7,
  int64 = 8,
  uint16 = 9,
  uint32 = 10,
  uint64 = 11
};
namespace
detail
{
// Compile-time list of every DType enumerator (as int constants), in
// declaration order.
using dtype_collection_t =
    tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8,
                  float16, int64, uint16, uint32, uint64>;
// C++ element types that run-time dispatch iterates over (for example in
// Tensor::fill_). The list was previously written twice under
// #ifdef TV_CUDA / #else with identical contents, so a single alias is
// equivalent. Note that no half-precision type is part of the list.
using all_tensor_types_t =
    std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
               uint16_t, uint32_t, uint64_t, bool>;
// Buffer of `size` elements of type T that backs a tv::Tensor.
//
// The buffer is either allocated here (owning) or wraps an external pointer
// (non-owning, see the pointer constructor). `device == -1` means host memory;
// any other value selects CUDA device memory and requires TV_CUDA.
//
// NOTE(review): the class holds a raw owning pointer but leaves the copy
// operations implicit, so copying an owning instance would free the buffer
// twice. The visible code only holds it through std::shared_ptr; consider
// deleting the copy operations.
template <typename T> class TensorStorage {
public:
  // Allocates storage for `size` elements.
  //   device  -1 for host memory, otherwise device memory (needs TV_CUDA).
  //   managed use cudaMallocManaged instead of cudaMalloc (device only).
  //   pinned  use cudaMallocHost for host memory (needs TV_CUDA).
  // A size of 0 allocates nothing and leaves the storage empty.
  // Raises an invalid-argument error when CUDA-only features are requested in
  // a build without TV_CUDA.
  TensorStorage(size_t size, int device = -1, bool managed = false,
                bool pinned = false)
      : mSize(size), device_(device), managed_(managed), pinned_(pinned) {
    if (size == 0) {
      mPtr = nullptr;
    } else {
      if (device == -1) {
        if (pinned_) {
#ifdef TV_CUDA
          checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
#else
          TV_THROW_INVALID_ARG("you need to define TV_CUDA to use pinned");
#endif
        } else {
          mPtr = new T[size];
        }
      } else {
#ifdef TV_CUDA
        // we should select device in external
        /*
        int deviceCount;
        cudaGetDeviceCount(&deviceCount);
        if (device >= deviceCount) {
          TV_THROW_INVALID_ARG("you provide device ", device,
                               " but you only have ", deviceCount, " device.");
        }
        cudaSetDevice(device);
        */
        if (managed) {
          checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
        } else {
          checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
        }
#else
        TV_THROW_INVALID_ARG("don't compiled with cuda");
#endif
      }
    }
  }

  // Wraps an existing buffer of `size` elements without taking ownership; the
  // destructor will not free it.
  TensorStorage(T *ptr, size_t size, int device)
      : mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}

  // Releases the buffer with the deallocator matching how it was allocated.
  // Empty and non-owning (from-blob) storages are left untouched.
  virtual ~TensorStorage() {
    if (empty()) {
      return;
    }
    if (from_blob_) {
      return;
    }
    if (device_ == -1) {
      if (pinned_) {
#ifdef TV_CUDA
        cudaFreeHost(mPtr);
#endif
      } else {
        delete[] mPtr;
      }
    } else {
#ifdef TV_CUDA
      cudaFree(mPtr);
#endif
    }
  }

  // Number of elements (not bytes).
  inline size_t size() const { return mSize; }

  T *data() { return mPtr; }
  const T *data() const { return mPtr; }

  // True when there is no buffer or it holds zero elements.
  bool empty() const { return mPtr == nullptr || mSize == 0; }

  bool managed() const { return managed_; }
  bool pinned() const { return pinned_; }
  int device() const { return device_; }

  // Sets every byte of the buffer to zero.
  // memset/cudaMemset take a byte count, so the element count is scaled by
  // sizeof(T). The previous code passed `mSize` on the host and
  // `mSize / sizeof(T)` on the device, which cleared only part of the buffer
  // for any T wider than one byte.
  void zero_() {
    if (empty()) {
      return; // nothing to clear; also avoids passing nullptr to memset
    }
    const size_t num_bytes = mSize * sizeof(T);
    if (device_ == -1) {
      std::memset(data(), 0, num_bytes);
      // std::fill(data(), data() + mSize, 0);
    } else {
#ifdef TV_CUDA
      checkCudaErrors(cudaMemset(data(), 0, num_bytes));
#else
      TV_THROW_INVALID_ARG("don't compiled with cuda");
#endif
    }
  }

private:
  size_t mSize = 0;        // element count
  T *mPtr = nullptr;       // start of the buffer (host or device)
  bool from_blob_ = false; // true: buffer is borrowed, never freed here
  int device_ = -1;        // -1 host, otherwise device memory
  bool managed_ = false;   // allocated with cudaMallocManaged
  bool pinned_ = false;    // allocated with cudaMallocHost
};
// Size in bytes of one element of the given dtype. float16 is hard-coded to 2
// because no host half type is required to be available.
// Throws (via TV_THROW_RT_ERR) for an unsupported dtype.
template <typename T> size_t sizeof_dtype(T dtype) {
  size_t num_bytes = 0;
  switch (dtype) {
  case float32:
    num_bytes = sizeof(float);
    break;
  case int8:
    num_bytes = sizeof(int8_t);
    break;
  case int16:
    num_bytes = sizeof(int16_t);
    break;
  case int32:
    num_bytes = sizeof(int32_t);
    break;
  case float64:
    num_bytes = sizeof(double);
    break;
  case int64:
    num_bytes = sizeof(int64_t);
    break;
  case bool_:
    num_bytes = sizeof(bool);
    break;
  case uint8:
    num_bytes = sizeof(uint8_t);
    break;
  case uint16:
    num_bytes = sizeof(uint16_t);
    break;
  case uint32:
    num_bytes = sizeof(uint32_t);
    break;
  case uint64:
    num_bytes = sizeof(uint64_t);
    break;
  case float16:
    num_bytes = 2;
    break;
  default:
    TV_THROW_RT_ERR("unsupported dtype");
  }
  return num_bytes;
}
// Human-readable name of a dtype, used in error messages. Unknown values give
// an empty string.
template <typename T> std::string typeString(T t) {
  const char *label = "";
  switch (t) {
  case DType::bool_:
    label = "bool";
    break;
  case DType::float32:
    label = "float32";
    break;
  case DType::int8:
    label = "int8";
    break;
  case DType::int16:
    label = "int16";
    break;
  case DType::int32:
    label = "int32";
    break;
  case DType::float64:
    label = "float64";
    break;
  case DType::int64:
    label = "int64";
    break;
  case DType::uint8:
    label = "uint8";
    break;
  case DType::uint16:
    label = "uint16";
    break;
  case DType::uint32:
    label = "uint32";
    break;
  case DType::uint64:
    label = "uint64";
    break;
  case DType::float16:
    label = "half";
    break;
  default:
    break;
  }
  return std::string(label);
}
// Compile-time map from a C++ element type to its DType enumerator, exposed
// as TypeToDtypeTraits<T>::dtype. The primary template is intentionally left
// undefined so that an unsupported type is a compile error.
template <typename T> struct TypeToDtypeTraits;

// const T maps to the same dtype as T. One partial specialization replaces
// the former hand-written `const` copy of every entry below, so a new element
// type only has to be registered once.
template <typename T> struct TypeToDtypeTraits<const T> : TypeToDtypeTraits<T> {};

template <> struct TypeToDtypeTraits<int32_t> {
  static constexpr DType dtype = int32;
};

#ifdef TV_CUDA
// Half precision is only mapped when the CUDA __half type is available.
template <> struct TypeToDtypeTraits<__half> {
  static constexpr DType dtype = float16;
};
#endif

template <> struct TypeToDtypeTraits<float> {
  static constexpr DType dtype = float32;
};

template <> struct TypeToDtypeTraits<double> {
  static constexpr DType dtype = float64;
};

template <> struct TypeToDtypeTraits<int16_t> {
  static constexpr DType dtype = int16;
};

template <> struct TypeToDtypeTraits<int8_t> {
  static constexpr DType dtype = int8;
};

template <> struct TypeToDtypeTraits<int64_t> {
  static constexpr DType dtype = int64;
};

template <> struct TypeToDtypeTraits<uint8_t> {
  static constexpr DType dtype = uint8;
};

template <> struct TypeToDtypeTraits<uint16_t> {
  static constexpr DType dtype = uint16;
};

template <> struct TypeToDtypeTraits<uint32_t> {
  static constexpr DType dtype = uint32;
};

template <> struct TypeToDtypeTraits<uint64_t> {
  static constexpr DType dtype = uint64;
};

template <> struct TypeToDtypeTraits<bool> {
  static constexpr DType dtype = bool_;
};
}
// namespace detail
// Shorthand for the DType that corresponds to element type T, e.g.
// type_v<float> == float32. Ill-formed for types without a
// detail::TypeToDtypeTraits specialization.
template <typename T>
constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
// Run-time to compile-time type dispatch. Walks the candidate types Ts in
// order and, for the first one whose dtype equals `t`, calls `f` with a
// default-constructed value of that type.
// Returns true if a candidate matched, false otherwise (never throws itself).
template <class... Ts, typename F> bool dispatch_noexcept(DType t, F &&f) {
  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
  bool matched = false;
  mp_for_each<mp_list<Ts...>>([t, &matched, &f](auto I) {
    using candidate_t = decltype(I);
    if (!matched && type_v<candidate_t> == t) {
      std::forward<F>(f)(candidate_t());
      matched = true;
    }
  });
  return matched;
}
// Throwing variant of dispatch_noexcept: calls `f` for the candidate type in
// Ts that matches `t`, and throws (via TV_THROW_RT_ERR) listing the available
// candidate types when none matches.
template <class... Ts, typename F> void dispatch(DType t, F &&f) {
  const bool handled = dispatch_noexcept<Ts...>(t, std::forward<F>(f));
  if (handled) {
    return;
  }
  // Build the list of supported type names for the error message.
  std::stringstream available;
  mp_for_each<mp_list<Ts...>>([&available](auto I) {
    available << detail::TypeToString<decltype(I)>::value << " ";
  });
  TV_THROW_RT_ERR("unknown type", detail::typeString(t), ", available:",
                  available.str());
}
// Dispatches a run-time value `idx` onto the compile-time constants Is: `f`
// is called with the first constant equal to `idx`.
// Throws (via TV_THROW_RT_ERR) listing the candidates when none matches.
template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
  static_assert(sizeof...(Is) > 0,
                "you need to provide at least one candidate");
  bool matched = false;
  mp_for_each<mp_list_c<T, Is...>>([idx, &matched, &f](auto I) {
    if (!matched && T(I) == idx) {
      std::forward<F>(f)(I);
      matched = true;
    }
  });
  if (matched) {
    return;
  }
  std::stringstream available;
  mp_for_each<mp_list_c<T, Is...>>(
      [&available](auto I) { available << T(I) << " "; });
  TV_THROW_RT_ERR("unknown value", idx, ", available:", available.str());
}
// Dispatches a run-time int onto the compile-time ints Is: `f` is called with
// the first constant equal to `idx`.
// Returns true if a candidate matched, false otherwise.
template <int... Is, typename F> bool dispatch_int_noexcept(int idx, F &&f) {
  static_assert(sizeof...(Is) > 0,
                "you need to provide at least one candidate");
  bool matched = false;
  mp_for_each<mp_list_c<int, Is...>>([idx, &matched, &f](auto I) {
    if (!matched && decltype(I)::value == idx) {
      std::forward<F>(f)(I);
      matched = true;
    }
  });
  return matched;
}
// Predicate variant: `f` is called with the first candidate constant C for
// which `p(idx, C)` is true. The predicate is still evaluated for every
// candidate, including those after the first match.
// Returns true if a candidate matched, false otherwise.
template <int... Is, typename F, class BinaryPredicate>
bool dispatch_int_noexcept(int idx, BinaryPredicate p, F &&f) {
  static_assert(sizeof...(Is) > 0,
                "you need to provide at least one candidate");
  bool pending = true;
  mp_for_each<mp_list_c<int, Is...>>([=, &pending, &f](auto I) {
    using candidate_t = decltype(I);
    // Predicate first, as in the original evaluation order.
    if (p(idx, candidate_t::value) && pending) {
      std::forward<F>(f)(I);
      pending = false;
    }
  });
  return !pending;
}
// Throwing variant of dispatch_int_noexcept: throws (via TV_THROW_RT_ERR)
// listing the candidate values when `idx` equals none of Is.
template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
  const bool handled = dispatch_int_noexcept<Is...>(idx, std::forward<F>(f));
  if (handled) {
    return;
  }
  std::stringstream available;
  mp_for_each<mp_list_c<int, Is...>>(
      [&available](auto I) { available << decltype(I)::value << " "; });
  TV_THROW_RT_ERR("unknown value", idx, ", available:", available.str());
}
// Throwing predicate variant. `p` is called as p(idx, candidate); `f` runs
// for the first candidate accepted by `p`.
// Throws (via TV_THROW_RT_ERR) listing the candidates when none is accepted.
template <int... Is, typename F, class BinaryPredicate>
void dispatch_int(int idx, BinaryPredicate p, F &&f) {
  const bool handled =
      dispatch_int_noexcept<Is...>(idx, p, std::forward<F>(f));
  if (handled) {
    return;
  }
  std::stringstream available;
  mp_for_each<mp_list_c<int, Is...>>(
      [&available](auto I) { available << decltype(I)::value << " "; });
  TV_THROW_RT_ERR("unknown value", idx, ", available:", available.str());
}
// Ts is pack of mp_list_c
// Dispatches a run-time sequence [begin, end) onto compile-time value lists.
// Each Ts is a list of constants; `f` is called with the first list whose
// length and elements equal the run-time sequence.
// Returns true if some list matched, false otherwise.
template <class... Ts, typename Iterator, typename F>
bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
  static_assert(sizeof...(Ts) > 0,
                "you need to provide at least one candidate");
  bool pending = true;
  mp_for_each<mp_list<Ts...>>([=, &pending, &f](auto I) {
    using candidate_list_t = decltype(I);
    auto expected_len = mp_size<candidate_list_t>::value;
    bool same = true;
    std::size_t compared = 0;
    auto cursor = begin;
    // Compare element by element until a mismatch or the run-time sequence
    // is exhausted.
    mp_for_each<candidate_list_t>([&](auto E) {
      if (cursor == end || !same) {
        return;
      }
      if (compared >= expected_len) {
        same = false;
        return;
      }
      constexpr auto expected = decltype(E)::value;
      if (expected != *cursor) {
        same = false;
      }
      ++compared;
      std::advance(cursor, 1);
    });
    // Both sequences must have been consumed completely.
    if (compared != expected_len || cursor != end) {
      same = false;
    }
    if (same && pending) {
      std::forward<F>(f)(I);
      pending = false;
    }
  });
  return !pending;
}
// Throwing variant of dispatch_container_noexcept. When no candidate list
// equals [begin, end), throws (via TV_THROW_RT_ERR) with a message showing the
// run-time values and every candidate list.
template <class... Ts, typename Iterator, typename F>
void dispatch_container(Iterator begin, Iterator end, F &&f) {
  const bool handled =
      dispatch_container_noexcept<Ts...>(begin, end, std::forward<F>(f));
  if (handled) {
    return;
  }
  std::stringstream msg;
  msg << "unknown value [";
  for (auto cursor = begin; cursor != end; std::advance(cursor, 1)) {
    msg << *cursor << ",";
  }
  msg << "], available: ";
  mp_for_each<mp_list<Ts...>>([&msg](auto I) {
    msg << "[";
    mp_for_each<decltype(I)>(
        [&msg](auto E) { msg << decltype(E)::value << ","; });
    msg << "]";
  });
  TV_THROW_RT_ERR(msg.str());
}
/*
template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
return dispatch_scalar<int, Is...>(idx, f);
}
*/
// Functor form of dispatch<...>: Dispatch<List<A, B, ...>>()(dtype, f) is
// dispatch<A, B, ...>(dtype, f). Any variadic class template (std::tuple,
// mp_list, ...) can serve as the type list.
template <class T> struct Dispatch;

template <template <class...> class TypeList, class... Args>
struct Dispatch<TypeList<Args...>> {
  template <typename F> void operator()(DType t, F &&f) {
    return dispatch<Args...>(t, std::forward<F>(f));
  }
};
// Functor form of dispatch_container<...>: the candidate value lists are the
// template arguments of the wrapping list type.
template <class T> struct DispatchContainer;

template <template <class...> class TypeList, class... Args>
struct DispatchContainer<TypeList<Args...>> {
  template <typename Iterator, typename F>
  void operator()(Iterator begin, Iterator end, F &&f) {
    return dispatch_container<Args...>(begin, end, std::forward<F>(f));
  }
};
// Functor form of dispatch_container_noexcept<...>; returns whether a
// candidate list matched instead of throwing.
template <class T> struct DispatchContainerNoexcept;

template <template <class...> class TypeList, class... Args>
struct DispatchContainerNoexcept<TypeList<Args...>> {
  template <typename Iterator, typename F>
  bool operator()(Iterator begin, Iterator end, F &&f) {
    return dispatch_container_noexcept<Args...>(begin, end,
                                                std::forward<F>(f));
  }
};
// Functor form of dispatch_int<...>.
template <class T> struct DispatchInt;

// Args should be std::integral_constant<int, value>
// you need to use type_container<std::integral_constant<int, value>...>
// as template parameter of DispatchInt.
// tv::mp_list_c is ok.
template <template <class...> class TypeList, class... Args>
struct DispatchInt<TypeList<Args...>> {
  // Exact-match dispatch: throws when `t` equals none of the candidates.
  template <typename F> void operator()(int t, F &&f) {
    return dispatch_int<Args::value...>(t, std::forward<F>(f));
  }

  // Predicate dispatch: `p(t, candidate)` decides which candidate is used.
  template <typename F, typename BinaryPredicate>
  void operator()(int t, BinaryPredicate p, F &&f) {
    return dispatch_int<Args::value...>(t, p, std::forward<F>(f));
  }
};
// Functor form of dispatch_int_noexcept<...>; both overloads return whether a
// candidate was selected instead of throwing.
template <class T> struct DispatchIntNoexcept;

template <template <class...> class TypeList, class... Args>
struct DispatchIntNoexcept<TypeList<Args...>> {
  // Exact-match dispatch.
  template <typename F> bool operator()(int t, F &&f) {
    return dispatch_int_noexcept<Args::value...>(t, std::forward<F>(f));
  }

  // Predicate dispatch: `p(t, candidate)` decides which candidate is used.
  template <typename F, typename BinaryPredicate>
  bool operator()(int t, BinaryPredicate p, F &&f) {
    return dispatch_int_noexcept<Args::value...>(t, p, std::forward<F>(f));
  }
};
// Maximum number of dimensions a tv::Tensor shape can hold.
constexpr size_t kTensorMaxDim = 10;

// Shape/stride container used by tv::Tensor: up to kTensorMaxDim extents
// stored as int64_t.
using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
struct
Tensor
{
// Creates a tensor without storage; every member keeps its default value.
Tensor() {}
// Allocates a new tensor with an explicit stride.
//   shape/stride  extents and strides; `shape` must not be empty.
//   dtype         element type; fixes the byte size of the allocation.
//   device        -1 for host memory, otherwise device memory.
//   pinned/managed forwarded to the storage allocator.
// Raises an invalid-argument error for an empty shape.
Tensor(TensorShape shape, TensorShape stride, DType dtype, int device = -1,
       bool pinned = false, bool managed = false)
    : dtype_(dtype) {
  TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
  using storage_t = detail::TensorStorage<uint8_t>;
  // Storage is byte-addressed, so request element count * element size.
  // Note the storage constructor takes (managed, pinned) in that order.
  storage_ = std::make_shared<storage_t>(
      shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
  shape_ = shape;
  stride_ = stride;
}
// Allocates a new tensor with row-major strides derived from `shape`.
//   shape   extents; must not be empty.
//   dtype   element type; fixes the byte size of the allocation.
//   device  -1 for host memory, otherwise device memory.
//   pinned/managed forwarded to the storage allocator.
// Raises an invalid-argument error for an empty shape.
Tensor(TensorShape shape, DType dtype, int device = -1, bool pinned = false,
       bool managed = false)
    : dtype_(dtype) {
  TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
  using storage_t = detail::TensorStorage<uint8_t>;
  // Note the storage constructor takes (managed, pinned) in that order.
  storage_ = std::make_shared<storage_t>(
      shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
  shape_ = shape;
  stride_ = shape.stride_rowmajor();
}
Tensor
(
void
*
ptr
,
TensorShape
shape
,
TensorShape
stride
,
DType
dtype
,
int
device
=
-
1
)
:
dtype_
(
dtype
)
{
TV_ASSERT_INVALID_ARG
(
!
shape
.
empty
(),
"dont support empty shape"
);
storage_
=
std
::
make_shared
<
detail
::
TensorStorage
<
uint8_t
>>
(
reinterpret_cast
<
uint8_t
*>
(
ptr
),
shape
.
size
()
*
detail
::
sizeof_dtype
(
dtype
),
device
);
shape_
=
shape
;
stride_
=
stride
;
}
Tensor
(
void
*
ptr
,
TensorShape
shape
,
DType
dtype
,
int
device
=
-
1
)
:
dtype_
(
dtype
)
{
TV_ASSERT_INVALID_ARG
(
!
shape
.
empty
(),
"dont support empty shape"
);
storage_
=
std
::
make_shared
<
detail
::
TensorStorage
<
uint8_t
>>
(
reinterpret_cast
<
uint8_t
*>
(
ptr
),
shape
.
size
()
*
detail
::
sizeof_dtype
(
dtype
),
device
);
shape_
=
shape
;
stride_
=
shape
.
stride_rowmajor
();
}
Tensor
(
const
void
*
ptr
,
TensorShape
shape
,
TensorShape
stride
,
DType
dtype
,
int
device
=
-
1
)
:
dtype_
(
dtype
),
writeable_
(
false
)
{
TV_ASSERT_INVALID_ARG
(
!
shape
.
empty
(),
"dont support empty shape"
);
storage_
=
std
::
make_shared
<
detail
::
TensorStorage
<
uint8_t
>>
(
reinterpret_cast
<
uint8_t
*>
(
const_cast
<
void
*>
(
ptr
)),
shape
.
size
()
*
detail
::
sizeof_dtype
(
dtype
),
device
);
shape_
=
shape
;
stride_
=
stride
;
}
Tensor
(
const
void
*
ptr
,
TensorShape
shape
,
DType
dtype
,
int
device
=
-
1
)
:
dtype_
(
dtype
),
writeable_
(
false
)
{
TV_ASSERT_INVALID_ARG
(
!
shape
.
empty
(),
"dont support empty shape"
);
storage_
=
std
::
make_shared
<
detail
::
TensorStorage
<
uint8_t
>>
(
reinterpret_cast
<
uint8_t
*>
(
const_cast
<
void
*>
(
ptr
)),
shape
.
size
()
*
detail
::
sizeof_dtype
(
dtype
),
device
);
shape_
=
shape
;
stride_
=
shape
.
stride_rowmajor
();
}
// Literal constructors: build a one-dimensional CPU tensor from a braced list.
// Each one delegates to the allocating constructor with shape {list size} and
// the matching dtype, then copies the values in.

// int32 tensor from a list of int32_t values.
Tensor(std::initializer_list<int32_t> init)
    : Tensor({int(init.size())}, tv::int32) {
  int32_t *dst = data<int32_t>();
  std::copy(init.begin(), init.end(), dst);
}

// int64 tensor from a list of int64_t values.
Tensor(std::initializer_list<int64_t> init)
    : Tensor({int(init.size())}, tv::int64) {
  int64_t *dst = data<int64_t>();
  std::copy(init.begin(), init.end(), dst);
}

// float32 tensor from a list of float values.
Tensor(std::initializer_list<float> init)
    : Tensor({int(init.size())}, tv::float32) {
  float *dst = data<float>();
  std::copy(init.begin(), init.end(), dst);
}

// float64 tensor from a list of double values.
Tensor(std::initializer_list<double> init)
    : Tensor({int(init.size())}, tv::float64) {
  double *dst = data<double>();
  std::copy(init.begin(), init.end(), dst);
}
// Returns a mutable, statically ranked TensorView over this tensor's data.
// Only participates in overload resolution for Rank > 0.
//   T     element type; must match the tensor's dtype.
//   Rank  number of dimensions of the view; must equal ndim().
// Throws a runtime error on a dtype or rank mismatch, and goes through
// writable_check() because the view allows writes.
//
// The rank check is new here: without it a Rank larger than ndim() copied
// shape/stride entries beyond the tensor's real dimensions. The const
// overload already performs the same check.
template <typename T, int Rank = -1,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int,
          typename std::enable_if<(Rank > 0), int>::type = 0>
TensorView<T, Rank, PtrTraits, Tindex> tview() {
  using tv_shape_t =
      typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
  writable_check();
  static_assert(Rank == -1 || Rank > 0, "error");
  TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
  TV_ASSERT_RT_ERR(Rank == int(ndim()), "error");
  tv_shape_t shape(Rank), stride(Rank);
  for (int i = 0; i < Rank; ++i) {
    shape[i] = shape_[i];
    stride[i] = stride_[i];
  }
  return TensorView<T, Rank, PtrTraits, Tindex>(
      reinterpret_cast<T *>(data<T>()), shape, stride);
}
// Returns a read-only TensorView over this tensor's data.
//   T     element type; must match the tensor's dtype.
//   Rank  > 0 for a statically ranked view (must equal ndim()), or -1 for a
//         view whose rank is taken from the tensor at run time.
// Throws a runtime error on a dtype or rank mismatch.
// The two cases are selected at compile time through if_constexpr; the `_`
// argument of each lambda is applied to one expression so that the unused
// branch is not instantiated.
template <typename T, int Rank = -1,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
  static_assert(Rank == -1 || Rank > 0, "error");
  TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
  return if_constexpr<(Rank > 0)>(
      // Static rank: shape and stride hold exactly Rank entries.
      [&](auto _) {
        TV_ASSERT_RT_ERR(Rank == ndim(), "error");
        ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> view_shape(Rank),
            view_stride(Rank);
        for (int axis = 0; axis < Rank; ++axis) {
          view_shape[axis] = shape_[axis];
          view_stride[axis] = stride_[axis];
        }
        return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
                          Tindex>(
            reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
            view_shape, view_stride);
      },
      // Dynamic rank: size the shape and stride from ndim().
      [&](auto _) {
        ShapeBase<TV_MAX_DIM, Tindex> view_shape(_(ndim())),
            view_stride(ndim());
        for (int axis = 0; axis < int(ndim()); ++axis) {
          view_shape[axis] = shape_[axis];
          view_stride[axis] = stride_[axis];
        }
        return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
                          Tindex>(
            reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
            view_shape, view_stride);
      });
}
// Returns a tensor sharing this tensor's storage but with a new shape and
// row-major strides.
//   newShapes  the new extents. At most one of them may be -1; it is replaced
//              by size() divided by the product of the other extents.
// Raises an invalid-argument error when an extent other than the single -1 is
// not positive, and a runtime error when the element count would change.
//
// All extents are validated before the -1 is resolved. Previously the
// division happened as soon as the -1 was met, so a zero extent placed after
// it (e.g. view(-1, 0)) divided by zero before its check ran.
template <class... Inds> Tensor view(Inds... newShapes) const {
  static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
  TensorShape shape{int(newShapes)...};
  bool found_minus_1 = false;
  size_t inferred_axis = 0;
  for (size_t i = 0; i < shape.ndim(); ++i) {
    if (!found_minus_1) {
      if (shape[i] == -1) {
        inferred_axis = i;
        found_minus_1 = true;
      } else {
        TV_ASSERT_INVALID_ARG(shape[i] > 0,
                              "shape except -1 must larger than 0");
      }
    } else {
      TV_ASSERT_INVALID_ARG(shape[i] > 0, "multiple -1 in your argument.");
    }
  }
  if (found_minus_1) {
    // Neutralise the placeholder so shape.size() is the product of the known
    // extents (all positive at this point), then solve for the missing one.
    shape[inferred_axis] = 1;
    shape[inferred_axis] = size() / shape.size();
  }
  TV_ASSERT_RT_ERR(shape.size() == size(), "error");
  Tensor res(*this);
  res.shape_ = shape;
  res.stride_ = shape.stride_rowmajor();
  return res;
}
// Returns a tensor sharing this tensor's storage, reinterpreted with `shape`
// and row-major strides.
// Throws a runtime error when `shape` has a different element count.
Tensor view(TensorShape shape) const {
  TV_ASSERT_RT_ERR(shape.size() == size(), "error");
  Tensor reshaped(*this);
  reshaped.stride_ = shape.stride_rowmajor();
  reshaped.shape_ = shape;
  return reshaped;
}
// Selects index `index` along the first axis and returns the resulting
// sub-tensor (one dimension fewer). The result shares this tensor's storage.
//   index  position along axis 0; a negative value counts from the end.
// Raises an invalid-argument error when the tensor has fewer than two
// dimensions or the index is out of range.
//
// Fixes relative to the previous version:
//  * offset_ is a byte offset (raw_data() adds it to the uint8_t storage
//    pointer) while strides count elements, so the step is scaled by
//    itemsize().
//  * The result starts as a copy of *this, so dtype_ and the other members are
//    carried over instead of being left at their defaults.
//  * An index that is still negative after wrapping is rejected.
Tensor operator[](int64_t index) {
  TV_ASSERT_INVALID_ARG(ndim() > 1, "error");
  if (index < 0) {
    index += dim(0);
  }
  TV_ASSERT_INVALID_ARG(index >= 0 && index < dim(0), "error");
  Tensor res(*this);
  res.shape_ = shape_.subshape(1);
  res.stride_ = stride_.subshape(1);
  res.offset_ = offset_ + index * stride_[0] * itemsize();
  return res;
}
// Returns a view of this tensor with the shape produced by
// TensorShape::squeeze().
Tensor squeeze() const {
  const auto squeezed_shape = shape_.squeeze();
  return view(squeezed_shape);
}
// Returns a view with the shape produced by TensorShape::squeeze(axis).
// A negative axis counts from the end (axis + ndim()).
Tensor squeeze(int axis) const {
  const int resolved_axis = (axis < 0) ? int(ndim() + axis) : axis;
  return view(shape_.squeeze(resolved_axis));
}
// Returns a view with the shape produced by TensorShape::unsqueeze(axis).
// A negative axis is resolved as axis + ndim().
Tensor unsqueeze(int axis) const {
  const int resolved_axis = (axis < 0) ? int(ndim() + axis) : axis;
  return view(shape_.unsqueeze(resolved_axis));
}
// True when the underlying storage reports itself as pinned.
bool pinned() const {
  const bool is_pinned = storage_->pinned();
  return is_pinned;
}
// Returns the half-open range [start, end) along the first axis as a tensor
// sharing this tensor's storage. Only contiguous tensors are supported.
//   start, end  positions along axis 0; negative values count from the end.
// Raises an invalid-argument error when the tensor is not contiguous or the
// range is empty or out of bounds.
//
// Fixes relative to the previous version:
//  * The new byte offset is added to the existing offset_, so slicing a
//    tensor that is itself a slice addresses the right rows.
//  * `start` must be non-negative after wrapping and `end` must not exceed
//    the first extent; both previously gave a window outside the data.
Tensor slice_first_axis(int start, int end) const {
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  if (start < 0) {
    start = shape_[0] + start;
  }
  if (end < 0) {
    end = shape_[0] + end;
  }
  TV_ASSERT_INVALID_ARG(start >= 0, "start must not be negative");
  TV_ASSERT_INVALID_ARG(start < shape_[0], "start must small than dim 0");
  TV_ASSERT_INVALID_ARG(start < end, "start must small than end");
  TV_ASSERT_INVALID_ARG(end <= shape_[0], "end must not exceed dim 0");
  // Bytes to skip: rows before `start` * elements per row * element size.
  size_t skipped_bytes = start * shape_.prod(1) * itemsize();
  Tensor res(*this);
  TensorShape newshape(shape_);
  newshape[0] = end - start;
  res.shape_ = newshape;
  res.stride_ = stride_;
  res.offset_ = offset_ + skipped_bytes;
  return res;
}
// True when the underlying storage holds no data.
bool empty() const {
  const bool no_data = storage_->empty();
  return no_data;
}
// Element type of this tensor.
DType dtype() const { return this->dtype_; }
// Device id reported by the storage (-1 is used for host memory).
int device() const {
  const int device_id = storage_->device();
  return device_id;
}
// Number of dimensions of the tensor.
size_t ndim() const {
  const size_t rank = shape_.ndim();
  return rank;
}
// Extents of the tensor, one entry per dimension.
const TensorShape &shape() const { return this->shape_; }
// Same as shape(): the extents of the tensor.
const TensorShape &sizes() const { return this->shape_; }
// Strides of the tensor, one entry per dimension.
const TensorShape &stride() const { return this->stride_; }
// Extent of dimension `idx`. A negative index counts from the last dimension.
// Throws a runtime error when the index is out of range.
int dim(int idx) const {
  if (idx >= 0) {
    TV_ASSERT_RT_ERR(idx < int(shape_.ndim()), idx, shape_);
    return shape_[idx];
  }
  // Negative index: the unsigned sum wraps around for idx < -ndim(), which
  // makes the comparison fail and reports the error.
  TV_ASSERT_RT_ERR(shape_.ndim() + idx < shape_.ndim(), idx, shape_);
  return shape_[shape_.ndim() + idx];
}
const
uint8_t
*
raw_data
()
const
{
return
storage_
->
data
()
+
offset_
;
}
// Size of the tensor's data in bytes (element count * element size).
size_t raw_size() const {
  const size_t num_elements = size();
  return num_elements * itemsize();
}
// Total number of elements, as reported by the shape.
size_t size() const {
  const size_t num_elements = shape_.size();
  return num_elements;
}
// Extent of dimension `idx`; forwards to dim(idx), so negative indices and
// range errors behave as described there.
size_t size(int64_t idx) const {
  const int extent = dim(idx);
  return extent;
}
// Size in bytes of one element of this tensor's dtype.
size_t itemsize() const {
  const size_t bytes_per_element = detail::sizeof_dtype(dtype_);
  return bytes_per_element;
}
// Sets every byte of this tensor's data to zero and returns *this.
// Goes through writable_check() first.
//
// A tensor that covers its whole storage is cleared through the storage, as
// before. A tensor that is only a window into shared storage (non-zero
// offset_ or fewer bytes than the storage) now clears just its own bytes;
// previously the entire shared buffer was wiped, including data belonging to
// other tensors. The window path requires a contiguous tensor.
Tensor &zero_() {
  writable_check();
  const bool covers_storage =
      offset_ == 0 && raw_size() == storage_->size();
  if (covers_storage) {
    storage_->zero_();
    return *this;
  }
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  if (device() == -1) {
    std::memset(raw_data(), 0, raw_size());
  } else {
#ifdef TV_CUDA
    checkCudaErrors(cudaMemset(raw_data(), 0, raw_size()));
#else
    TV_THROW_INVALID_ARG("don't compiled with cuda");
#endif
  }
  return *this;
}
uint8_t
*
raw_data
()
{
writable_check
();
return
storage_
->
data
()
+
offset_
;
}
// Assigns `value` (converted to the tensor's element type) to every element
// and returns *this. Only CPU tensors are supported.
// Throws a runtime error for a non-CPU tensor and an invalid-argument error
// when T is not convertible to the element type.
template <typename T> Tensor &fill_(T value) {
  writable_check();
  TV_ASSERT_RT_ERR(device() == -1, "error");
  // Resolve the run-time dtype to a concrete element type, then fill.
  Dispatch<detail::all_tensor_types_t>()(dtype_, [&](auto I) {
    using Treal = decltype(I);
    if (!std::is_convertible<T, Treal>::value) {
      TV_THROW_INVALID_ARG("not convertable from", type_s<T>, "to",
                           type_s<Treal>);
    } else {
      Treal *first = reinterpret_cast<Treal *>(raw_data());
      Treal *last = first + size();
      std::fill(first, last, Treal(value));
    }
  });
  return *this;
}
template
<
typename
T
>
T
*
data
()
{
TV_ASSERT_RT_ERR
(
dtype_
==
type_v
<
T
>
,
"error"
);
writable_check
();
return
reinterpret_cast
<
T
*>
(
raw_data
());
}
template
<
typename
T
>
const
T
*
data
()
const
{
TV_ASSERT_RT_ERR
(
dtype_
==
type_v
<
T
>
,
"error"
);
return
reinterpret_cast
<
const
T
*>
(
raw_data
());
}
// data_ptr overloads. The typed forms are aliases of data<T>() (same dtype
// check); the untyped forms return the raw byte pointer as void*.

// Typed mutable pointer; same checks as data<T>().
template <typename T> T *data_ptr() {
  T *typed = data<T>();
  return typed;
}

// Typed read-only pointer; same checks as the const data<T>().
template <typename T> const T *data_ptr() const {
  const T *typed = data<T>();
  return typed;
}

// Untyped mutable pointer to the first byte of the tensor.
void *data_ptr() {
  uint8_t *bytes = raw_data();
  return reinterpret_cast<void *>(bytes);
}

// Untyped read-only pointer to the first byte of the tensor.
const void *data_ptr() const {
  const uint8_t *bytes = raw_data();
  return reinterpret_cast<const void *>(bytes);
}
void
copy_
(
const
Tensor
&
tensor
)
{
writable_check
();
TV_ASSERT_INVALID_ARG
(
contiguous_
,
"only support contiguous for now"
);
TV_ASSERT_RT_ERR
(
!
empty
()
&&
!
tensor
.
empty
(),
"must not empty"
);
TV_ASSERT_RT_ERR
(
size
()
==
tensor
.
size
(),
"must have same size"
);
TV_ASSERT_RT_ERR
(
dtype
()
==
tensor
.
dtype
(),
"must have same dtype"
,
detail
::
typeString
(
dtype
()),
detail
::
typeString
(
tensor
.
dtype
()));
if
(
device
()
==
-
1
&&
tensor
.
device
()
==
-
1
)
{
#ifdef TV_CUDA
host2host
(
storage_
->
data
(),
tensor
.
raw_data
(),
size
()
*
detail
::
sizeof_dtype
(
dtype_
));
#else
std
::
copy
(
tensor
.
raw_data
(),
tensor
.
raw_data
()
+
size
()
*
detail
::
sizeof_dtype
(
dtype_
),
storage_
->
data
());
#endif
}
#ifdef TV_CUDA
else
if
(
device
()
>=
0
&&
tensor
.
device
()
==
-
1
)
{
host2dev
(
storage_
->
data
(),
tensor
.
raw_data
(),
size
()
*
detail
::
sizeof_dtype
(
dtype_
));
}
else
if
(
device
()
==
-
1
&&
tensor
.
device
()
>=
0
)
{
dev2host
(
storage_
->
data
(),
tensor
.
raw_data
(),
size
()
*
detail
::
sizeof_dtype
(
dtype_
));
}
else
if
(
device
()
>=
0
&&
tensor
.
device
()
>=
0
)
{
dev2dev
(
storage_
->
data
(),
tensor
.
raw_data
(),
size
()
*
detail
::
sizeof_dtype
(
dtype_
));
}
#endif
else
{
TV_THROW_RT_ERR
(
"only support cpu tensor"
);
}
}
#ifdef TV_CUDA
// Stream-aware variant of copy_: same preconditions as the single-argument
// overload, but every transfer routine additionally receives `stream`.
//
// Fix: the destination used to be storage_->data(), which ignores offset_,
// while the source side used raw_data(). The destination is now raw_data()
// so that tensors with a non-zero offset are written at the right place.
void copy_(const Tensor &tensor, cudaStream_t stream) {
  writable_check();
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
  TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
  TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
                   detail::typeString(dtype()),
                   detail::typeString(tensor.dtype()));
  // Byte count is identical for every branch; compute it once.
  const auto nbytes = size() * detail::sizeof_dtype(dtype_);
  auto dst = raw_data(); // storage_->data() + offset_
  auto src = tensor.raw_data();
  if (device() == -1 && tensor.device() == -1) {
    host2host(dst, src, nbytes, stream);
  } else if (device() >= 0 && tensor.device() == -1) {
    host2dev(dst, src, nbytes, stream);
  } else if (device() == -1 && tensor.device() >= 0) {
    dev2host(dst, src, nbytes, stream);
  } else if (device() >= 0 && tensor.device() >= 0) {
    dev2dev(dst, src, nbytes, stream);
  } else {
    TV_THROW_RT_ERR("only support cpu tensor");
  }
}
#endif
// Returns a host (device == -1) copy of this tensor. The result never shares
// storage with *this: a host tensor is cloned, anything else is copied into a
// freshly constructed host tensor.
// NOTE(review): the fifth constructor argument below is storage_->managed(),
// whereas clone() passes `pinned` in that position and managed() sixth —
// confirm that this argument order is intended.
Tensor cpu() const {
  const bool already_on_host = (storage_->device() == -1);
  if (already_on_host) {
    // cpu() should always copy tensor.
    return clone();
  }
  Tensor res(shape_, stride_, dtype_, -1, storage_->managed());
  res.copy_(*this);
  return res;
}
// Copies from a TensorView by first wrapping it with from_blob(tensor, device)
// and then delegating to copy_(const Tensor &). This tensor must be writable
// and contiguous.
template <typename T> void copy_(const TensorView<T> &tensor, int device) {
  writable_check();
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  Tensor src = from_blob(tensor, device);
  copy_(src);
}
// Copy assignment: a shallow copy — the storage shared_ptr is shared, not
// duplicated — of all descriptive state.
//
// Fix: contiguous_ was not copied, so the target kept whatever value it had
// before the assignment and could disagree with the source's layout.
Tensor &operator=(const Tensor &tensor) {
  dtype_ = tensor.dtype_;
  storage_ = tensor.storage_;
  shape_ = tensor.shape_;
  writeable_ = tensor.writeable_;
  offset_ = tensor.offset_;
  stride_ = tensor.stride_;
  contiguous_ = tensor.contiguous_;
  return *this;
}
Tensor
(
const
Tensor
&
tensor
)
{
dtype_
=
tensor
.
dtype_
;
storage_
=
tensor
.
storage_
;
shape_
=
tensor
.
shape_
;
writeable_
=
tensor
.
writeable_
;
offset_
=
tensor
.
offset_
;
stride_
=
tensor
.
stride_
;
}
// Deep copy: allocates a new tensor with the same shape, stride, dtype and
// device (optionally pinned) and copies this tensor's contents into it.
// The tensor must be non-empty and contiguous.
Tensor clone(bool pinned = false) const {
  TV_ASSERT_RT_ERR(!empty(), "clone a empty tensor");
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  const auto target_device = device();
  const auto is_managed = storage_->managed();
  Tensor newtensor(shape_, stride_, dtype_, target_device, pinned, is_managed);
  newtensor.copy_(*this);
  return newtensor;
}
// Returns a new tensor whose elements are this tensor's elements converted to
// `dtype`. When `dtype` already matches, the result is a plain clone().
// Otherwise the tensor must be a non-empty, contiguous CPU tensor.
//
// Both the destination and the current dtype are resolved at runtime through
// nested Dispatch calls; a pair of element types that is not convertible
// raises an invalid-argument error.
Tensor astype(DType dtype) {
  if (dtype == dtype_) {
    return clone();
  }
  TV_ASSERT_INVALID_ARG(device() == -1, "only support cpu tensor");
  TV_ASSERT_INVALID_ARG(!empty(), "can't be used in empty tensor");
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  Tensor converted;
  Dispatch<detail::all_tensor_types_t>()(dtype, [&](auto dst_tag) {
    using Tdst = decltype(dst_tag);
    Dispatch<detail::all_tensor_types_t>()(this->dtype_, [&](auto src_tag) {
      using Tcur = decltype(src_tag);
      const bool convertible = std::is_convertible<Tcur, Tdst>::value;
      if (!convertible) {
        TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to",
                             type_s<Tdst>);
      } else {
        auto src_first = this->data<Tcur>();
        converted =
            Tensor(this->shape_, this->stride_, dtype, this->device(),
                   this->pinned(), this->storage_->managed());
        auto src_last = src_first + this->size();
        std::copy(src_first, src_last, converted.data<Tdst>());
      }
    });
  });
  return converted;
}
// Convenience wrapper: runs tv::dispatch<Ts...> on this tensor's runtime
// dtype, perfectly forwarding the callable.
template <class... Ts, typename F> inline void dispatch(F &&f) {
  tv::dispatch<Ts...>(dtype_, std::forward<F>(f));
}
protected:
// Guard used by every mutating member: raises a runtime error when the tensor
// has been flagged as not writable.
inline void writable_check() {
  TV_ASSERT_RT_ERR(writeable_,
                   "you cant do non-const operation when not writable");
}
DType
dtype_
;
std
::
shared_ptr
<
detail
::
TensorStorage
<
uint8_t
>>
storage_
;
TensorShape
shape_
;
size_t
offset_
=
0
;
TensorShape
stride_
;
private:
bool
writeable_
=
true
;
bool
contiguous_
=
true
;
};
// Streams a textual representation of a CPU tensor into `os`.
//
// The element type is resolved at runtime through Dispatch; the tensor is then
// viewed as a dynamic-rank TensorView and its repr() is written. float and
// double tensors are formatted with a precision of 4.
template <typename Os> Os &operator<<(Os &os, const Tensor &tensor) {
  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
  Dispatch<detail::all_tensor_types_t>()(tensor.dtype(), [&](auto type_tag) {
    using T = decltype(type_tag);
    std::stringstream ss;
    const bool is_float = std::is_same<T, float>::value;
    const bool is_double = std::is_same<T, double>::value;
    if (is_float || is_double) {
      ss << std::setprecision(4);
    }
    os << tensor.tview<T, -1, DefaultPtrTraits, int64_t>().repr(ss);
  });
  return os;
}
// Builds a Tensor from an existing mutable buffer by forwarding all arguments
// to the matching Tensor constructor.
// NOTE(review): presumably the tensor does not take ownership of `ptr` —
// confirm against the Tensor(void *, ...) constructor.
inline Tensor from_blob(void *ptr, TensorShape shape, DType dtype,
                        int device) {
  return Tensor(ptr, shape, dtype, device);
}

// Same as above for a read-only buffer; forwards to the Tensor constructor
// taking a const pointer.
inline Tensor from_blob(const void *ptr, TensorShape shape, DType dtype,
                        int device) {
  return Tensor(ptr, shape, dtype, device);
}
}
// namespace tv
\ No newline at end of file
include/tensorview/tensorview.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "common.h"
#include "prettyprint.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>
#ifdef TV_CUDA
#include <cuda_runtime_api.h>
#endif
namespace
tv
{
// Function-qualifier macros. Three configurations are distinguished:
//  * a CUDA compilation (clang in CUDA mode or nvcc): functions are compiled
//    for both host and device;
//  * NVRTC (__CUDACC_RTC__): TV_HOST_DEVICE_INLINE is device-only;
//  * everything else: plain host code, the device qualifiers expand to
//    `inline` / nothing and TV_DEVICE_INLINE is not defined.
// TV_ASSERT maps to assert() in every configuration.
#if (defined(__clang__) && defined(__CUDA__)) || defined(__NVCC__)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
// TV_REQUIRE(expr, fmt, ...): when `expr` is false, prints the printf-style
// message and then asserts on `expr`. Because the failure is reported through
// assert(), a build with NDEBUG only prints the message and continues.
// The macro expands to a bare { } block, not do { } while (0).
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
// TV_CHECK_CUDA_ERR(): queries cudaGetLastError() and, on failure, throws
// std::runtime_error carrying the file, line and numeric error code. The
// message is also handed to TV_BACKTRACE_PRINT before throwing.
#define TV_CHECK_CUDA_ERR() \
{ \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
} \
}
// TV_CHECK_CUDA_ERR_V2(...): like TV_CHECK_CUDA_ERR, but the message also
// contains cudaGetErrorString() and the extra arguments, appended through
// tv::sstream_print.
#define TV_CHECK_CUDA_ERR_V2(...) \
{ \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \
__macro_s << " " << cudaGetErrorString(__macro_err) << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#ifdef TV_CUDA
// Execution-context tag for CUDA code paths; carries the stream to run on.
// getStream() is virtual so that derived contexts can supply their own stream.
struct GPU {
  // Implicit conversion from a stream is kept on purpose for backward
  // compatibility; the default is the null stream.
  GPU(cudaStream_t s = 0) : mStream(s) {}
  // The type is polymorphic, so give it a virtual destructor: deleting a
  // derived context through a GPU* was undefined behaviour without it.
  virtual ~GPU() = default;
  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
};
#endif
// Empty execution-context tag for host code paths.
struct CPU {};
// Upper bound on the number of dimensions; can be overridden before this
// header is included.
#ifndef TV_MAX_DIM
#define TV_MAX_DIM 6
#endif
// Default pointer policy: a plain, unqualified T*.
template <typename T> struct DefaultPtrTraits {
  using type = T *;
};
#if defined(__CUDACC__) || defined(__HIPCC__)
// Pointer policy for CUDA/HIP compilations: a __restrict__-qualified T*.
template <typename T> struct RestrictPtrTraits {
  using type = T *__restrict__;
};
#endif
/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
{
if (ndim * sizeof(T) == 1)
return 1;
else if (ndim * sizeof(T) == 2)
return 2;
else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
return 4;
else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
return 8;
else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
return 16;
else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
return 32;
else
return 64;
}
*/
namespace detail {
// SFINAE helper: names `void` when _InIter's iterator category is convertible
// to std::input_iterator_tag, and is a substitution failure otherwise.
template <typename _InIter>
using _RequireInputIter = std::enable_if_t<std::is_convertible<
    typename std::iterator_traits<_InIter>::iterator_category,
    std::input_iterator_tag>::value>;
} // namespace detail
// Fixed-capacity vector: up to MaxDim elements of T stored inline, no heap
// allocation, usable from host and device code.
//
// Capacity violations are reported through TV_ASSERT (or, for push_back /
// pop_back / operator[], only when TV_DEBUG is defined); the iterator-range
// constructor raises an invalid-argument error instead.
//
// Changes relative to the previous revision:
//  * the (count, init) constructor now asserts count <= MaxDim like its
//    siblings instead of writing past the array unchecked;
//  * iterator pre-increment returns a reference, and the dereference
//    operators are const-qualified, as the iterator requirements expect;
//  * the debug bounds check no longer mixes signed and unsigned operands.
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public:
  typedef size_t size_type;

  TV_HOST_DEVICE_INLINE SimpleVector() {}

  // `count` copies of `init`. Deliberately not explicit: ShapeBase relies on
  // the implicit size_t -> SimpleVector conversion.
  TV_HOST_DEVICE_INLINE SimpleVector(size_t count, T init = T())
      : size_(count) {
    TV_ASSERT(count <= MaxDim);
    for (size_t i = 0; i < count; ++i) {
      array_[i] = init;
    }
  }

  // Copies [first, last); raises an invalid-argument error when the range
  // holds more than MaxDim elements.
  template <typename Iterator, typename = detail::_RequireInputIter<Iterator>>
  SimpleVector(Iterator first, Iterator last) {
    size_ = 0;
    for (; first != last; ++first) {
      if (size_ >= MaxDim) {
        TV_THROW_INVALID_ARG("iterator too long");
      }
      array_[size_++] = *first;
    }
  }

  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
    TV_ASSERT(q.size() <= MaxDim);
    size_ = 0;
    for (T s : q) {
      array_[size_++] = s;
    }
  }

  SimpleVector(const std::vector<T> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      array_[i] = arr[i];
    }
    size_ = arr.size();
  }

  // Copies only the live elements, not the whole backing array.
  TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      array_[i] = arr[i];
    }
    size_ = arr.size();
  }

  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && static_cast<size_t>(idx) < size_);
#endif
    return array_[idx];
  }

  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && static_cast<size_t>(idx) < size_);
#endif
    return array_[idx];
  }

  TV_HOST_DEVICE_INLINE void push_back(T s) {
#ifdef TV_DEBUG
    TV_ASSERT(size_ < MaxDim);
#endif
    array_[size_] = s;
    size_++;
  }

  TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
    TV_ASSERT(size_ > 0);
#endif
    size_--;
  }

  // Number of live elements.
  TV_HOST_DEVICE_INLINE size_t size() const { return size_; }

  TV_HOST_DEVICE_INLINE const T *data() const { return array_; }

  TV_HOST_DEVICE_INLINE T *data() { return array_; }

  // Non-zero when the vector holds no elements (return type kept as size_t
  // for source compatibility).
  TV_HOST_DEVICE_INLINE size_t empty() const { return size_ == 0; }

  // Forward iterator over the live elements (thin wrapper around T*).
  class iterator {
  public:
    typedef iterator self_type;
    typedef T value_type;
    typedef T &reference;
    typedef T *pointer;
    typedef std::forward_iterator_tag iterator_category;
    typedef std::ptrdiff_t difference_type;
    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the position before advancing.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type previous = *this;
      ptr_++;
      return previous;
    }
    // Pre-increment: advances and returns *this by reference.
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

  private:
    pointer ptr_;
  };

  // Read-only counterpart of `iterator` (thin wrapper around const T*).
  class const_iterator {
  public:
    typedef const_iterator self_type;
    typedef T value_type;
    typedef const T &reference;
    typedef const T *pointer;
    typedef std::ptrdiff_t difference_type;
    typedef std::forward_iterator_tag iterator_category;
    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the position before advancing.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type previous = *this;
      ptr_++;
      return previous;
    }
    // Pre-increment: advances and returns *this by reference.
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

  private:
    pointer ptr_;
  };

  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(array_); }
  TV_HOST_DEVICE_INLINE iterator end() { return iterator(array_ + size_); }
  TV_HOST_DEVICE_INLINE const_iterator begin() const {
    return const_iterator(array_);
  }
  TV_HOST_DEVICE_INLINE const_iterator end() const {
    return const_iterator(array_ + size_);
  }
  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
    return const_iterator(array_);
  }
  TV_HOST_DEVICE_INLINE const_iterator cend() const {
    return const_iterator(array_ + size_);
  }

protected:
  T array_[MaxDim]; // only the first size_ entries are meaningful
  size_t size_ = 0;
};
// Element-wise equality: two vectors are equal when they hold the same number
// of elements and no pair of corresponding elements compares unequal.
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  const size_t count = lfs.size();
  if (count != rfs.size()) {
    return false;
  }
  size_t pos = 0;
  while (pos < count) {
    if (lfs[pos] != rfs[pos]) {
      return false;
    }
    ++pos;
  }
  return true;
}
// Inequality, defined as the negation of operator==.
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  const bool same = (lfs == rfs);
  return !same;
}
// Up to three integers describing a slice; entries that were not supplied
// hold -1.
// NOTE(review): presumably the three slots are (start, end, step) — confirm
// against the code that consumes Slice.
//
// Fix: the initializer_list constructor only guarded its length with
// TV_ASSERT, so with assertions compiled out a list longer than three wrote
// past slices_. The copy loop is now bounded as well.
struct Slice {
  // Builds a slice from up to three integer-like values.
  template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    // One extra slot keeps the array non-empty for every pack size.
    const int values[sizeof...(ints) + 1] = {int(ints)..., -1};
    for (size_t i = 0; i < sizeof...(ints); ++i) {
      slices_[i] = values[i];
    }
  }

  // All three entries unspecified (-1).
  TV_HOST_DEVICE_INLINE Slice() {}

  template <typename T>
  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
    TV_ASSERT(slice.size() <= 3);
    int idx = 0;
    for (T s : slice) {
      if (idx >= 3) {
        break; // never write past slices_, even when asserts are disabled
      }
      slices_[idx] = int(s);
      ++idx;
    }
  }

  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return slices_[idx];
  }

  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return slices_[idx];
  }

protected:
  int slices_[3] = {-1, -1, -1};
};
// Tensor shape: a SimpleVector of extents plus shape arithmetic.
//
// Note that size() is the product of all extents (element count), hiding
// SimpleVector::size(); use ndim() for the number of dimensions.
//
// Changes relative to the previous revision:
//  * the copy constructor was declared for ShapeBase<MaxDim>, i.e. always
//    ShapeBase<MaxDim, int>; it now takes ShapeBase<MaxDim, Tindex>, which is
//    identical for Tindex == int and well-formed for every other Tindex;
//  * unsqueeze(dim) with dim == ndim() now appends the new axis instead of
//    silently returning the shape unchanged;
//  * stride_rowmajor() is const-qualified (it never modified the shape).
template <size_t MaxDim = TV_MAX_DIM, typename Tindex = int>
struct ShapeBase : public SimpleVector<Tindex, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<Tindex, MaxDim>() {}
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<Tindex> shape)
      : SimpleVector<Tindex, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<Tindex, MaxDim> vec)
      : SimpleVector<Tindex, MaxDim>(vec) {}
  template <typename T, template <class...> class Container>
  ShapeBase(Container<T> shape) : SimpleVector<Tindex, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim, Tindex> &shape)
      : SimpleVector<Tindex, MaxDim>(shape) {}
  ShapeBase(const std::vector<Tindex> &arr)
      : SimpleVector<Tindex, MaxDim>(arr) {}

  ShapeBase<MaxDim, Tindex> &
  operator=(const ShapeBase<MaxDim, Tindex> &shape) = default;

  // Extents in the half-open axis range [start, end).
  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start,
                                                    Tindex end) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && end <= this->size_ && end > start);
#endif
    ShapeBase<MaxDim, Tindex> shape;
    for (Tindex i = start; i < end; ++i) {
      shape.push_back(this->array_[i]);
    }
    return shape;
  }

  // Extents from axis `start` to the last axis.
  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && start <= this->size_);
#endif
    ShapeBase<MaxDim, Tindex> shape;
    for (size_t i = start; i < this->size_; ++i) {
      shape.push_back(this->array_[i]);
    }
    return shape;
  }

  // Product of all extents; 0 for a shape without dimensions.
  TV_HOST_DEVICE size_t size() const {
    if (this->size_ == 0)
      return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->size_); ++i) {
      s *= this->array_[i];
    }
    return s;
  }

  // Number of dimensions.
  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->size_; }

  // Drops every axis of extent 1; a shape that would become empty is
  // returned as {1}.
  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> squeeze() const {
    ShapeBase<MaxDim, Tindex> shape;
    for (size_t i = 0; i < this->size_; ++i) {
      if (this->array_[i] != 1)
        shape.push_back(this->array_[i]);
    }
    if (shape.empty()) {
      // dont support empty shape for now
      shape.push_back(1);
    }
    return shape;
  }

  // Drops axis `dim` when its extent is 1; otherwise returns the shape as is.
  template <size_t MaxDim2 = MaxDim>
  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> squeeze(int dim) const {
    static_assert(MaxDim2 >= MaxDim - 1, "error");
    ShapeBase<MaxDim2, Tindex> shape;
    for (size_t i = 0; i < this->size_; ++i) {
      if (i != size_t(dim) || this->array_[i] != 1)
        shape.push_back(this->array_[i]);
    }
    return shape;
  }

  // Inserts an axis of extent 1 before axis `dim`; dim == ndim() appends it.
  template <size_t MaxDim2 = MaxDim>
  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> unsqueeze(int dim) const {
    static_assert(MaxDim2 >= MaxDim - 1, "error");
    ShapeBase<MaxDim2, Tindex> shape;
    for (size_t i = 0; i < this->size_; ++i) {
      if (i == size_t(dim))
        shape.push_back(1);
      shape.push_back(this->array_[i]);
    }
    // The loop above never reaches i == ndim(), so handle the trailing
    // position explicitly.
    if (size_t(dim) == this->size_)
      shape.push_back(1);
    return shape;
  }

  // Product of the extents from axis `start` onwards (1 when there are none).
  TV_HOST_DEVICE size_t prod(Tindex start = 0) const {
    size_t res = 1;
    for (size_t i = start; i < this->size_; ++i) {
      res *= this->array_[i];
    }
    return res;
  }

  // Strides (in elements) of a densely packed row-major layout of this shape:
  // the last axis has stride 1.
  template <size_t MaxDim2 = MaxDim>
  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> stride_rowmajor() const {
    static_assert(MaxDim2 >= MaxDim, "error");
    Tindex p = Tindex(1);
    // ndim() zero-initialised entries, filled from the back below.
    ShapeBase<MaxDim2, Tindex> res(
        SimpleVector<Tindex, MaxDim2>(this->size_));
    for (Tindex i = static_cast<Tindex>(this->size_) - 1; i >= 0; --i) {
      res[i] = p;
      p *= this->array_[i];
    }
    return res;
  }
};

using Shape = ShapeBase<TV_MAX_DIM, int>;
// Row-major linear offset of a multi-index given as separate arguments, for a
// shape stored in a std::vector<int>. The last index varies fastest.
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           Inds... indexes) {
  unsigned linear = 0;
  unsigned scale = 1;
  int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
  TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
  // Walk the axes from innermost to outermost, accumulating the running
  // product of extents in `scale`.
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
    linear += scale * indexes_vec[axis];
    scale *= shape[axis];
  }
  return linear;
}

// Same computation with the multi-index supplied as a std::vector<int>; the
// number of axes is taken from `shape`.
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           std::vector<int> &indexes_vec) {
  unsigned linear = 0;
  unsigned scale = 1;
  for (int axis = shape.size() - 1; axis >= 0; --axis) {
    linear += scale * indexes_vec[axis];
    scale *= shape[axis];
  }
  return linear;
}
// Row-major linear offset of a multi-index given as separate arguments, for a
// tv::Shape. The last index varies fastest.
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           Inds... indexes) {
  unsigned linear = 0;
  unsigned scale = 1;
  int indexes_vec[sizeof...(indexes)] = {indexes...};
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
    linear += scale * indexes_vec[axis];
    scale *= shape[axis];
  }
  return linear;
}

// Same computation with the multi-index supplied as a tv::Shape; the number
// of axes is taken from `indexes_vec`.
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           const Shape &indexes_vec) {
  unsigned linear = 0;
  unsigned scale = 1;
  for (int axis = indexes_vec.ndim() - 1; axis >= 0; --axis) {
    linear += scale * indexes_vec[axis];
    scale *= shape[axis];
  }
  return linear;
}
// Row-major linear offset for raw arrays with a compile-time number of axes.
// Note the argument order: the multi-index comes first, the shape second.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
                                           const Index *shape) {
  unsigned linear = 0;
  unsigned scale = 1;
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int axis = static_cast<int>(NDim) - 1; axis >= 0; --axis) {
    linear += scale * indexes[axis];
    scale *= shape[axis];
  }
  return linear;
}
// Inverse of rowArrayIdx for a compile-time number of axes: decomposes the
// row-major linear `index` into per-axis coordinates written to `output`.
// Returns what is left of `index` after all axes have been peeled off
// (0 when the index was inside the shape).
//
// Fix: `#pragma unroll` is now guarded by __CUDA_ARCH__ like every other
// unrolled loop in this file, so host compilers no longer see an unknown
// pragma.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
                                           const Index *shape) {
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
  for (int i = NDim - 1; i >= 0; --i) {
    output[i] = index % shape[i];
    index -= output[i];
    index /= shape[i];
  }
  return index;
}

// Same decomposition with the number of axes given at runtime.
template <typename Index>
TV_HOST_DEVICE Index rowArrayIdxInv(Index index, Index *output,
                                    const Index *shape, int ndim) {
  for (int i = ndim - 1; i >= 0; --i) {
    output[i] = index % shape[i];
    index -= output[i];
    index /= shape[i];
  }
  return index;
}
// Compile-time recursive offset computation in which the FIRST index passed
// is the fastest-varying one: run(shape, i0, i1, ...) evaluates to
//   i0 + shape[N-1] * (i1 + shape[N-2] * (...)).
//
// Fix: runShape() recursed into run(), which expects a pointer, while handing
// it a `const Shape &`; any instantiation with more than one index failed to
// compile. It now recurses into runShape().
template <int N> struct ArrayIndexRowMajorReverse {
  template <typename TShape, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T index,
                                            Ts... inds) {
    return index +
           shape[N - 1] * ArrayIndexRowMajorReverse<N - 1>::run(shape, inds...);
  }
  template <typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T index,
                                                 Ts... inds) {
    return index + shape[N - 1] * ArrayIndexRowMajorReverse<N - 1>::runShape(
                                      shape, inds...);
  }
};

// Recursion end: a single remaining index is the offset itself.
template <> struct ArrayIndexRowMajorReverse<1> {
  template <typename TShape, typename T>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T idx) {
    return idx;
  }
  template <typename T>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T idx) {
    return idx;
  }
};
// Compile-time recursive row-major offset computation.
// N is the number of indices still to be consumed, Ndim the total rank; each
// step adds the next index to the running value and scales the sum by the
// extent of the following axis.
template <int N, int Ndim> struct ArrayIndexRowMajor {
  // this array index provide almost same compiled code. compile it in
  // https://godbolt.org/ for more details.

  // Shape given as a raw pointer, indices as separate arguments.
  template <typename TShape, typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
                                            T index, Ts... inds) {
    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
    return Next::run(shape, (index + start) * shape[Ndim - N + 1], inds...);
  }

  // Shape given as tv::Shape, indices as separate arguments.
  template <typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned
  runShape(const Shape &shape, Tinit start, T index, Ts... inds) {
    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
    return Next::runShape(shape, (index + start) * shape[Ndim - N + 1],
                          inds...);
  }

  // Shape and indices both given as raw pointers.
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
    return Next::runPtrs(indexes, shape,
                         (indexes[Ndim - N] + start) * shape[Ndim - N + 1]);
  }
};

// Recursion end: the last index is added without further scaling.
template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
  template <typename TShape, typename Tinit, typename T>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
                                            T idx) {
    return start + idx;
  }
  template <typename Tinit, typename T>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
                                                 Tinit start, T idx) {
    return start + idx;
  }
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    return start + indexes[Ndim - 1];
  }
};

// Rank-0 case: there is nothing to index, the offset is always 0.
template <> struct ArrayIndexRowMajor<0, 0> {
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start) {
    return 0;
  }
  template <typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
                                                 Tinit start) {
    return 0;
  }
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    return 0;
  }
};
// Compile-time recursive strided offset computation:
//   run(stride, start, i0, ..., i{Ndim-1}) == start + sum_k i_k * stride[k].
// N is the number of indices still to be consumed, so the index handled at
// this level belongs to axis Ndim - N.
//
// Fix: the primary template multiplied by stride[Ndim - N + 1], i.e. the
// stride of the NEXT axis. For rank 2 that produced
// i0 * stride[1] + i1 * stride[1], disagreeing both with the <1, Ndim>
// specialization below and with TensorAccesser::operator[], which pairs the
// first index with stride[0].
template <int N, int Ndim> struct ArrayIndexStride {
  // this array index provide almost same compiled code. compile it in
  // https://godbolt.org/ for more details.
  template <typename TShape, typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
                                            T index, Ts... inds) {
    return ArrayIndexStride<N - 1, Ndim>::run(
        stride, start + index * stride[Ndim - N], inds...);
  }
};

// Recursion end: the last index pairs with the last stride.
template <int Ndim> struct ArrayIndexStride<1, Ndim> {
  template <typename TShape, typename Tinit, typename T>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
                                            T idx) {
    return start + idx * stride[Ndim - 1];
  }
};
#if __cplusplus >= 201703L
// C++17 fold-expression variant of the strided offset: for every axis number
// in the pack N, multiplies stride[N] with the N-th index argument and sums
// the products.
template <size_t... N, class T, class... Ts>
TV_HOST_DEVICE_INLINE T array_index_stride(const T *stride, Ts... ids) {
  // Pack the indices once so they can be addressed by position.
  auto packed_ids = std::forward_as_tuple(ids...);
  return ((stride[N] * std::get<N>(packed_ids)) + ...);
}
#endif
namespace detail {
// Maps an element type to its short dtype name ("float", "int32", ...).
// The primary template is intentionally left undefined so that asking for the
// name of an unsupported type is a compile-time error.
template <typename T> struct TypeToString;

// A const-qualified type has the same name as the unqualified type. This one
// partial specialization replaces the former per-type `const` copies and also
// covers any type that gains a specialization later.
template <typename T> struct TypeToString<const T> : TypeToString<T> {};

template <> struct TypeToString<bool> {
  static constexpr const char *value = "bool";
};
template <> struct TypeToString<int32_t> {
  static constexpr const char *value = "int32";
};
template <> struct TypeToString<float> {
  static constexpr const char *value = "float";
};
template <> struct TypeToString<double> {
  static constexpr const char *value = "double";
};
template <> struct TypeToString<int16_t> {
  static constexpr const char *value = "int16";
};
template <> struct TypeToString<int8_t> {
  static constexpr const char *value = "int8";
};
template <> struct TypeToString<int64_t> {
  static constexpr const char *value = "int64";
};
template <> struct TypeToString<uint8_t> {
  static constexpr const char *value = "uint8";
};
template <> struct TypeToString<uint16_t> {
  static constexpr const char *value = "uint16";
};
template <> struct TypeToString<uint32_t> {
  static constexpr const char *value = "uint32";
};
template <> struct TypeToString<uint64_t> {
  static constexpr const char *value = "uint64";
};
} // namespace detail

// Short dtype name of T as a C string, e.g. type_s<float> is "float".
template <typename T>
constexpr const char *type_s = detail::TypeToString<T>::value;
namespace detail {
// Common part of TensorAccesser: a data pointer plus a pointer to the stride
// of each of the Rank axes. Neither pointer is owned.
template <typename T, int Rank,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int>
struct TensorAccesserBase {
  static constexpr int rank_value = Rank;
  using ptr_t = typename PtrTraits<T>::type;
  static_assert(Rank > 0, "error");

  explicit TV_HOST_DEVICE_INLINE TensorAccesserBase(ptr_t ptr,
                                                    const Tindex *stride_ptr)
      : ptr_(ptr), stride_ptr_(stride_ptr) {}

  TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }

  TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }

  // Element access with one index per axis; the offset is computed by
  // ArrayIndexStride from the stored strides.
  template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
    static_assert(sizeof...(inds) == Rank, "error");
    const unsigned offset =
        ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...);
    return ptr_[offset];
  }

  // Read-only counterpart of the call operator above.
  template <class... Inds>
  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
    static_assert(sizeof...(inds) == Rank, "error");
    const unsigned offset =
        ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...);
    return ptr_[offset];
  }

protected:
  ptr_t ptr_;
  const Tindex *stride_ptr_;
};
} // namespace detail
// Strided accessor of compile-time rank. Indexing with operator[] peels off
// the leading axis and yields an accessor of rank Rank - 1 that points at the
// selected sub-tensor and at the remaining strides.
template <typename T, int Rank,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int>
struct TensorAccesser
    : public detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex> {
  using ptr_t = typename PtrTraits<T>::type;
  static_assert(Rank > 0, "error");

  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
                                                const Tindex *stride_ptr)
      : detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex>(ptr,
                                                               stride_ptr) {}

  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
  operator[](int i) {
    using Sub = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
    return Sub(this->ptr_ + this->stride_ptr_[0] * i, this->stride_ptr_ + 1);
  }

  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
  operator[](int i) const {
    using Sub = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
    return Sub(this->ptr_ + this->stride_ptr_[0] * i, this->stride_ptr_ + 1);
  }
};

// Rank-1 accessor: operator[] yields the element itself, located
// stride_ptr_[0] * i elements after the data pointer.
template <typename T, template <class> class PtrTraits, typename Tindex>
struct TensorAccesser<T, 1, PtrTraits, Tindex>
    : public detail::TensorAccesserBase<T, 1, PtrTraits, Tindex> {
  using ptr_t = typename PtrTraits<T>::type;

  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
                                                const Tindex *stride_ptr)
      : detail::TensorAccesserBase<T, 1, PtrTraits, Tindex>(ptr, stride_ptr) {}

  TV_HOST_DEVICE_INLINE T &operator[](int i) {
    const auto offset = this->stride_ptr_[0] * i;
    return this->ptr_[offset];
  }

  TV_HOST_DEVICE_INLINE T &operator[](int i) const {
    const auto offset = this->stride_ptr_[0] * i;
    return this->ptr_[offset];
  }
};
template
<
typename
T
,
int
Rank
=
-
1
,
template
<
class
>
class
PtrTraits
=
DefaultPtrTraits
,
typename
Tindex
=
int
>
struct
TensorView
{
static
constexpr
int
rank_value
=
Rank
;
using
ptr_t
=
typename
PtrTraits
<
T
>::
type
;
using
tv_shape_t
=
ShapeBase
<
Rank
==
-
1
?
TV_MAX_DIM
:
Rank
,
Tindex
>
;
using
no_cv_type
=
typename
std
::
remove_cv
<
T
>::
type
;
static_assert
(
Rank
==
-
1
||
Rank
>
0
,
"error"
);
TV_HOST_DEVICE_INLINE
TensorView
()
{}
explicit
TV_HOST_DEVICE_INLINE
TensorView
(
ptr_t
ptr
,
tv_shape_t
shape
)
:
ptr_
(
ptr
),
shape_
(
shape
),
stride_
(
shape
.
stride_rowmajor
())
{}
explicit
TV_HOST_DEVICE_INLINE
TensorView
(
ptr_t
ptr
,
tv_shape_t
shape
,
tv_shape_t
stride
)
:
ptr_
(
ptr
),
shape_
(
shape
),
stride_
(
stride
)
{}
operator
TensorView
<
const
no_cv_type
,
Rank
,
PtrTraits
,
Tindex
>
()
{
return
TensorView
<
const
no_cv_type
,
Rank
,
PtrTraits
,
Tindex
>
(
ptr_
,
shape_
);
}
// conversion function
template
<
class
...
Inds
>
TV_HOST_DEVICE_INLINE
T
&
operator
()(
Inds
...
inds
)
{
static_assert
(
Rank
==
-
1
||
sizeof
...(
inds
)
==
Rank
,
"error"
);
#if defined TV_DEBUG
int
idxes
[
sizeof
...(
Inds
)]{
int
(
inds
)...};
TV_REQUIRE
(
sizeof
...(
inds
)
==
shape_
.
ndim
(),
"you provide %d indexes, but dim is %d
\n
"
,
sizeof
...(
inds
),
shape_
.
ndim
());
for
(
int
i
=
0
;
i
<
sizeof
...(
inds
);
++
i
)
{
TV_REQUIRE
(
idxes
[
i
]
>=
0
&&
idxes
[
i
]
<
shape_
[
i
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
i
,
idxes
[
i
],
shape_
[
i
]);
}
#endif
constexpr
int
Ndim
=
sizeof
...(
Inds
);
return
ptr_
[
ArrayIndexRowMajor
<
Ndim
,
Ndim
>::
runShape
(
shape_
,
0
,
inds
...)];
}
// Read-only element access with an arbitrary number of indices.
// Same contract as the mutable overload: the offset is produced by
// ArrayIndexRowMajor::runShape from shape_ (stride_ is not consulted), and
// TV_DEBUG builds verify the index count and per-axis range.
template <class... Inds>
TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
  static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
#if defined TV_DEBUG
  int idxes[sizeof...(Inds)]{int(inds)...};
  TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
             "you provide %d indexes, but dim is %d\n", sizeof...(inds),
             shape_.ndim());
  for (int i = 0; i < sizeof...(inds); ++i) {
    TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
               "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
               shape_[i]);
  }
#endif
  constexpr int kNumIndices = sizeof...(Inds);
  using Indexer = ArrayIndexRowMajor<kNumIndices, kNumIndices>;
  return ptr_[Indexer::runShape(shape_, 0, inds...)];
}
// Mutable access to the single element of a zero-dimensional view.
// Returns the element ptr_ points at. TV_DEBUG builds reject an empty view
// and a view whose shape has a non-zero number of dimensions.
TV_HOST_DEVICE_INLINE T &operator()() {
  static_assert(Rank == -1 || 0 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
             "\n");
  TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
             shape_.ndim());
#endif
  return *ptr_;
}
// Read-only access to the single element of a zero-dimensional view.
// Returns the element ptr_ points at. TV_DEBUG builds reject an empty view
// and a view whose shape has a non-zero number of dimensions.
TV_HOST_DEVICE_INLINE const T &operator()() const {
  static_assert(Rank == -1 || 0 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
             "\n");
  TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
             shape_.ndim());
#endif
  return *ptr_;
}
// Mutable element access for a one-dimensional view: returns ptr_[i1].
// With TV_DEBUG defined, the dimensionality and the index range are checked.
//
// The index is converted with int(i1) before it is handed to the "%d"
// conversion of the diagnostic, exactly like every other fixed-arity
// overload does; passing an arbitrary T1 (e.g. a 64-bit index) straight
// through the varargs would not match the format specifier.
template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
  static_assert(Rank == -1 || 1 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
#endif
  return ptr_[i1];
}
// Mutable element access for a two-dimensional view.
// The offset is i1 * shape_[1] + i2, computed from shape_ only (stride_ is
// not consulted). TV_DEBUG builds check dimensionality and index ranges.
template <class T1, class T2>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
  static_assert(Rank == -1 || 2 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
  TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
             "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
#endif
  const auto offset = i1 * shape_[1] + i2;
  return ptr_[offset];
}
// Mutable element access for a three-dimensional view.
// The offset is (i1 * shape_[1] + i2) * shape_[2] + i3, computed from shape_
// only. TV_DEBUG builds check dimensionality and index ranges.
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
  static_assert(Rank == -1 || 3 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
  TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
             "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
  TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
             "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
#endif
  const auto row = i1 * shape_[1] + i2;
  const auto offset = row * shape_[2] + i3;
  return ptr_[offset];
}
// Mutable element access for a four-dimensional view.
// The offset is ((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4,
// computed from shape_ only. TV_DEBUG builds check dimensionality and index
// ranges.
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
  static_assert(Rank == -1 || 4 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
  TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
             "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
  TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
             "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
  TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
             "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
#endif
  const auto row = i1 * shape_[1] + i2;
  const auto plane = row * shape_[2] + i3;
  const auto offset = plane * shape_[3] + i4;
  return ptr_[offset];
}
// Read-only element access for a one-dimensional view: returns ptr_[i1].
// TV_DEBUG builds check dimensionality and the index range.
template <class T1>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
  static_assert(Rank == -1 || 1 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
#endif
  return *(ptr_ + i1);
}
// Read-only element access for a two-dimensional view.
// The offset is i1 * shape_[1] + i2, computed from shape_ only.
// TV_DEBUG builds check dimensionality and index ranges.
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
  static_assert(Rank == -1 || 2 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
  TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
             "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
#endif
  const auto offset = i1 * shape_[1] + i2;
  return ptr_[offset];
}
// Read-only element access for a three-dimensional view.
// The offset is (i1 * shape_[1] + i2) * shape_[2] + i3, computed from shape_
// only. TV_DEBUG builds check dimensionality and index ranges.
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
  static_assert(Rank == -1 || 3 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
  TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
             "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
  TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
             "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
#endif
  const auto row = i1 * shape_[1] + i2;
  const auto offset = row * shape_[2] + i3;
  return ptr_[offset];
}
// Read-only element access for a four-dimensional view.
// The offset is ((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4,
// computed from shape_ only. TV_DEBUG builds check dimensionality and index
// ranges.
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
  static_assert(Rank == -1 || 4 == Rank, "error");
#if defined TV_DEBUG
  TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
             shape_.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
  TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
             "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
  TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
             "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
  TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
             "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
#endif
  const auto row = i1 * shape_[1] + i2;
  const auto plane = row * shape_[2] + i3;
  const auto offset = plane * shape_[3] + i4;
  return ptr_[offset];
}
// Mutable access by flat element index: returns ptr_[idx] without using
// shape_ or stride_. TV_DEBUG builds require 0 <= idx < size().
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
  TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
             int(idx), size());
#endif
  return *(ptr_ + idx);
}
// Read-only access by flat element index: returns ptr_[idx] without using
// shape_ or stride_. TV_DEBUG builds require 0 <= idx < size().
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
  TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
             int(idx), size());
#endif
  return *(ptr_ + idx);
}
// Returns an accessor of rank Rank - 1 positioned at entry `idx` of the
// first dimension: the data pointer is advanced by stride_[0] * idx and the
// accessor is handed the remaining strides (stride_.data() + 1).
// Only available when Rank > 1.
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
accessor(Tindex idx) {
  static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
  using sub_accessor_t = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
  return sub_accessor_t(ptr_ + stride_[0] * idx, stride_.data() + 1);
}
// Returns an accessor of the full rank, built from the data pointer and the
// stride array of this view. Only available when Rank > 0.
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() {
  static_assert(Rank > 0, "rank must higher than zero");
  using accessor_t = TensorAccesser<T, Rank, PtrTraits, Tindex>;
  return accessor_t(ptr_, stride_.data());
}
// Const overload: returns an accessor of rank Rank - 1 positioned at entry
// `idx` of the first dimension (pointer advanced by stride_[0] * idx,
// remaining strides starting at stride_.data() + 1).
// Only available when Rank > 1.
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
accessor(Tindex idx) const {
  static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
  using sub_accessor_t = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
  return sub_accessor_t(ptr_ + stride_[0] * idx, stride_.data() + 1);
}
// Const overload: returns an accessor of the full rank, built from the data
// pointer and the stride array of this view. Only available when Rank > 0.
//
// The diagnostic text belongs to the static_assert. It used to be passed as
// a third constructor argument, while every other accessor overload builds
// TensorAccesser from exactly (pointer, strides).
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank, PtrTraits, Tindex>
accessor() const {
  static_assert(Rank > 0, "rank must higher than zero");
  return TensorAccesser<T, Rank, PtrTraits, Tindex>(ptr_, stride_.data());
}
// True when the view has no backing memory (ptr_ is nullptr).
TV_HOST_DEVICE_INLINE bool empty() const {
  const bool has_no_storage = (ptr_ == nullptr);
  return has_no_storage;
}
// Returns the stored data pointer (nullptr for an empty view).
TV_HOST_DEVICE_INLINE ptr_t data() {
  return this->ptr_;
}
// Const overload: returns the stored data pointer. Note that `const ptr_t`
// makes the returned pointer value const; it does not add const to the
// pointee.
TV_HOST_DEVICE_INLINE const ptr_t data() const {
  return this->ptr_;
}
// Read-only reference to the shape stored in this view.
TV_HOST_DEVICE_INLINE const tv_shape_t &shape() const {
  return this->shape_;
}
// Read-only reference to the strides stored in this view.
TV_HOST_DEVICE_INLINE const tv_shape_t &stride() const {
  return this->stride_;
}
// Extent of dimension `idx`, i.e. shape_[idx] converted to int.
// No range check is performed on `idx` here.
TV_HOST_DEVICE_INLINE int dim(int idx) const {
  const int extent = shape_[idx];
  return extent;
}
// Number of dimensions reported by the stored shape, converted to int.
TV_HOST_DEVICE_INLINE int ndim() const {
  const int num_dims = shape_.ndim();
  return num_dims;
}
// Reinterprets the same memory under a new shape given as a list of extents.
// At most one extent may be -1; it is replaced by size() divided by the
// product of the remaining extents. The resulting element count must equal
// size() (checked with TV_ASSERT). The returned view is built with the
// two-argument constructor, so its strides are derived from the new shape.
//
// The result rank is spelled `int(sizeof...(Inds))`. Without the cast the
// conditional `Rank == -1 ? -1 : sizeof...(Inds)` has type size_t, so for a
// dynamic-rank view the template argument would be SIZE_MAX narrowed to
// int, which is not a valid converted constant expression. For fixed-rank
// views the resulting type is unchanged.
template <class... Inds>
TV_HOST_DEVICE_INLINE
    TensorView<T, Rank == -1 ? -1 : int(sizeof...(Inds)), PtrTraits, Tindex>
    view(Inds... newShapes) const {
  ShapeBase<Rank == -1 ? TV_MAX_DIM : sizeof...(Inds), Tindex> shapes{
      int(newShapes)...};
  for (size_t i = 0; i < sizeof...(newShapes); ++i) {
    if (shapes[i] == -1) {
      // Order matters: neutralise the placeholder first so that
      // shapes.size() yields the product of the explicitly given extents.
      shapes[i] = 1;
      shapes[i] = size() / shapes.size();
      break;
    }
  }
  TV_ASSERT(shapes.size() == size());
  return TensorView<T, Rank == -1 ? -1 : int(sizeof...(Inds)), PtrTraits,
                    Tindex>(ptr_, shapes);
}
// Reinterprets the same memory under the given dynamic-rank shape.
// The element count of `shapes` must equal size() (checked with TV_ASSERT).
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
view(Shape shapes) const {
  TV_ASSERT(shapes.size() == size());
  using dynamic_view_t = TensorView<T, -1, PtrTraits, Tindex>;
  return dynamic_view_t(ptr_, shapes);
}
// Returns a dynamic-rank view over the same memory whose shape is
// shape_.squeeze().
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex> squeeze() const {
  using dynamic_view_t = TensorView<T, -1, PtrTraits, Tindex>;
  return dynamic_view_t(ptr_, shape_.squeeze());
}
// Returns a view over the same memory whose shape is shape_.squeeze<N>(dim).
// A fixed-rank view yields rank Rank - 1; a dynamic-rank view stays dynamic.
//
// shape_ has a type that depends on the class template parameters, so the
// member template call needs the `template` disambiguator. Without it
// `shape_.squeeze < ...` is parsed as a less-than comparison and the
// function cannot be instantiated by a conforming compiler.
TV_HOST_DEVICE_INLINE
TensorView<T, Rank == -1 ? -1 : Rank - 1, PtrTraits, Tindex>
squeeze(int dim) const {
  return TensorView<T, Rank == -1 ? -1 : Rank - 1, PtrTraits, Tindex>(
      ptr_,
      shape_.template squeeze<(Rank == -1 ? TV_MAX_DIM : Rank - 1)>(dim));
}
// Element count of the view, as reported by shape_.size().
TV_HOST_DEVICE_INLINE size_t size() const {
  const size_t num_elements = shape_.size();
  return num_elements;
}
// Returns the sub-tensor selected by fixing the leading indices
// (id, ints...). The start coordinate is padded with zeros up to ndim(), the
// data pointer is advanced by rowArrayIdx(shape_, start), and the new shape
// is shape_.subshape(number of fixed indices).
// NOTE(review): the returned object is spelled TensorView<T, Rank, ...>
// while the declared return type uses rank -1 - confirm the conversion
// exists for fixed-rank views.
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(int id, Integers... ints) {
  tv_shape_t start = {id, ints...};
  int axis = 1 + sizeof...(ints);
  while (axis < ndim()) {
    start.push_back(0);
    ++axis;
  }
  using view_t = TensorView<T, Rank, PtrTraits, Tindex>;
  return view_t(ptr_ + rowArrayIdx(shape_, start),
                shape_.subshape(sizeof...(ints) + 1));
}
// Const overload: returns the sub-tensor selected by fixing the leading
// indices (id, ints...). The start coordinate is padded with zeros up to
// ndim(), the data pointer is advanced by rowArrayIdx(shape_, start), and
// the new shape is shape_.subshape(number of fixed indices).
// NOTE(review): the returned object is spelled TensorView<T, Rank, ...>
// while the declared return type uses rank -1 - confirm the conversion
// exists for fixed-rank views.
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(int id, Integers... ints) const {
  tv_shape_t start = {id, ints...};
  int axis = 1 + sizeof...(ints);
  while (axis < ndim()) {
    start.push_back(0);
    ++axis;
  }
  using view_t = TensorView<T, Rank, PtrTraits, Tindex>;
  return view_t(ptr_ + rowArrayIdx(shape_, start),
                shape_.subshape(sizeof...(ints) + 1));
}
// Returns the sub-tensor selected by fixing the leading indices listed in
// `ids`. The start coordinate is `ids` padded with zeros up to ndim(); the
// data pointer is advanced by rowArrayIdx(shape_, start) and the new shape
// is shape_.subshape(ids.size()).
// NOTE(review): the returned object is spelled TensorView<T, Rank, ...>
// while the declared return type uses rank -1 - confirm the conversion
// exists for fixed-rank views.
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(SimpleVector<int> ids) const {
  Shape start = ids;
  int axis = ids.size();
  while (axis < ndim()) {
    start.push_back(0);
    ++axis;
  }
  using view_t = TensorView<T, Rank, PtrTraits, Tindex>;
  return view_t(ptr_ + rowArrayIdx(shape_, start),
                shape_.subshape(ids.size()));
}
template
<
typename
Os
>
std
::
string
repr
(
Os
&
ss
)
const
{
if
(
empty
())
return
""
;
if
(
shape_
.
ndim
()
==
0
)
{
ss
<<
"Tensor["
<<
type_s
<
T
>
<<
"]"
<<
std
::
endl
;
ss
<<
*
ptr_
;
return
ss
.
str
();
}
SimpleVector
<
int64_t
,
TV_MAX_DIM
>
prev
(
ndim
(),
-
1
);
SimpleVector
<
int64_t
,
TV_MAX_DIM
>
nd_index
(
ndim
());
SimpleVector
<
int64_t
,
TV_MAX_DIM
>
_shape
;
for
(
auto
s
:
shape
())
{
_shape
.
push_back
(
s
);
}
ss
<<
"Tensor["
<<
type_s
<
T
>
<<
"]: shape="
<<
shape
()
<<
", stride="
<<
stride
()
<<
std
::
endl
;
auto
ndimValue
=
ndim
();
for
(
int64_t
i
=
0
;
i
<
int64_t
(
size
());
++
i
)
{
rowArrayIdxInv
(
i
,
nd_index
.
data
(),
_shape
.
data
(),
ndimValue
);
bool
newline
=
false
;
int
end_count
=
0
;
for
(
int
j
=
0
;
j
<
ndimValue
;
++
j
)
{
if
(
nd_index
[
j
]
!=
prev
[
j
]
&&
nd_index
[
j
]
==
0
&&
prev
[
j
]
!=
0
&&
prev
[
j
]
!=
-
1
)
{
ss
<<
"]"
;
++
end_count
;
newline
=
true
;
}
}
if
(
prev
[
0
]
==
-
1
)
{
end_count
=
ndimValue
;
}
if
(
newline
)
{
ss
<<
"
\n
"
;
}
int
starts_count
=
0
;
for
(
int
j
=
0
;
j
<
ndimValue
;
++
j
)
{
if
(
nd_index
[
j
]
!=
prev
[
j
]
&&
nd_index
[
j
]
==
0
&&
prev
[
j
]
!=
0
)
{
++
starts_count
;
}
}
if
(
starts_count
>
0
)
{
for
(
int
j
=
0
;
j
<
ndimValue
-
end_count
;
++
j
)
{
ss
<<
" "
;
}
for
(
int
j
=
0
;
j
<
starts_count
;
++
j
)
{
ss
<<
"["
;
}
}
if
(
std
::
is_same
<
T
,
uint8_t
>::
value
||
std
::
is_same
<
T
,
const
uint8_t
>::
value
)
{
ss
<<
unsigned
((
*
this
)[
i
]);
}
else
{
ss
<<
(
*
this
)[
i
];
}
if
(
nd_index
[
ndimValue
-
1
]
!=
_shape
[
ndimValue
-
1
]
-
1
)
{
ss
<<
","
;
}
for
(
int
j
=
0
;
j
<
ndimValue
;
++
j
)
{
prev
[
j
]
=
nd_index
[
j
];
}
}
for
(
int
j
=
0
;
j
<
ndimValue
;
++
j
)
{
ss
<<
"]"
;
}
return
ss
.
str
();
}
// Convenience overload: renders the tensor into a fresh std::ostringstream
// and returns the resulting text.
std::string repr() const {
  std::ostringstream buffer;
  return repr(buffer);
}
protected:
// Turns a single value into Slice{int(s), -1, -1}.
template <typename T1>
TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
  const int first = int(s);
  return Slice{first, -1, -1};
}
// Overload for arguments that already are a Slice: returns a copy.
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const {
  Slice copy(s);
  return copy;
}
// Start of the viewed memory. nullptr marks an empty view (see empty()).
ptr_t
ptr_
=
nullptr
;
// Extent of every dimension; drives ndim(), size() and operator() indexing.
tv_shape_t
shape_
;
// Per-dimension strides: either derived from the shape via stride_rowmajor()
// or supplied to the three-argument constructor. Used by accessor().
tv_shape_t
stride_
;
};
// Wraps a std::vector as a one-dimensional TensorView of arr.size()
// elements. The view points at arr.data() and owns nothing.
template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
  using view_t = TensorView<T>;
  return view_t(arr.data(), {arr.size()});
}
// Wraps a std::vector as a TensorView with the given shape. The product of
// the shape must equal arr.size() (checked with TV_ASSERT_INVALID_ARG).
// The view points at arr.data() and owns nothing.
template <typename T>
TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
  TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
  using view_t = TensorView<T>;
  return view_t(arr.data(), shape);
}
// Wraps a const std::vector as a one-dimensional read-only TensorView of
// arr.size() elements. The view points at arr.data() and owns nothing.
template <typename T>
TensorView<const T> vector2tv(const std::vector<T> &arr) {
  using const_view_t = TensorView<const T>;
  return const_view_t(arr.data(), {arr.size()});
}
// Streams the textual representation (dt.repr()) of a TensorView into `os`
// and returns `os`.
template <typename Os, typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
Os &operator<<(Os &os, const TensorView<T, Rank, PtrTraits, Tindex> &dt) {
  const std::string text = dt.repr();
  os << text;
  return os;
}
// Overload for views over const elements: streams dt.repr() into `os` and
// returns `os`.
template <typename Os, typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
Os &operator<<(Os &os,
               const TensorView<const T, Rank, PtrTraits, Tindex> &dt) {
  const std::string text = dt.repr();
  os << text;
  return os;
}
namespace detail {

// Maps an element type to the printf conversion used when printing tensors.
// The primary template is only declared, so asking for a type without a
// specialization below is a compile-time error.
template <typename T> struct TypePrintfFormat;

// Floating point: two decimals.
template <> struct TypePrintfFormat<float> {
  static constexpr const char *value = "%.2f";
};
template <> struct TypePrintfFormat<double> {
  static constexpr const char *value = "%.2f";
};

// Signed integers up to 32 bits.
template <> struct TypePrintfFormat<int8_t> {
  static constexpr const char *value = "%d";
};
template <> struct TypePrintfFormat<int16_t> {
  static constexpr const char *value = "%d";
};
template <> struct TypePrintfFormat<int32_t> {
  static constexpr const char *value = "%d";
};

// Unsigned integers up to 32 bits.
template <> struct TypePrintfFormat<uint8_t> {
  static constexpr const char *value = "%u";
};
template <> struct TypePrintfFormat<uint16_t> {
  static constexpr const char *value = "%u";
};
template <> struct TypePrintfFormat<uint32_t> {
  static constexpr const char *value = "%u";
};

// 64-bit integers.
template <> struct TypePrintfFormat<int64_t> {
  static constexpr const char *value = "%ld";
};
template <> struct TypePrintfFormat<uint64_t> {
  static constexpr const char *value = "%lu";
};

// bool is printed as an integer.
template <> struct TypePrintfFormat<bool> {
  static constexpr const char *value = "%d";
};

// Shorthand for TypePrintfFormat<T>::value.
template <typename T>
constexpr const char *type_printf_format_v = TypePrintfFormat<T>::value;

} // namespace detail
// Prints a tensor with printf, element by element, using `format` for every
// element. Intended for printing a tensor from inside a CUDA kernel.
//  - empty view: prints nothing;
//  - zero-dimensional view: prints the single element and a newline;
//  - otherwise: nested bracketed output, mirroring TensorView::repr().
//
// Changes relative to the previous revision:
//  * dimensions that start at an element are introduced with "[" (as repr()
//    does); "]" was printed there, producing unbalanced output;
//  * the zero-dimensional branch reads the element through data() instead of
//    tensor(), whose static_assert rejects every fixed rank and therefore
//    prevented this template from being instantiated for fixed-rank views;
//  * the loop bound is converted to int64_t to avoid a signed/unsigned
//    comparison, as repr() does.
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
TV_HOST_DEVICE void
printTensorView(const TensorView<T, Rank, PtrTraits, Tindex> &tensor,
                const char *format) {
  // used to print tensor in cuda kernel.
  if (tensor.empty())
    return;
  if (tensor.ndim() == 0) {
    // Non-empty was checked above, so the pointer is valid here.
    printf(format, *tensor.data());
    printf("\n");
    return;
  }
  // prev: multi-index of the previously printed element (-1 = none yet).
  SimpleVector<int64_t, TV_MAX_DIM> prev(tensor.ndim(), -1);
  SimpleVector<int64_t, TV_MAX_DIM> nd_index(tensor.ndim());
  SimpleVector<int64_t, TV_MAX_DIM> shape(tensor.shape());

  auto ndim = tensor.ndim();
  for (int64_t i = 0; i < int64_t(tensor.size()); ++i) {
    rowArrayIdxInv(i, nd_index.data(), shape.data(), ndim);
    // Close every dimension whose index just wrapped back to 0.
    bool newline = false;
    int end_count = 0;
    for (int j = 0; j < ndim; ++j) {
      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
          prev[j] != -1) {
        printf("]");
        ++end_count;
        newline = true;
      }
    }
    if (prev[0] == -1) {
      // Very first element: treat all dimensions as freshly opened.
      end_count = ndim;
    }
    if (newline) {
      printf("\n");
    }
    // Count the dimensions that start at this element.
    int starts_count = 0;
    for (int j = 0; j < ndim; ++j) {
      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
        ++starts_count;
      }
    }
    if (starts_count > 0) {
      for (int j = 0; j < ndim - end_count; ++j) {
        printf(" ");
      }
      for (int j = 0; j < starts_count; ++j) {
        printf("[");
      }
    }
    printf(format, tensor[i]);
    // Separator after every element except the last of the innermost row.
    if (nd_index[ndim - 1] != shape[ndim - 1] - 1) {
      printf(",");
    }
    for (int j = 0; j < ndim; ++j) {
      prev[j] = nd_index[j];
    }
  }
  for (int j = 0; j < ndim; ++j) {
    printf("]");
  }
  printf("\n");
}
// Prints a tensor using the printf conversion registered for its element
// type in detail::TypePrintfFormat (const is stripped before the lookup).
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
TV_HOST_DEVICE void
printTensorView(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  using Traw = std::remove_const_t<T>;
  const char *element_format = detail::type_printf_format_v<Traw>;
  printTensorView(tensor, element_format);
}
// Prints the memory at `ptr` interpreted as a tensor of the given shape,
// using the printf conversion registered for T in detail::TypePrintfFormat.
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
  using Traw = std::remove_const_t<T>;
  const char *element_format = detail::type_printf_format_v<Traw>;
  printTensorView(TensorView<const T>(ptr, shape), element_format);
}
// Prints the memory at `ptr` interpreted as a tensor of the given shape,
// formatting every element with the caller-supplied printf `format`.
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
                                    const char *format) {
  const TensorView<const T> wrapped(ptr, shape);
  printTensorView(wrapped, format);
}
#ifdef TV_CUDA
// DEVICE_RESET is executed by tv::check() right before the process exits.
// It expands to a cudaDeviceReset() call when __DRIVER_TYPES_H__ is defined
// (NOTE(review): presumably the include guard of CUDA's driver_types.h -
// confirm) and to nothing otherwise. An already existing definition of
// DEVICE_RESET is left untouched in both branches.
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
// Aborts the process when `result` is non-zero.
// On failure the call site (file:line), the numeric code and the text of the
// failing expression (`func`) are written to stderr, DEVICE_RESET is
// executed, and the process exits with EXIT_FAILURE. A zero result is a
// no-op.
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (!result) {
    return;
  }
  fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
          static_cast<unsigned int>(result), func);
  DEVICE_RESET
  // Make sure we call CUDA Device Reset before exiting
  exit(EXIT_FAILURE);
}
// Wraps a CUDA call: forwards its result to tv::check together with the
// stringified expression (#val) and the file/line of the call site. Because
// the expression text ends up in the error message, the argument text is
// part of the observable output.
#define checkCudaErrors(val) tv::check((val), #val, __FILE__, __LINE__)
// Copies `size` elements of type T from `src` to `dst` by issuing
// cudaMemcpyAsync with cudaMemcpyHostToDevice on stream `s`.
// Failures are routed through checkCudaErrors. The call expression is kept
// verbatim because the macro stringifies it for the error message.
template <typename T>
void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
  checkCudaErrors(
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
}
// TensorView overload (read-only source): copies min(dst.size(), src.size())
// elements through the pointer-based host2dev on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2dev(dst.data(), src.data(), count, s);
}
// TensorView overload (mutable source type): copies
// min(dst.size(), src.size()) elements through the pointer-based host2dev on
// stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2dev(dst.data(), src.data(), count, s);
}
// Copies `size` elements of type T from `src` to `dst` with cudaMemcpy and
// cudaMemcpyHostToDevice (no stream argument).
// Failures are routed through checkCudaErrors. The call expression is kept
// verbatim because the macro stringifies it for the error message.
template <typename T>
void host2dev_sync(T *dst, const T *src, size_t size) {
  checkCudaErrors(
      cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
}
// TensorView overload (read-only source): copies min(dst.size(), src.size())
// elements through the pointer-based host2dev_sync.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
                   const TensorView<const T, Rank, PtrTraits2, Tindex2> src) {
  const auto count = std::min(dst.size(), src.size());
  host2dev_sync(dst.data(), src.data(), count);
}
// TensorView overload (mutable source type): copies
// min(dst.size(), src.size()) elements through the pointer-based
// host2dev_sync.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
                   const TensorView<T, Rank, PtrTraits2, Tindex2> src) {
  const auto count = std::min(dst.size(), src.size());
  host2dev_sync(dst.data(), src.data(), count);
}
// Copies `size` elements of type T from `src` to `dst` by issuing
// cudaMemcpyAsync with cudaMemcpyDeviceToHost on stream `s`.
// Failures are routed through checkCudaErrors. The call expression is kept
// verbatim because the macro stringifies it for the error message.
template <typename T>
void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
  checkCudaErrors(
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
}
// TensorView overload (read-only source): copies min(dst.size(), src.size())
// elements through the pointer-based dev2host on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2host(dst.data(), src.data(), count, s);
}
// TensorView overload (mutable source type): copies
// min(dst.size(), src.size()) elements through the pointer-based dev2host on
// stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2host(dst.data(), src.data(), count, s);
}
// Copies `size` elements of type T from `src` to `dst` by issuing
// cudaMemcpyAsync with cudaMemcpyDeviceToDevice on stream `s`.
// Failures are routed through checkCudaErrors. The call expression is kept
// verbatim because the macro stringifies it for the error message.
template <typename T>
void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
  checkCudaErrors(
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
}
// TensorView overload (read-only source): copies min(dst.size(), src.size())
// elements through the pointer-based dev2dev on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
             const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
             cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2dev(dst.data(), src.data(), count, s);
}
// TensorView overload (mutable source type): copies
// min(dst.size(), src.size()) elements through the pointer-based dev2dev on
// stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
             const TensorView<T, Rank, PtrTraits2, Tindex2> src,
             cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  dev2dev(dst.data(), src.data(), count, s);
}
// Copies `size` elements of type T from `src` to `dst` by issuing
// cudaMemcpyAsync with cudaMemcpyHostToHost on stream `s`.
// Failures are routed through checkCudaErrors. The call expression is kept
// verbatim because the macro stringifies it for the error message.
template <typename T>
void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
  checkCudaErrors(
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
}
// TensorView overload (read-only source): copies min(dst.size(), src.size())
// elements through the pointer-based host2host on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
               const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
               cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2host(dst.data(), src.data(), count, s);
}
// TensorView overload (mutable source type): copies
// min(dst.size(), src.size()) elements through the pointer-based host2host
// on stream `s`.
template <typename T, int Rank, template <class> class PtrTraits1,
          template <class> class PtrTraits2, typename Tindex1,
          typename Tindex2>
void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
               const TensorView<T, Rank, PtrTraits2, Tindex2> src,
               cudaStream_t s = 0) {
  const auto count = std::min(dst.size(), src.size());
  host2host(dst.data(), src.data(), count, s);
}
// Sets all tensor.size() * sizeof(T) bytes of the tensor to zero with
// cudaMemset. Failures are routed through checkCudaErrors. The call
// expression is kept verbatim because the macro stringifies it for the
// error message.
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
}
// Stream overload: sets all tensor.size() * sizeof(T) bytes of the tensor to
// zero with cudaMemsetAsync on stream `s`. Failures are routed through
// checkCudaErrors. The call expression is kept verbatim because the macro
// stringifies it for the error message.
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor, cudaStream_t s) {
  checkCudaErrors(
      cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
}
// Assigns 0 to every one of the tensor.size() elements, starting at
// tensor.data(), using std::fill.
template <typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
void zero_host(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  auto first = tensor.data();
  auto last = tensor.data() + tensor.size();
  std::fill(first, last, 0);
}
#endif
}
// namespace tv
\ No newline at end of file
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment