Commit 9824c5f1 authored by rusty1s
parents 46040140 bdfdb070
@@ -18,7 +18,7 @@ before_install:
- export CXX="g++-4.9"
install:
- if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.1-cp27-cp27mu-linux_x86_64.whl; fi
- if [[ $TRAVIS_PYTHON_VERSION == 3.5 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-linux_x86_64.whl; fi
- if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.1-cp36-cp36m-linux_x86_64.whl; fi
- pip install pycodestyle
- pip install flake8
@@ -21,7 +21,7 @@ The operator works on all floating point data types and is implemented both for
## Installation
-Ensure that at least PyTorch 0.4.1 is installed and verify that `cuda/bin` and `cuda/install` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
+Ensure that at least PyTorch 0.4.1 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
@@ -31,7 +31,7 @@ $ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
->>> /usr/local/cuda/install:...
+>>> /usr/local/cuda/include:...
```
Then run:
@@ -87,7 +87,7 @@ The kernel function is defined over the weighted B-spline tensor product basis,
### Returns
-* **out** *(Tensor)* - out node features of shape `(number_of_nodes x out_channels)`.
+* **out** *(Tensor)* - Out node features of shape `(number_of_nodes x out_channels)`.
### Example
#include "THCBasis.cu"
#include "THCWeighting.cu"
#ifndef THC_INC
#define THC_INC
#include "THCBasis.h"
#include "THCWeighting.h"
#endif // THC_INC
#ifndef THC_ATOMICS_INC
#define THC_ATOMICS_INC
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// CAS loop: reinterpret the double as a 64-bit integer, perform the addition
// in double precision, and retry until no other thread modified the value.
static inline __device__ void atomicAdd(double *address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
}
#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
// Host pass-through stub: never executed, present only so host compilation
// sees a matching atomicAdd(double *, double) declaration.
static inline __device__ void atomicAdd(double *address, double val) {}
#endif
#ifdef CUDA_HALF_TENSOR
// Half atomics CAS on the aligned 32-bit word containing the 16-bit value;
// ((size_t) address & 2) selects the high or low half word within it.
static inline __device__ void atomicAdd(half *address, half val) {
unsigned int * address_as_ui = (unsigned int *) ((char *) address - ((size_t) address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
do {
assumed = old;
#if CUDA_VERSION < 9000
half hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
hsum = THCNumerics<half>::add(hsum, val);
#else // CUDA_VERSION < 9000
__half_raw hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
half tmpres = THCNumerics<half>::add(hsum, val);
hsum = __half_raw(tmpres);
#endif // CUDA_VERSION
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
}
#endif // CUDA_HALF_TENSOR
#endif // THC_ATOMICS_INC
#include "THCBasis.h"
#include "THCBasisForward.cuh"
#include "THCBasisBackward.cuh"
template<typename T>
__global__ void linearBasisForwardKernel(TensorInfo<T> basis, TensorInfo<int64_t> weightIndex,
TensorInfo<T> pseudo, int64_t *kernelSize,
uint8_t *isOpenSpline, ptrdiff_t n) {
THC_TENSOR_BASIS_FORWARD_KERNEL(1, basis, weightIndex, pseudo, kernelSize, isOpenSpline, n,
BasisForward<T>::linear(v, kMod))
}
template<typename T>
__global__ void quadraticBasisForwardKernel(TensorInfo<T> basis, TensorInfo<int64_t> weightIndex,
TensorInfo<T> pseudo, int64_t *kernelSize,
uint8_t *isOpenSpline, ptrdiff_t n) {
THC_TENSOR_BASIS_FORWARD_KERNEL(2, basis, weightIndex, pseudo, kernelSize, isOpenSpline, n,
BasisForward<T>::quadratic(v, kMod))
}
template<typename T>
__global__ void cubicBasisForwardKernel(TensorInfo<T> basis, TensorInfo<int64_t> weightIndex,
TensorInfo<T> pseudo, int64_t *kernelSize,
uint8_t *isOpenSpline, ptrdiff_t n) {
THC_TENSOR_BASIS_FORWARD_KERNEL(3, basis, weightIndex, pseudo, kernelSize, isOpenSpline, n,
BasisForward<T>::cubic(v, kMod))
}
template<typename T>
__global__ void linearBasisBackwardKernel(TensorInfo<T> self, TensorInfo<T> gradBasis,
TensorInfo<T> pseudo, int64_t *kernelSize,
uint8_t *isOpenSpline, ptrdiff_t n) {
THC_TENSOR_BASIS_BACKWARD_KERNEL(1, self, gradBasis, pseudo, kernelSize, isOpenSpline, n,
BasisForward<T>::linear(v, kMod),
BasisBackward<T>::linear(v, kMod))
}
template<typename T>
__global__ void quadraticBasisBackwardKernel(TensorInfo<T> self, TensorInfo<T> gradBasis,
TensorInfo<T> pseudo, int64_t *kernelSize,
uint8_t *isOpenSpline, ptrdiff_t n) {
THC_TENSOR_BASIS_BACKWARD_KERNEL(2, self, gradBasis, pseudo, kernelSize, isOpenSpline, n,
BasisForward<T>::quadratic(v, kMod),
BasisBackward<T>::quadratic(v, kMod))
}
template<typename T>
__global__ void cubicBasisBackwardKernel(TensorInfo<T> self, TensorInfo<T> gradBasis,
TensorInfo<T> pseudo, int64_t *kernelSize,
uint8_t *isOpenSpline, ptrdiff_t n) {
THC_TENSOR_BASIS_BACKWARD_KERNEL(3, self, gradBasis, pseudo, kernelSize, isOpenSpline, n,
BasisForward<T>::cubic(v, kMod),
BasisBackward<T>::cubic(v, kMod))
}
#include "generic/THCBasis.cu"
#include "THC/THCGenerateFloatTypes.h"
#ifndef THC_BASIS_INC
#define THC_BASIS_INC
#include <THC/THC.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
#include "generic/THCBasis.h"
#include "THC/THCGenerateFloatTypes.h"
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // THC_BASIS_INC
#ifndef THC_BASIS_BACKWARD_INC
#define THC_BASIS_BACKWARD_INC
#include "common.cuh"
#include "THCNumerics.cuh"
#define THC_TENSOR_BASIS_BACKWARD(NAME, state, self, gradBasis, pseudo, kernelSize, \
isOpenSpline) { \
THCAssertSameGPU(THCTensor_(checkGPU)(state, 5, self, gradBasis, pseudo, kernelSize, \
isOpenSpline)); \
\
TensorInfo<real> selfInfo = THCTensor_(getTensorInfo)(state, self); \
TensorInfo<real> gradBasisInfo = THCTensor_(getTensorInfo)(state, gradBasis); \
TensorInfo<real> pseudoInfo = THCTensor_(getTensorInfo)(state, pseudo); \
int64_t *kernelSizeData = THCudaLongTensor_data(state, kernelSize); \
uint8_t *isOpenSplineData = THCudaByteTensor_data(state, isOpenSpline); \
\
KERNEL_REAL_RUN(NAME, THCTensor_(nElement)(state, pseudo), selfInfo, gradBasisInfo, pseudoInfo, \
kernelSizeData, isOpenSplineData); \
}
#define THC_TENSOR_BASIS_BACKWARD_KERNEL(M, self, gradBasis, pseudo, kernelSize, isOpenSpline, \
N, CODE, GRAD_CODE) { \
KERNEL_LOOP(i, N) { \
ptrdiff_t e = i / self.size[1], d = i % self.size[1], s, dIt, dOther; \
int64_t kMod; \
T g = ScalarConvert<int, T>::to(0), v, tmp; \
for (s = 0; s < gradBasis.size[1]; s++) { \
kMod = (s / (ptrdiff_t) pow((float) M + 1, (float) d)) % (M + 1); \
v = pseudo.data[e * pseudo.stride[0] + d * pseudo.stride[1]]; \
v = THCNumerics<T>::mul(v, ScalarConvert<int64_t, T>::to(kernelSize[d] - M * isOpenSpline[d])); \
v = THCNumerics<T>::sub(v, ScalarConvert<int64_t, T>::to(ScalarConvert<T, int64_t>::to(v))); \
v = GRAD_CODE; \
tmp = v; \
\
for (dIt = 1; dIt < pseudo.size[1]; dIt++) { \
dOther = dIt - (d >= dIt); \
kMod = (s / (ptrdiff_t) pow((float) M + 1, (float) dOther)) % (M + 1); \
v = pseudo.data[e * pseudo.stride[0] + dOther * pseudo.stride[1]]; \
v = THCNumerics<T>::mul(v, ScalarConvert<int64_t, T>::to(kernelSize[dOther] - M * isOpenSpline[dOther])); \
v = THCNumerics<T>::sub(v, ScalarConvert<int64_t, T>::to(ScalarConvert<T, int64_t>::to(v))); \
v = CODE; \
tmp = THCNumerics<T>::mul(tmp, v); \
} \
\
tmp = THCNumerics<T>::mul(tmp, gradBasis.data[e * gradBasis.stride[0] + s * gradBasis.stride[1]]); \
g = THCNumerics<T>::add(g, tmp); \
} \
g = THCNumerics<T>::mul(g, ScalarConvert<int64_t, T>::to(kernelSize[d] - M * isOpenSpline[d])); \
self.data[e * self.stride[0] + d * self.stride[1]] = g; \
} \
}
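// A note on the backward macro above: the tensor-product basis for index s is
// a product of one 1-D basis value per dimension, so its partial derivative
// with respect to pseudo coordinate d is the 1-D derivative in dimension d
// (GRAD_CODE) times the forward values of all other dimensions (CODE), summed
// over s with the incoming gradBasis weights. The trailing multiplication by
// kernelSize[d] - M * isOpenSpline[d] is the chain-rule factor from scaling
// pseudo[d] onto knot coordinates.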
template<typename T>
struct BasisBackward {
static inline __device__ T linear(T v, int64_t kMod) {
// 2 * kMod - 1
return ScalarConvert<int64_t, T>::to(2 * kMod - 1);
}
static inline __device__ T quadratic(T v, int64_t kMod) {
if (kMod == 0) {
// v - 1
return THCNumerics<T>::sub(v, ScalarConvert<int, T>::to(1));
}
else if (kMod == 1) {
// -2 * v + 1
T tmp = THCNumerics<T>::mul(ScalarConvert<int, T>::to(-2), v);
return THCNumerics<T>::add(tmp, ScalarConvert<int, T>::to(1));
}
else return v;
}
static inline __device__ T cubic(T v, int64_t kMod) {
if (kMod == 0) {
// (-v * v + 2 * v - 1) / 2
T tmp1 = THCNumerics<T>::mul(THCNumerics<T>::neg(v), v);
T tmp2 = THCNumerics<T>::mul(ScalarConvert<int, T>::to(2), v);
tmp1 = THCNumerics<T>::sub(THCNumerics<T>::add(tmp1, tmp2), ScalarConvert<int, T>::to(1));
return THCNumerics<T>::div(tmp1, ScalarConvert<int, T>::to(2));
}
else if (kMod == 1) {
// (3 * v * v - 4 * v) / 2
T tmp = THCNumerics<T>::mul(ScalarConvert<int, T>::to(3), THCNumerics<T>::mul(v, v));
tmp = THCNumerics<T>::sub(tmp, THCNumerics<T>::mul(ScalarConvert<int, T>::to(4), v));
return THCNumerics<T>::div(tmp, ScalarConvert<int, T>::to(2));
}
else if (kMod == 2) {
// (-3 * v * v + 2 * v + 1) / 2
T tmp = THCNumerics<T>::mul(ScalarConvert<int, T>::to(-3), THCNumerics<T>::mul(v, v));
tmp = THCNumerics<T>::add(tmp, THCNumerics<T>::mul(ScalarConvert<int, T>::to(2), v));
tmp = THCNumerics<T>::add(tmp, ScalarConvert<int, T>::to(1));
return THCNumerics<T>::div(tmp, ScalarConvert<int, T>::to(2));
}
else {
// v * v / 2
return THCNumerics<T>::div(THCNumerics<T>::mul(v, v), ScalarConvert<int, T>::to(2));
}
}
};
#endif // THC_BASIS_BACKWARD_INC
#ifndef THC_BASIS_FORWARD_INC
#define THC_BASIS_FORWARD_INC
#include "common.cuh"
#include "THCNumerics.cuh"
#define THC_TENSOR_BASIS_FORWARD(NAME, state, basis, weightIndex, pseudo, kernelSize, \
isOpenSpline) { \
THCAssertSameGPU(THCTensor_(checkGPU)(state, 5, basis, weightIndex, pseudo, kernelSize, \
isOpenSpline)); \
\
TensorInfo<real> basisInfo = THCTensor_(getTensorInfo)(state, basis); \
TensorInfo<int64_t> weightIndexInfo = THCudaLongTensor_getTensorInfo(state, weightIndex); \
TensorInfo<real> pseudoInfo = THCTensor_(getTensorInfo)(state, pseudo); \
int64_t *kernelSizeData = THCudaLongTensor_data(state, kernelSize); \
uint8_t *isOpenSplineData = THCudaByteTensor_data(state, isOpenSpline); \
\
KERNEL_REAL_RUN(NAME, THCTensor_(nElement)(state, basis), basisInfo, weightIndexInfo, \
pseudoInfo, kernelSizeData, isOpenSplineData); \
}
#define THC_TENSOR_BASIS_FORWARD_KERNEL(M, basis, weightIndex, pseudo, kernelSize, isOpenSpline, \
N, CODE) { \
KERNEL_LOOP(i, N) { \
ptrdiff_t e = i / basis.size[1], s = i % basis.size[1], d; \
int64_t k = s, kMod, wi = 0, wiOffset = 1; \
T b = ScalarConvert<int, T>::to(1), v; \
\
for (d = 0; d < pseudo.size[1]; d++) { \
kMod = k % (M + 1); \
k /= M + 1; \
\
v = pseudo.data[e * pseudo.stride[0] + d * pseudo.stride[1]]; \
v = THCNumerics<T>::mul(v, ScalarConvert<int64_t, T>::to(kernelSize[d] - M * isOpenSpline[d])); \
\
wi += ((ScalarConvert<T, int64_t>::to(v) + kMod) % kernelSize[d]) * wiOffset; \
wiOffset *= kernelSize[d]; \
\
v = THCNumerics<T>::sub(v, ScalarConvert<int64_t, T>::to(ScalarConvert<T, int64_t>::to(v))); \
v = CODE; \
b = THCNumerics<T>::mul(b, v); \
} \
\
basis.data[e * basis.stride[0] + s * basis.stride[1]] = b; \
weightIndex.data[e * weightIndex.stride[0] + s * weightIndex.stride[1]] = wi; \
} \
}
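// The forward macro above walks each basis-product index s as a base-(M+1)
// number, one digit kMod per pseudo dimension, while accumulating the flat
// weight index wi. A minimal host-side sketch of the same arithmetic on plain
// floats (numDims, pseudo, kernelSize, isOpenSpline and basisForward are
// hypothetical stand-ins, not part of this extension):
//
//   int64_t wi = 0, wiOffset = 1, k = s;
//   float b = 1.0f;
//   for (int d = 0; d < numDims; d++) {
//     int64_t kMod = k % (M + 1); k /= M + 1;
//     float v = pseudo[d] * (kernelSize[d] - M * isOpenSpline[d]);
//     wi += (((int64_t) v + kMod) % kernelSize[d]) * wiOffset;
//     wiOffset *= kernelSize[d];
//     b *= basisForward(v - floorf(v), kMod);  // 1-D basis at the fractional position
//   }
//   // b is the basis value and wi the index of the weight it multiplies.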
template<typename T>
struct BasisForward {
static inline __device__ T linear(T v, int64_t kMod) {
// 1 - v - kMod + 2 * v * kMod
T tmp1 = THCNumerics<T>::sub(ScalarConvert<int, T>::to(1), v);
tmp1 = THCNumerics<T>::sub(tmp1, ScalarConvert<int64_t, T>::to(kMod));
T tmp2 = THCNumerics<T>::mul(ScalarConvert<int, T>::to(2), v);
tmp2 = THCNumerics<T>::mul(tmp2, ScalarConvert<int64_t, T>::to(kMod));
return THCNumerics<T>::add(tmp1, tmp2);
}
static inline __device__ T quadratic(T v, int64_t kMod) {
if (kMod == 0) {
// 0.5 * v * v - v + 0.5
T tmp = THCNumerics<T>::mul(THCNumerics<T>::mul(ScalarConvert<float, T>::to(0.5), v), v);
return THCNumerics<T>::add(THCNumerics<T>::sub(tmp, v), ScalarConvert<float, T>::to(0.5));
}
else if (kMod == 1) {
// -v * v + v + 0.5
T tmp = THCNumerics<T>::mul(THCNumerics<T>::neg(v), v);
return THCNumerics<T>::add(THCNumerics<T>::add(tmp, v), ScalarConvert<float, T>::to(0.5));
}
else {
// 0.5 * v * v
return THCNumerics<T>::mul(ScalarConvert<float, T>::to(0.5), THCNumerics<T>::mul(v, v));
}
}
static inline __device__ T cubic(T v, int64_t kMod) {
if (kMod == 0) {
// (1 - v) * (1 - v) * (1 - v) / 6
T tmp = THCNumerics<T>::sub(ScalarConvert<int, T>::to(1), v);
tmp = THCNumerics<T>::mul(THCNumerics<T>::mul(tmp, tmp), tmp);
return THCNumerics<T>::div(tmp, ScalarConvert<int, T>::to(6));
}
else if (kMod == 1) {
// (3 * v * v * v - 6 * v * v + 4) / 6
T tmp1 = THCNumerics<T>::mul(THCNumerics<T>::mul(v, v), v);
tmp1 = THCNumerics<T>::mul(ScalarConvert<int, T>::to(3), tmp1);
T tmp2 = THCNumerics<T>::mul(ScalarConvert<int, T>::to(6), THCNumerics<T>::mul(v, v));
tmp1 = THCNumerics<T>::add(THCNumerics<T>::sub(tmp1, tmp2), ScalarConvert<int, T>::to(4));
return THCNumerics<T>::div(tmp1, ScalarConvert<int, T>::to(6));
}
else if (kMod == 2) {
// (-3 * v * v * v + 3 * v * v + 3 * v + 1) / 6
T tmp1 = THCNumerics<T>::mul(THCNumerics<T>::mul(v, v), v);
tmp1 = THCNumerics<T>::mul(ScalarConvert<int, T>::to(-3), tmp1);
T tmp2 = THCNumerics<T>::mul(ScalarConvert<int, T>::to(3), THCNumerics<T>::mul(v, v));
T tmp3 = THCNumerics<T>::mul(ScalarConvert<int, T>::to(3), v);
tmp1 = THCNumerics<T>::add(THCNumerics<T>::add(tmp1, tmp2), tmp3);
tmp1 = THCNumerics<T>::add(tmp1, ScalarConvert<int, T>::to(1));
return THCNumerics<T>::div(tmp1, ScalarConvert<int, T>::to(6));
}
else {
// v * v * v / 6
T tmp = THCNumerics<T>::mul(THCNumerics<T>::mul(v, v), v);
return THCNumerics<T>::div(tmp, ScalarConvert<int, T>::to(6));
}
}
};
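// For each degree, the M + 1 cases above form a partition of unity on [0, 1):
// linear gives (1 - v) + v = 1, quadratic gives
// (0.5*v*v - v + 0.5) + (-v*v + v + 0.5) + 0.5*v*v = 1, and the cubic terms
// likewise sum to 6/6. A sketch of a spot check (host-side, ignoring the
// __device__ qualifier for the sake of illustration):
//
//   float sum = 0.0f;
//   for (int64_t kMod = 0; kMod <= 3; kMod++)
//     sum += BasisForward<float>::cubic(0.3f, kMod);
//   assert(fabsf(sum - 1.0f) < 1e-6f);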
#endif // THC_BASIS_FORWARD_INC
#ifndef THC_NUMERICS_INC
#define THC_NUMERICS_INC
#include <THC/THCHalf.h>
#ifdef CUDA_HALF_TENSOR
#ifdef __CUDA_ARCH__
#define h2f(A) __half2float(A)
#define f2h(A) __float2half(A)
#else // __CUDA_ARCH__
#define h2f(A) THC_half2float(A)
#define f2h(A) THC_float2half(A)
#endif // __CUDA_ARCH__
#endif // CUDA_HALF_TENSOR
template<typename T>
struct THCNumerics {
static inline __host__ __device__ T add(T a, T b) { return a + b; }
static inline __host__ __device__ T sub(T a, T b) { return a - b; }
static inline __host__ __device__ T mul(T a, T b) { return a * b; }
static inline __host__ __device__ T div(T a, T b) { return a / b; }
static inline __host__ __device__ T neg(T a) { return -a; }
};
#ifdef CUDA_HALF_TENSOR
template<>
struct THCNumerics<half> {
static inline __host__ __device__ half add(half a, half b) { return f2h(h2f(a) + h2f(b)); }
static inline __host__ __device__ half sub(half a, half b) { return f2h(h2f(a) - h2f(b)); }
static inline __host__ __device__ half mul(half a, half b) { return f2h(h2f(a) * h2f(b)); }
static inline __host__ __device__ half div(half a, half b) { return f2h(h2f(a) / h2f(b)); }
static inline __host__ __device__ half neg(half a) { return f2h(-h2f(a)); }
};
#endif // CUDA_HALF_TENSOR
template <typename In, typename Out>
struct ScalarConvert {
static __host__ __device__ Out to(const In v) { return (Out) v; }
};
#ifdef CUDA_HALF_TENSOR
template <typename Out>
struct ScalarConvert<half, Out> {
static __host__ __device__ Out to(const half v) { return (Out) h2f(v); }
};
template <typename In>
struct ScalarConvert<In, half> {
static __host__ __device__ half to(const In v) { return f2h((float) v); }
};
#endif // CUDA_HALF_TENSOR
#endif // THC_NUMERICS_INC
#include "THCWeighting.h"
#include "common.cuh"
#include "THCNumerics.cuh"
#include "THCAtomics.cuh"
#define TH_TENSOR_WEIGHTING(NAME, N, TENSOR1, TENSOR2, TENSOR3, TENSOR4, weightIndex) { \
THCAssertSameGPU(THCTensor_(checkGPU)(state, 5, TENSOR1, TENSOR2, TENSOR3, TENSOR4, weightIndex)); \
\
TensorInfo<real> tensor1Info = THCTensor_(getTensorInfo)(state, TENSOR1); \
TensorInfo<real> tensor2Info = THCTensor_(getTensorInfo)(state, TENSOR2); \
TensorInfo<real> tensor3Info = THCTensor_(getTensorInfo)(state, TENSOR3); \
TensorInfo<real> tensor4Info = THCTensor_(getTensorInfo)(state, TENSOR4); \
TensorInfo<int64_t> weightIndexInfo = THCudaLongTensor_getTensorInfo(state, weightIndex); \
\
KERNEL_REAL_RUN(NAME, N, tensor1Info, tensor2Info, tensor3Info, tensor4Info, weightIndexInfo); \
}
template<typename T>
__global__ void weightingForwardKernel(TensorInfo<T> self, TensorInfo<T> src, TensorInfo<T> weight,
TensorInfo<T> basis, TensorInfo<int64_t> weightIndex,
int n) {
KERNEL_LOOP(i, n) {
ptrdiff_t e = i / self.size[1], mOut = i % self.size[1], s, mIn;
T v = ScalarConvert<int, T>::to(0), b, tmp;
int64_t wi;
for (s = 0; s < basis.size[1]; s++) {
b = basis.data[e * basis.stride[0] + s * basis.stride[1]];
wi = weightIndex.data[e * weightIndex.stride[0] + s * weightIndex.stride[1]];
for (mIn = 0; mIn < src.size[1]; mIn++) {
tmp = weight.data[wi * weight.stride[0] + mIn * weight.stride[1] + mOut * weight.stride[2]];
tmp = THCNumerics<T>::mul(tmp, src.data[e * src.stride[0] + mIn * src.stride[1]]);
tmp = THCNumerics<T>::mul(tmp, b);
v = THCNumerics<T>::add(v, tmp);
}
}
self.data[e * self.stride[0] + mOut * self.stride[1]] = v;
}
}
template<typename T>
__global__ void weightingBackwardSrcKernel(TensorInfo<T> self, TensorInfo<T> gradOutput,
TensorInfo<T> weight, TensorInfo<T> basis,
TensorInfo<int64_t> weightIndex, int n) {
KERNEL_LOOP(i, n) {
ptrdiff_t e = i / self.size[1], mIn = i % self.size[1], s, mOut;
T v = ScalarConvert<int, T>::to(0), b, tmp;
int64_t wi;
for (s = 0; s < basis.size[1]; s++) {
wi = weightIndex.data[e * weightIndex.stride[0] + s * weightIndex.stride[1]];
b = basis.data[e * basis.stride[0] + s * basis.stride[1]];
for (mOut = 0; mOut < gradOutput.size[1]; mOut++) {
tmp = weight.data[wi * weight.stride[0] + mOut * weight.stride[1] + mIn * weight.stride[2]];
tmp = THCNumerics<T>::mul(tmp, gradOutput.data[e * gradOutput.stride[0] + mOut * gradOutput.stride[1]]);
tmp = THCNumerics<T>::mul(tmp, b);
v = THCNumerics<T>::add(v, tmp);
}
}
self.data[e * self.stride[0] + mIn * self.stride[1]] = v;
}
}
template<typename T>
__global__ void weightingBackwardWeightKernel(TensorInfo<T> self, TensorInfo<T> gradOutput,
TensorInfo<T> src, TensorInfo<T> basis,
TensorInfo<int64_t> weightIndex, int n) {
KERNEL_LOOP(i, n) {
ptrdiff_t e = i / gradOutput.size[1], mOut = i % gradOutput.size[1], s, mIn;
T b, v;
int64_t wi;
T g = gradOutput.data[e * gradOutput.stride[0] + mOut * gradOutput.stride[1]];
for (s = 0; s < weightIndex.size[1]; s++) {
b = basis.data[e * basis.stride[0] + s * basis.stride[1]];
wi = weightIndex.data[e * weightIndex.stride[0] + s * weightIndex.stride[1]];
for (mIn = 0; mIn < src.size[1]; mIn++) {
v = src.data[e * src.stride[0] + mIn * src.stride[1]];
v = THCNumerics<T>::mul(THCNumerics<T>::mul(v, b), g);
atomicAdd(&self.data[wi * self.stride[0] + mIn * self.stride[1] + mOut * self.stride[2]], v);
}
}
}
}
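// Unlike the src gradient above, the weight gradient must accumulate with
// atomicAdd: different nodes e can map to the same flat weight index wi, so
// plain stores into self would race across threads.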
template<typename T>
__global__ void weightingBackwardBasisKernel(TensorInfo<T> self, TensorInfo<T> gradOutput,
TensorInfo<T> src, TensorInfo<T> weight,
TensorInfo<int64_t> weightIndex, int n) {
KERNEL_LOOP(i, n) {
ptrdiff_t e = i / gradOutput.size[1], mOut = i % gradOutput.size[1], s, mIn;
T v, tmp;
int64_t wi;
T g = gradOutput.data[e * gradOutput.stride[0] + mOut * gradOutput.stride[1]];
for (s = 0; s < weightIndex.size[1]; s++) {
v = ScalarConvert<int, T>::to(0);
wi = weightIndex.data[e * weightIndex.stride[0] + s * weightIndex.stride[1]];
for (mIn = 0; mIn < src.size[1]; mIn++) {
tmp = weight.data[wi * weight.stride[0] + mIn * weight.stride[1] + mOut * weight.stride[2]];
tmp = THCNumerics<T>::mul(tmp, src.data[e * src.stride[0] + mIn * src.stride[1]]);
tmp = THCNumerics<T>::mul(tmp, g);
v = THCNumerics<T>::add(v, tmp);
}
atomicAdd(&self.data[e * self.stride[0] + s * self.stride[1]], v);
}
}
}
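// atomicAdd is needed here as well: threads are indexed by (e, mOut) pairs,
// so all mOut threads of a node e accumulate into the same self[e][s] entry.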
#include "generic/THCWeighting.cu"
#include "THC/THCGenerateFloatTypes.h"
#ifndef THC_WEIGHTING_INC
#define THC_WEIGHTING_INC
#include <THC/THC.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
#include "generic/THCWeighting.h"
#include "THC/THCGenerateFloatTypes.h"
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // THC_WEIGHTING_INC
#ifndef THC_COMMON_INC
#define THC_COMMON_INC
#define KERNEL_LOOP(I, N) \
for (ptrdiff_t I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)
const int MAX_DIMS = 25;
const int NUM_THREADS = 1024;
inline int GET_BLOCKS(int N) {
return (N + NUM_THREADS - 1) / NUM_THREADS;
}
#define KERNEL_REAL_RUN(NAME, N, ...) \
int grid = GET_BLOCKS(N); \
cudaStream_t stream = THCState_getCurrentStream(state); \
NAME<real><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); \
THCudaCheck(cudaGetLastError())
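// KERNEL_LOOP above is the standard grid-stride loop: each thread starts at
// its global index and advances by the total number of launched threads, so a
// fixed grid of GET_BLOCKS(N) x NUM_THREADS threads covers any N. A minimal
// sketch of a kernel built on these helpers (a hypothetical fill example, not
// part of this extension):
//
//   template<typename T>
//   __global__ void fillKernel(T *out, T value, ptrdiff_t n) {
//     KERNEL_LOOP(i, n) { out[i] = value; }
//   }
//   // launch: fillKernel<float><<<GET_BLOCKS(n), NUM_THREADS>>>(out, 1.0f, n);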
template<typename T>
struct TensorInfo {
T *data;
int dims;
int size[MAX_DIMS];
int stride[MAX_DIMS];
};
#include "generic/common.cuh"
#include "THC/THCGenerateAllTypes.h"
#endif // THC_COMMON_INC
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/THCBasis.cu"
#else
void THCTensor_(linearBasisForward)(THCState *state, THCTensor *basis,
THCudaLongTensor *weightIndex, THCTensor *pseudo,
THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline) {
THC_TENSOR_BASIS_FORWARD(linearBasisForwardKernel, state, basis, weightIndex, pseudo, kernelSize,
isOpenSpline)
}
void THCTensor_(quadraticBasisForward)(THCState *state, THCTensor *basis,
THCudaLongTensor *weightIndex, THCTensor *pseudo,
THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline) {
THC_TENSOR_BASIS_FORWARD(quadraticBasisForwardKernel, state, basis, weightIndex, pseudo,
kernelSize, isOpenSpline)
}
void THCTensor_(cubicBasisForward)(THCState *state, THCTensor *basis,
THCudaLongTensor *weightIndex, THCTensor *pseudo,
THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline) {
THC_TENSOR_BASIS_FORWARD(cubicBasisForwardKernel, state, basis, weightIndex, pseudo, kernelSize,
isOpenSpline)
}
void THCTensor_(linearBasisBackward)(THCState *state, THCTensor *self, THCTensor *gradBasis,
THCTensor *pseudo, THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline) {
THC_TENSOR_BASIS_BACKWARD(linearBasisBackwardKernel, state, self, gradBasis, pseudo, kernelSize,
isOpenSpline)
}
void THCTensor_(quadraticBasisBackward)(THCState *state, THCTensor *self, THCTensor *gradBasis,
THCTensor *pseudo, THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline) {
THC_TENSOR_BASIS_BACKWARD(quadraticBasisBackwardKernel, state, self, gradBasis, pseudo,
kernelSize, isOpenSpline)
}
void THCTensor_(cubicBasisBackward)(THCState *state, THCTensor *self, THCTensor *gradBasis,
THCTensor *pseudo, THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline) {
THC_TENSOR_BASIS_BACKWARD(cubicBasisBackwardKernel, state, self, gradBasis, pseudo, kernelSize,
isOpenSpline)
}
#endif // THC_GENERIC_FILE
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/THCBasis.h"
#else
void THCTensor_(linearBasisForward)(THCState *state, THCTensor *basis,
THCudaLongTensor *weightIndex, THCTensor *pseudo,
THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCTensor_(quadraticBasisForward)(THCState *state, THCTensor *basis,
THCudaLongTensor *weightIndex, THCTensor *pseudo,
THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline);
void THCTensor_(cubicBasisForward)(THCState *state, THCTensor *basis,
THCudaLongTensor *weightIndex, THCTensor *pseudo,
THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCTensor_(linearBasisBackward)(THCState *state, THCTensor *self, THCTensor *gradBasis,
THCTensor *pseudo, THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline);
void THCTensor_(quadraticBasisBackward)(THCState *state, THCTensor *self, THCTensor *gradBasis,
THCTensor *pseudo, THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline);
void THCTensor_(cubicBasisBackward)(THCState *state, THCTensor *self, THCTensor *gradBasis,
THCTensor *pseudo, THCudaLongTensor *kernelSize,
THCudaByteTensor *isOpenSpline);
#endif // THC_GENERIC_FILE
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/THCWeighting.cu"
#else
void THCTensor_(weightingForward)(THCState *state, THCTensor *self, THCTensor *src,
THCTensor *weight, THCTensor *basis,
THCudaLongTensor *weightIndex) {
TH_TENSOR_WEIGHTING(weightingForwardKernel, THCTensor_(nElement)(state, self), self, src, weight,
basis, weightIndex)
}
void THCTensor_(weightingBackwardSrc)(THCState *state, THCTensor *self, THCTensor *gradOutput,
THCTensor *weight, THCTensor *basis,
THCudaLongTensor *weightIndex) {
THCTensor *tWeight = THCTensor_(newTranspose)(state, weight, 1, 2);
weight = THCTensor_(newContiguous)(state, tWeight);
TH_TENSOR_WEIGHTING(weightingBackwardSrcKernel, THCTensor_(nElement)(state, self), self,
gradOutput, weight, basis, weightIndex)
THCTensor_(free)(state, tWeight);
THCTensor_(free)(state, weight);
}
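// A note on the transpose above: weightingBackwardSrcKernel contracts over
// mOut rather than mIn and indexes weight as (wi, mOut, mIn); newTranspose
// plus newContiguous materializes that layout before the launch, and both
// temporaries are freed afterwards.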
void THCTensor_(weightingBackwardWeight)(THCState *state, THCTensor *self, THCTensor *gradOutput,
THCTensor *src, THCTensor *basis,
THCudaLongTensor *weightIndex) {
THCTensor_(fill)(state, self, ScalarConvert<int, real>::to(0));
TH_TENSOR_WEIGHTING(weightingBackwardWeightKernel, THCTensor_(nElement)(state, gradOutput), self,
gradOutput, src, basis, weightIndex)
}
void THCTensor_(weightingBackwardBasis)(THCState *state, THCTensor *self, THCTensor *gradOutput,
THCTensor *src, THCTensor *weight,
THCudaLongTensor *weightIndex) {
THCTensor_(fill)(state, self, ScalarConvert<int, real>::to(0));
TH_TENSOR_WEIGHTING(weightingBackwardBasisKernel, THCTensor_(nElement)(state, gradOutput), self,
gradOutput, src, weight, weightIndex)
}
#endif // THC_GENERIC_FILE
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/THCWeighting.h"
#else
void THCTensor_(weightingForward)(THCState *state, THCTensor *self, THCTensor *src,
THCTensor *weight, THCTensor *basis,
THCudaLongTensor *weightIndex);
void THCTensor_(weightingBackwardSrc)(THCState *state, THCTensor *self, THCTensor *gradOutput,
THCTensor *weight, THCTensor *basis,
THCudaLongTensor *weightIndex);
void THCTensor_(weightingBackwardWeight)(THCState *state, THCTensor *self, THCTensor *gradOutput,
THCTensor *src, THCTensor *basis,
THCudaLongTensor *weightIndex);
void THCTensor_(weightingBackwardBasis)(THCState *state, THCTensor *self, THCTensor *gradOutput,
THCTensor *src, THCTensor *weight,
THCudaLongTensor *weightIndex);
#endif // THC_GENERIC_FILE
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/common.cuh"
#else
TensorInfo<real> THCTensor_(getTensorInfo)(THCState *state, THCTensor *tensor) {
TensorInfo<real> tensorInfo = TensorInfo<real>();
tensorInfo.data = THCTensor_(data)(state, tensor);
tensorInfo.dims = THCTensor_(nDimension)(state, tensor);
for (ptrdiff_t d = 0; d < tensorInfo.dims; d++) {
tensorInfo.size[d] = THCTensor_(size)(state, tensor, d);
tensorInfo.stride[d] = THCTensor_(stride)(state, tensor, d);
}
return tensorInfo;
}
#endif // THC_GENERIC_FILE
#include <THC/THC.h>
#include "THC.h"
#define THCCTensor_(NAME) TH_CONCAT_4(THCC,Real,Tensor_,NAME)
extern THCState *state;
#include "generic/THCCBasis.c"
#include "THCGenerateFloatTypes.h"
void THCCFloatTensor_linearBasisForward( THCudaTensor *basis, THCudaLongTensor *weightIndex, THCudaTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCDoubleTensor_linearBasisForward(THCudaDoubleTensor *basis, THCudaLongTensor *weightIndex, THCudaDoubleTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCFloatTensor_quadraticBasisForward( THCudaTensor *basis, THCudaLongTensor *weightIndex, THCudaTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCDoubleTensor_quadraticBasisForward(THCudaDoubleTensor *basis, THCudaLongTensor *weightIndex, THCudaDoubleTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCFloatTensor_cubicBasisForward( THCudaTensor *basis, THCudaLongTensor *weightIndex, THCudaTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCDoubleTensor_cubicBasisForward(THCudaDoubleTensor *basis, THCudaLongTensor *weightIndex, THCudaDoubleTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCFloatTensor_linearBasisBackward( THCudaTensor *self, THCudaTensor *gradBasis, THCudaTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCDoubleTensor_linearBasisBackward(THCudaDoubleTensor *self, THCudaDoubleTensor *gradBasis, THCudaDoubleTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCFloatTensor_quadraticBasisBackward( THCudaTensor *self, THCudaTensor *gradBasis, THCudaTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCDoubleTensor_quadraticBasisBackward(THCudaDoubleTensor *self, THCudaDoubleTensor *gradBasis, THCudaDoubleTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCFloatTensor_cubicBasisBackward( THCudaTensor *self, THCudaTensor *gradBasis, THCudaTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);
void THCCDoubleTensor_cubicBasisBackward(THCudaDoubleTensor *self, THCudaDoubleTensor *gradBasis, THCudaDoubleTensor *pseudo, THCudaLongTensor *kernelSize, THCudaByteTensor *isOpenSpline);