"vscode:/vscode.git/clone" did not exist on "17d9c844d9a31ff0bfde69d5d40b08a0d0b9c337"
Commit 731545bd authored by yanyan
Browse files

Disable __hadd2 for architectures below sm_53 (half2 arithmetic intrinsics require compute capability 5.3 or higher)

parent a75b2bad
...@@ -292,12 +292,19 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer, ...@@ -292,12 +292,19 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>( reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]; buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
if (std::is_same<T, at::Half>::value) { if (std::is_same<T, at::Half>::value) {
#if __CUDA_ARCH__ >= 530
#pragma unroll #pragma unroll
for (int i = 0; i < vecloadHalf2Factor; i++) { for (int i = 0; i < vecloadHalf2Factor; i++) {
reinterpret_cast<__half2 *>(buf)[i] = reinterpret_cast<__half2 *>(buf)[i] =
__hadd2(reinterpret_cast<__half2 *>(buf)[i], __hadd2(reinterpret_cast<__half2 *>(buf)[i],
reinterpret_cast<__half2 *>(buf2)[i]); reinterpret_cast<__half2 *>(buf2)[i]);
} }
#else
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i];
}
#endif
} else { } else {
#pragma unroll #pragma unroll
for (int i = 0; i < vecloadFactor; i++) { for (int i = 0; i < vecloadFactor; i++) {
...@@ -328,6 +335,7 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer, ...@@ -328,6 +335,7 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
} }
} }
#if __CUDA_ARCH__ >= 530
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer, __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size, const Index *indices, int size,
...@@ -349,6 +357,7 @@ __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer, ...@@ -349,6 +357,7 @@ __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
} }
} }
} }
#endif
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer, __global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment