"platforms/cuda/vscode:/vscode.git/clone" did not exist on "484472ca687481bbdae47b0604f0d76de827cc44"
Commit 9f92ae40 authored by peastman's avatar peastman
Browse files

Fixed incorrect synchronization on Volta

parent 31d0993f
...@@ -8,9 +8,21 @@ typedef struct { ...@@ -8,9 +8,21 @@ typedef struct {
#endif #endif
} AtomData; } AtomData;
// Find the maximum of a value across all threads in a warp, and return that to
// every thread. This is only needed on Volta and later. On earlier architectures,
// we can just return the value that was passed in.
__device__ int reduceMax(int val) {
#if __CUDA_ARCH__ >= 700
for (int mask = 16; mask > 0; mask /= 2)
val = max(val, __shfl_xor(val, mask));
#endif
return val;
}
extern "C" __global__ void computeInteractionGroups( extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
int* __restrict__ numGroupTiles, bool useNeighborList, const int* __restrict__ numGroupTiles, bool useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
...@@ -43,7 +55,10 @@ extern "C" __global__ void computeInteractionGroups( ...@@ -43,7 +55,10 @@ extern "C" __global__ void computeInteractionGroups(
localData[threadIdx.x].fy = 0.0f; localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f; localData[threadIdx.x].fz = 0.0f;
int tj = tgx; int tj = tgx;
for (int j = rangeStart; j < rangeEnd; j++) { int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart);
SYNC_WARPS;
for (int j = rangeStart; j < rangeStop; j++) {
if (j < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0); bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj; int localIndex = tbx+tj;
posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q); posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
...@@ -75,6 +90,8 @@ extern "C" __global__ void computeInteractionGroups( ...@@ -75,6 +90,8 @@ extern "C" __global__ void computeInteractionGroups(
#endif #endif
tj = (tj == rangeEnd-1 ? rangeStart : tj+1); tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
} }
SYNC_WARPS;
}
if (exclusions != 0) { if (exclusions != 0) {
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
...@@ -83,6 +100,7 @@ extern "C" __global__ void computeInteractionGroups( ...@@ -83,6 +100,7 @@ extern "C" __global__ void computeInteractionGroups(
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000))); atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000))); atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000))); atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
SYNC_WARPS;
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
SAVE_DERIVATIVES SAVE_DERIVATIVES
...@@ -134,9 +152,10 @@ extern "C" __global__ void buildNeighborList(int* __restrict__ rebuildNeighborL ...@@ -134,9 +152,10 @@ extern "C" __global__ void buildNeighborList(int* __restrict__ rebuildNeighborL
if (tgx == 0) if (tgx == 0)
anyInteraction[local_warp] = false; anyInteraction[local_warp] = false;
int tj = tgx; int tj = tgx;
int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart);
SYNC_WARPS; SYNC_WARPS;
for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) { for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
if (tj < rangeEnd) { if (j < rangeEnd && tj < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0); bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj; int localIndex = tbx+tj;
real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z); real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment