Unverified Commit abadc821 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Fixed errors in OpenCL on CPU (#4358)

parent e854f8f3
...@@ -83,7 +83,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -83,7 +83,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
// us from sorting atom blocks by size, which leads to a slightly less efficient neighbor // us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
// list. We guess based on system size which will be faster. // list. We guess based on system size which will be faster.
useLargeBlocks = (context.getNumAtoms() > 100000); useLargeBlocks = (!deviceIsCpu && context.getNumAtoms() > 100000);
std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>(); std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
isAMD = !deviceIsCpu && ((vendor.size() >= 3 && vendor.substr(0, 3) == "AMD") || (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.")); isAMD = !deviceIsCpu && ((vendor.size() >= 3 && vendor.substr(0, 3) == "AMD") || (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc."));
......
...@@ -66,6 +66,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri ...@@ -66,6 +66,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
rebuildNeighborList[0] = 0; rebuildNeighborList[0] = 0;
} }
__kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) { __kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) {
// Find the total range of sizes recorded by all blocks. // Find the total range of sizes recorded by all blocks.
......
...@@ -7,9 +7,10 @@ ...@@ -7,9 +7,10 @@
*/ */
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
__global real2* restrict sortedBlocks) { __global real2* restrict blockSizeRange) {
int index = get_global_id(0); int index = get_global_id(0);
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
real minSize = 1e38, maxSize = 0;
while (base < numAtoms) { while (base < numAtoms) {
real4 pos = posq[base]; real4 pos = posq[base];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -28,20 +29,79 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri ...@@ -28,20 +29,79 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
maxPos = max(maxPos, pos); maxPos = max(maxPos, pos);
} }
real4 blockSize = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
real4 center = 0.5f*(maxPos+minPos);
center.w = 0;
for (int i = base; i < last; i++) {
pos = posq[i];
real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
}
center.w = sqrt(center.w);
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos); blockCenter[index] = center;
sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index); real totalSize = blockSize.x+blockSize.y+blockSize.z;
minSize = min(minSize, totalSize);
maxSize = max(maxSize, totalSize);
index += get_global_size(0); index += get_global_size(0);
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
// Record the range of sizes seen by threads in this block.
__local real minBuffer[64], maxBuffer[64];
minBuffer[get_local_id(0)] = minSize;
maxBuffer[get_local_id(0)] = maxSize;
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < 64; step *= 2) {
if (get_local_id(0)+step < 64 && get_local_id(0)%(2*step) == 0) {
minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_local_id(0) == 0)
blockSizeRange[get_group_id(0)] = make_real2(minBuffer[0], maxBuffer[0]);
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
rebuildNeighborList[0] = 0; rebuildNeighborList[0] = 0;
} }
__kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) {
// Find the total range of sizes recorded by all blocks.
__local real2 sizeRange;
if (get_local_id(0) == 0) {
sizeRange = blockSizeRange[0];
for (int i = 1; i < numSizes; i++) {
real2 size = blockSizeRange[i];
sizeRange.x = min(sizeRange.x, size.x);
sizeRange.y = max(sizeRange.y, size.y);
}
sizeRange.x = LOG(sizeRange.x);
sizeRange.y = LOG(sizeRange.y);
}
barrier(CLK_LOCAL_MEM_FENCE);
// Sort keys store the bin in the high order part and the block in the low
// order part.
int numSizeBins = 20;
real scale = numSizeBins/(sizeRange.y-sizeRange.x);
for (unsigned int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
real4 box = blockBoundingBox[i];
real size = LOG(box.x+box.y+box.z);
int bin = (size-sizeRange.x)*scale;
bin = max(0, min(bin, numSizeBins-1));
sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
}
}
/** /**
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel. * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/ */
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter, __kernel void sortBoxData(__global const unsigned int* restrict sortedBlocks, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter, __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
__global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions, __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
...@@ -51,7 +111,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -51,7 +111,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
#endif #endif
) { ) {
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
int index = (int) sortedBlock[i].y; unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
sortedBlockCenter[i] = blockCenter[index]; sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index]; sortedBlockBoundingBox[i] = blockBoundingBox[index];
} }
...@@ -166,7 +226,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num ...@@ -166,7 +226,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num
*/ */
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global unsigned int* restrict sortedBlocks,
__global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList __global const int* restrict rebuildNeighborList
...@@ -187,8 +247,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -187,8 +247,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) { for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
valuesInBuffer = 0; valuesInBuffer = 0;
numAtoms = 0; numAtoms = 0;
real2 sortedKey = sortedBlocks[i]; int x = sortedBlocks[i] & BLOCK_INDEX_MASK;
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[i]; real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i]; real4 blockSizeX = sortedBlockBoundingBox[i];
...@@ -204,7 +263,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -204,7 +263,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for (int j = i+1; j < NUM_BLOCKS; j++) { for (int j = i+1; j < NUM_BLOCKS; j++) {
real2 sortedKey2 = sortedBlocks[j]; real2 sortedKey2 = sortedBlocks[j];
int y = (int) sortedKey2.y; int y = sortedBlocks[j] & BLOCK_INDEX_MASK;
bool hasExclusions = false; bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++) for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y); hasExclusions |= (exclusionsForX[k] == y);
......
...@@ -10,11 +10,7 @@ typedef struct { ...@@ -10,11 +10,7 @@ typedef struct {
*/ */
__kernel void computeNonbonded( __kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices __global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -102,14 +98,9 @@ __kernel void computeNonbonded( ...@@ -102,14 +98,9 @@ __kernel void computeNonbonded(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
} }
} }
else { else {
...@@ -178,32 +169,18 @@ __kernel void computeNonbonded( ...@@ -178,32 +169,18 @@ __kernel void computeNonbonded(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
} }
// Write results. // Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[tgx].fx)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
#endif
} }
} }
} }
...@@ -337,14 +314,9 @@ __kernel void computeNonbonded( ...@@ -337,14 +314,9 @@ __kernel void computeNonbonded(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
} }
} }
else else
...@@ -404,14 +376,9 @@ __kernel void computeNonbonded( ...@@ -404,14 +376,9 @@ __kernel void computeNonbonded(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
} }
} }
...@@ -424,18 +391,9 @@ __kernel void computeNonbonded( ...@@ -424,18 +391,9 @@ __kernel void computeNonbonded(
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[tgx].fx)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
#endif
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment