"platforms/reference/tests/TestReferenceRGForce.cpp" did not exist on "85da5e0f9018719e95350c76b483a2160c84d9d3"
Commit 4405b165 authored by Peter Eastman's avatar Peter Eastman
Browse files

Tony Tye's optimization to reduce local memory use

parent 69e75377
...@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
...@@ -480,8 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -480,8 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1"; defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric) if (isSymmetric)
defines["USE_SYMMETRIC"] = "1"; defines["USE_SYMMETRIC"] = "1";
if (context.getSIMDWidth() == 32) defines["NONBONDED_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize);
defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize/OpenCLContext::TileSize);
defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff); defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
...@@ -509,7 +508,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -509,7 +508,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : forceThreadBlockSize*localDataSize), NULL); kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : forceThreadBlockSize*localDataSize), NULL);
kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex); kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles); kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
if (useCutoff) { if (useCutoff) {
......
...@@ -12,7 +12,7 @@ typedef struct { ...@@ -12,7 +12,7 @@ typedef struct {
*/ */
__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
......
...@@ -11,9 +11,9 @@ typedef struct { ...@@ -11,9 +11,9 @@ typedef struct {
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(NONBONDED_WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
...@@ -31,6 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -31,6 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[3*(NONBONDED_WORK_GROUP_SIZE/2)];
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
__local int exclusionIndex[1]; __local int exclusionIndex[1];
...@@ -124,8 +125,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -124,8 +125,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) int bufferIndex = 3*tgx;
tempBuffer[get_local_id(0)] = force; if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x;
tempBuffer[bufferIndex+1] = force.y;
tempBuffer[bufferIndex+2] = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -133,7 +138,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -133,7 +138,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
forceBuffers[offset] = sum;
} }
} }
else { else {
...@@ -210,8 +217,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -210,8 +217,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) int bufferIndex = 3*tgx;
tempBuffer[get_local_id(0)] = force; if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x;
tempBuffer[bufferIndex+1] = force.y;
tempBuffer[bufferIndex+2] = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -221,9 +232,13 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -221,9 +232,13 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; // Cheaper to load/store float4 than float3. Do both loads before both stores to minimize store-load waits.
float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f); float4 sum1 = forceBuffers[offset1];
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+sum.xyz; float4 sum2 = forceBuffers[offset2];
sum1 += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
sum2 += (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f);
forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2;
} }
} }
lasty = y; lasty = y;
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif #endif
#define TILE_SIZE 32 #define TILE_SIZE 32
#define WARPS_PER_GROUP (NONBONDED_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { typedef struct {
float x, y, z; float x, y, z;
...@@ -21,7 +22,7 @@ __kernel void computeNonbonded( ...@@ -21,7 +22,7 @@ __kernel void computeNonbonded(
__global float4* restrict forceBuffers, __global float4* restrict forceBuffers,
#endif #endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float* restrict tempBuffer, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
...@@ -40,6 +41,7 @@ __kernel void computeNonbonded( ...@@ -40,6 +41,7 @@ __kernel void computeNonbonded(
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; float energy = 0.0f;
__local float tempBuffer[3*NONBONDED_WORK_GROUP_SIZE];
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange; __local int2* reservedBlocks = (__local int2*) exclusionRange;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment