Commit a5a7b902 authored by Peter Eastman's avatar Peter Eastman
Browse files

AMD support for 64 bit fixed point forces (currently disabled)

parent e62a6de6
...@@ -150,6 +150,9 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -150,6 +150,9 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
} }
} }
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") { else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// Disable 64 bit atomics. A future version of the driver will support them, but until we can test that,
// it's safest not to use them.
supports64BitGlobalAtomics = false;
if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) { if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
/// \todo Is 6 a good value for the OpenCL CPU device? /// \todo Is 6 a good value for the OpenCL CPU device?
// numThreadBlocksPerComputeUnit = ?; // numThreadBlocksPerComputeUnit = ?;
......
...@@ -2397,6 +2397,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2397,6 +2397,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
...@@ -2445,6 +2447,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2445,6 +2447,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
......
...@@ -52,7 +52,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -52,7 +52,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
if (context.getSupports64BitGlobalAtomics()) { if (context.getSupports64BitGlobalAtomics()) {
numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256; forceThreadBlockSize = 256;
numForceBuffers = 2; // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
} }
else { else {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
...@@ -63,12 +64,18 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -63,12 +64,18 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
else { else {
numForceThreadBlocks = context.getNumThreadBlocks(); numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = OpenCLContext::ThreadBlockSize; forceThreadBlockSize = OpenCLContext::ThreadBlockSize;
numForceBuffers = numForceThreadBlocks; if (context.getSupports64BitGlobalAtomics()) {
if (numForceBuffers >= context.getNumAtomBlocks()) { // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp. numForceBuffers = 1;
}
else {
numForceBuffers = numForceThreadBlocks;
if (numForceBuffers >= context.getNumAtomBlocks()) {
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
forceBufferPerAtomBlock = true; forceBufferPerAtomBlock = true;
numForceBuffers = context.getNumAtomBlocks(); numForceBuffers = context.getNumAtomBlocks();
}
} }
} }
} }
......
#define TILE_SIZE 32 #define TILE_SIZE 32
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset1+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0xFFFFFFFF));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset2+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0xFFFFFFFF));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE]; #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE];
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE]; #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE];
#endif
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __local float4* restrict local_force, void computeN2Energy(
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices, #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempForceBuffer, __global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempForceBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
...@@ -127,12 +139,19 @@ void computeN2Energy(__global float4* restrict forceBuffers, __global float* res ...@@ -127,12 +139,19 @@ void computeN2Energy(__global float4* restrict forceBuffers, __global float* res
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz + tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
#endif #endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
} }
} }
...@@ -208,15 +227,26 @@ void computeN2Energy(__global float4* restrict forceBuffers, __global float* res ...@@ -208,15 +227,26 @@ void computeN2Energy(__global float4* restrict forceBuffers, __global float* res
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x+tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y+tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z+tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2], (long) ((local_force[get_local_id(0)].x+local_force[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].y+local_force[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].z+local_force[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
#endif
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32 #define TILE_SIZE 32
/** /**
...@@ -6,7 +10,13 @@ ...@@ -6,7 +10,13 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global float* restrict global_value, __local float* restrict local_value, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value,
#else
__global float* restrict global_value,
#endif
__local float* restrict local_value,
__local float* restrict tempBuffer, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
...@@ -122,12 +132,17 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri ...@@ -122,12 +132,17 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
tempBuffer[get_local_id(0)] = value; tempBuffer[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_value[offset], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_value[offset] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
#endif #endif
global_value[offset] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
} }
} }
else { else {
...@@ -198,15 +213,22 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri ...@@ -198,15 +213,22 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
tempBuffer[get_local_id(0)] = value; tempBuffer[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&global_value[offset1], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
atom_add(&global_value[offset2], (long) ((local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_value[offset1] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
global_value[offset2] += local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE];
#endif #endif
global_value[offset1] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
global_value[offset2] += local_value[get_local_id(0)]+local_value[get_local_id(0)+TILE_SIZE];
} }
} }
lasty = y; lasty = y;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct { typedef struct {
...@@ -10,7 +14,13 @@ typedef struct { ...@@ -10,7 +14,13 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params, void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum,
#else
__global float* restrict global_bornSum,
#endif
__global const float4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
...@@ -99,12 +109,17 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -99,12 +109,17 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localTemp[tgx] = bornSum; localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) ((bornSum + localTemp[tgx])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum + localTemp[tgx];
#endif #endif
global_bornSum[offset] += bornSum+localTemp[tgx];
} }
// barrier not required here as localTemp is not accessed before encountering another barrier. // barrier not required here as localTemp is not accessed before encountering another barrier.
} }
...@@ -177,12 +192,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -177,12 +192,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localTemp[tgx] = bornSum; localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset1], (long) ((bornSum + localTemp[tgx])*0xFFFFFFFF));
atom_add(&global_bornSum[offset2], (long) ((localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Do both loads before both stores to minimize store-load waits. // Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1]; float sum1 = global_bornSum[offset1];
...@@ -191,6 +212,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -191,6 +212,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE]; sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1; global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2; global_bornSum[offset2] = sum2;
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
...@@ -216,7 +238,12 @@ typedef struct { ...@@ -216,7 +238,12 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce, void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
...@@ -316,10 +343,17 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -316,10 +343,17 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].temp_x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset], (long) ((force.w + localData[tgx].temp_w)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum = forceBuffers[offset]; float4 sum = forceBuffers[offset];
...@@ -330,6 +364,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -330,6 +364,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
global_sum += force.w + localData[tgx].temp_w; global_sum += force.w + localData[tgx].temp_w;
forceBuffers[offset] = sum; forceBuffers[offset] = sum;
global_bornForce[offset] = global_sum; global_bornForce[offset] = global_sum;
#endif
} }
// barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier. // barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
} }
...@@ -403,12 +438,25 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -403,12 +438,25 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
barrier(CLK_LOCAL_MEM_FENCE);
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].temp_x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset1], (long) ((force.w + localData[tgx].temp_w)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2], (long) ((localData[get_local_id(0)].fx + localForce[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localData[get_local_id(0)].fy + localForce[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localData[get_local_id(0)].fz + localForce[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset2], (long) ((localData[get_local_id(0)].fw + localForce[get_local_id(0)+TILE_SIZE].w)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1]; float4 sum1 = forceBuffers[offset1];
...@@ -427,6 +475,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -427,6 +475,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
forceBuffers[offset2] = sum2; forceBuffers[offset2] = sum2;
global_bornForce[offset1] = global_sum1; global_bornForce[offset1] = global_sum1;
global_bornForce[offset2] = global_sum2; global_bornForce[offset2] = global_sum2;
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32 #define TILE_SIZE 32
// Cannot use float3 as OpenCL defines it to be 4 DWORD aligned. This would // Cannot use float3 as OpenCL defines it to be 4 DWORD aligned. This would
...@@ -23,7 +28,13 @@ typedef struct { ...@@ -23,7 +28,13 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -148,6 +159,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -148,6 +159,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].fx)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0xFFFFFFFF));
#else
force.x += localData[tgx].fx; force.x += localData[tgx].fx;
force.y += localData[tgx].fy; force.y += localData[tgx].fy;
force.z += localData[tgx].fz; force.z += localData[tgx].fz;
...@@ -160,6 +177,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -160,6 +177,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
float4 sum = forceBuffers[offset]; float4 sum = forceBuffers[offset];
sum.xyz += force.xyz; sum.xyz += force.xyz;
forceBuffers[offset] = sum; forceBuffers[offset] = sum;
#endif
} }
// barrier not required here as localData[*].temp is not accessed before encountering another barrier. // barrier not required here as localData[*].temp is not accessed before encountering another barrier.
} }
...@@ -242,12 +260,22 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -242,12 +260,22 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].fx)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2], (long) ((localForce[tgx].x + localForce[tgx+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[tgx].y + localForce[tgx+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[tgx].z + localForce[tgx+TILE_SIZE].z)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1]; float4 sum1 = forceBuffers[offset1];
...@@ -260,6 +288,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -260,6 +288,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z; sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z;
forceBuffers[offset1] = sum1; forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2; forceBuffers[offset2] = sum2;
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment