Commit a5a7b902 authored by Peter Eastman's avatar Peter Eastman
Browse files

AMD support for 64 bit fixed point forces (currently disabled)

parent e62a6de6
......@@ -150,6 +150,9 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
}
}
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// Disable 64 bit atomics. A future version of the driver will support them, but until we can test that,
// it's safest not to use them.
supports64BitGlobalAtomics = false;
if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
/// \todo Is 6 a good value for the OpenCL CPU device?
// numThreadBlocksPerComputeUnit = ?;
......
......@@ -2397,6 +2397,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
......@@ -2445,6 +2447,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
......
......@@ -52,7 +52,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
if (context.getSupports64BitGlobalAtomics()) {
numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
numForceBuffers = 2;
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
}
else {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
......@@ -63,12 +64,18 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
else {
numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = OpenCLContext::ThreadBlockSize;
numForceBuffers = numForceThreadBlocks;
if (numForceBuffers >= context.getNumAtomBlocks()) {
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
if (context.getSupports64BitGlobalAtomics()) {
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
}
else {
numForceBuffers = numForceThreadBlocks;
if (numForceBuffers >= context.getNumAtomBlocks()) {
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
forceBufferPerAtomBlock = true;
numForceBuffers = context.getNumAtomBlocks();
forceBufferPerAtomBlock = true;
numForceBuffers = context.getNumAtomBlocks();
}
}
}
}
......
#define TILE_SIZE 32
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset1+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0xFFFFFFFF));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset2+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0xFFFFFFFF));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE];
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE];
#endif
/**
* Compute a force based on pair interactions.
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempForceBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
......@@ -127,12 +139,19 @@ void computeN2Energy(__global float4* restrict forceBuffers, __global float* res
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz + tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
#endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
STORE_DERIVATIVES_1
}
}
......@@ -208,15 +227,26 @@ void computeN2Energy(__global float4* restrict forceBuffers, __global float* res
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x+tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y+tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z+tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2], (long) ((local_force[get_local_id(0)].x+local_force[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].y+local_force[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].z+local_force[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
#endif
STORE_DERIVATIVES_1
STORE_DERIVATIVES_2
}
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
/**
......@@ -6,7 +10,13 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global float* restrict global_value, __local float* restrict local_value,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value,
#else
__global float* restrict global_value,
#endif
__local float* restrict local_value,
__local float* restrict tempBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
......@@ -122,12 +132,17 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
tempBuffer[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_value[offset], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_value[offset] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
#endif
global_value[offset] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
}
}
else {
......@@ -198,15 +213,22 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
tempBuffer[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&global_value[offset1], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
atom_add(&global_value[offset2], (long) ((local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_value[offset1] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
global_value[offset2] += local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE];
#endif
global_value[offset1] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
global_value[offset2] += local_value[get_local_id(0)]+local_value[get_local_id(0)+TILE_SIZE];
}
}
lasty = y;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
typedef struct {
......@@ -10,7 +14,13 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum,
#else
__global float* restrict global_bornSum,
#endif
__global const float4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
......@@ -99,12 +109,17 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) ((bornSum + localTemp[tgx])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum + localTemp[tgx];
#endif
global_bornSum[offset] += bornSum+localTemp[tgx];
}
// barrier not required here as localTemp is not accessed before encountering another barrier.
}
......@@ -177,12 +192,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset1], (long) ((bornSum + localTemp[tgx])*0xFFFFFFFF));
atom_add(&global_bornSum[offset2], (long) ((localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE])*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1];
......@@ -191,6 +212,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
}
......@@ -216,7 +238,12 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
......@@ -316,10 +343,17 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].temp_x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset], (long) ((force.w + localData[tgx].temp_w)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum = forceBuffers[offset];
......@@ -330,6 +364,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
global_sum += force.w + localData[tgx].temp_w;
forceBuffers[offset] = sum;
global_bornForce[offset] = global_sum;
#endif
}
// barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
}
......@@ -403,12 +438,25 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
barrier(CLK_LOCAL_MEM_FENCE);
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].temp_x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset1], (long) ((force.w + localData[tgx].temp_w)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2], (long) ((localData[get_local_id(0)].fx + localForce[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localData[get_local_id(0)].fy + localForce[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localData[get_local_id(0)].fz + localForce[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset2], (long) ((localData[get_local_id(0)].fw + localForce[get_local_id(0)+TILE_SIZE].w)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1];
......@@ -427,6 +475,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
forceBuffers[offset2] = sum2;
global_bornForce[offset1] = global_sum1;
global_bornForce[offset2] = global_sum2;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
}
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
// Cannot use float3 as OpenCL defines it to be 4 DWORD aligned. This would
......@@ -23,7 +28,13 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
......@@ -148,6 +159,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].fx)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0xFFFFFFFF));
#else
force.x += localData[tgx].fx;
force.y += localData[tgx].fy;
force.z += localData[tgx].fz;
......@@ -160,6 +177,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
float4 sum = forceBuffers[offset];
sum.xyz += force.xyz;
forceBuffers[offset] = sum;
#endif
}
// barrier not required here as localData[*].temp is not accessed before encountering another barrier.
}
......@@ -242,12 +260,22 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].fx)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2], (long) ((localForce[tgx].x + localForce[tgx+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[tgx].y + localForce[tgx+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[tgx].z + localForce[tgx+TILE_SIZE].z)*0xFFFFFFFF));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1];
......@@ -260,6 +288,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z;
forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment