"wrappers/python/src/vscode:/vscode.git/clone" did not exist on "3862202e4d325d8a5238797b2907ecb640752b53"
Unverified Commit edbc8407 authored by peastman's avatar peastman Committed by GitHub
Browse files

Common compute framework to unify CUDA and OpenCL code (#2488)

* Began creating common compute framework to unify code between CUDA and OpenCL

* Began OpenCL implementation of common compute framework

* Common implementation of CMMotionRemover

* CUDA implementation of common compute interface

* Converted HarmonicBondForce to common compute API

* Converted standard bonded forces to common compute API

* Converted ExpressionUtilities to common compute API

* Created ComputeParameterSet

* Converted custom bonded forces to common compute API

* Converted CustomCentroidBondForce to common compute API

* Converted CustomManyParticleForce to common compute API

* Moved lots of duplicate code from CudaContext and OpenCLContext to ComputeContext

* Converted GayBerneForce to common compute API

* Removed obsolete kernels

* Converted verlet integrators to common compute API

* Converted Langevin and Brownian integrators to common compute API

* Converted CustomIntegrator to common compute API

* Converted CustomNonbondedForce to common compute API

* Removed uses of a deprecated API

* Fixed failing test cases

* Converted GBSAOBCForce to common compute API

* Began converting CustomGBForce to common compute API

* Finished converting CustomGBForce to common compute API

* Merged duplicated code in CudaIntegrationUtilities and OpenCLIntegrationUtilities

* Converted RMSDForce and AndersenThermostat to common compute API

* Converted CustomHbondForce to common compute API

* Merged scripts for encoding kernel sources

* Converted Drude plugin to common compute API

* Fixed errors in CMake scripts

* Attempt at fixing errors on Windows

* Added discussion of common compute API to developer guide

* Added Windows export macro for common classes

* Fixed error in CMMotionRemover

* Updated travis to newer Ubuntu version

* Fixed errors on CPU OpenCL

* Fixed Windows linking errors

* Added missing pragma for 32 bit atomics

* Replaced long long with mm_long

* More fixes to Windows linking

* Bug fix
parent 38beeefe
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000)); #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[LOCAL_ID]*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
#else #else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1; #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)]; #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
#endif #endif
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
__kernel void computeN2Energy( KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers,
#else #else
__global real4* restrict forceBuffers, GLOBAL real4* RESTRICT forceBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __local real4* restrict local_force, GLOBAL mixed* RESTRICT energyBuffer,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
__global const ushort2* exclusionTiles, int needEnergy, GLOBAL const ushort2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = LOCAL_ID - tgx;
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS INIT_PARAM_DERIVS
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real4 force = 0; real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx]; unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
...@@ -53,14 +55,14 @@ __kernel void computeN2Energy( ...@@ -53,14 +55,14 @@ __kernel void computeN2Energy(
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = LOCAL_ID;
local_posq[localAtomIndex] = posq1; local_pos[localAtomIndex] = pos1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS; SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -84,8 +86,10 @@ __kernel void computeN2Energy( ...@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
} }
if (needEnergy) if (needEnergy)
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -98,11 +102,11 @@ __kernel void computeN2Energy( ...@@ -98,11 +102,11 @@ __kernel void computeN2Energy(
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = LOCAL_ID;
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex] = 0; local_force[localAtomIndex] = make_real3(0);
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
SYNC_WARPS; SYNC_WARPS;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -111,8 +115,8 @@ __kernel void computeN2Energy( ...@@ -111,8 +115,8 @@ __kernel void computeN2Energy(
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -126,7 +130,7 @@ __kernel void computeN2Energy( ...@@ -126,7 +130,7 @@ __kernel void computeN2Energy(
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f; const real interactionScale = 1;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
...@@ -136,10 +140,12 @@ __kernel void computeN2Energy( ...@@ -136,10 +140,12 @@ __kernel void computeN2Energy(
} }
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj; atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz; local_force[atom2] += delta;
RECORD_DERIVATIVE_2 RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
...@@ -151,20 +157,20 @@ __kernel void computeN2Energy( ...@@ -151,20 +157,20 @@ __kernel void computeN2Energy(
SYNC_WARPS; SYNC_WARPS;
} }
} }
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
#else #else
...@@ -175,7 +181,7 @@ __kernel void computeN2Energy( ...@@ -175,7 +181,7 @@ __kernel void computeN2Energy(
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
if (x != y) { if (x != y) {
offset = offset2; offset = offset2;
forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f); forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
#endif #endif
...@@ -188,21 +194,21 @@ __kernel void computeN2Energy( ...@@ -188,21 +194,21 @@ __kernel void computeN2Energy(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else #else
int pos = (int) (warp*(long)numTiles/totalWarps); int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps); int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE]; LOCAL int atomIndices[LOCAL_BUFFER_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE]; LOCAL volatile int skipTiles[LOCAL_BUFFER_SIZE];
skipTiles[get_local_id(0)] = -1; skipTiles[LOCAL_ID] = -1;
while (pos < end) { while (pos < end) {
const bool isExcluded = false; const bool isExcluded = false;
real4 force = 0; real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
bool includeTile = true; bool includeTile = true;
...@@ -231,10 +237,10 @@ __kernel void computeN2Energy( ...@@ -231,10 +237,10 @@ __kernel void computeN2Energy(
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[get_local_id(0)] = end; skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = tbx;
SYNC_WARPS; SYNC_WARPS;
...@@ -247,20 +253,20 @@ __kernel void computeN2Energy( ...@@ -247,20 +253,20 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile. // Load atom data for this tile.
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = LOCAL_ID;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx]; unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[get_local_id(0)] = j; atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex] = 0; local_force[localAtomIndex] = make_real3(0);
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
} }
SYNC_WARPS; SYNC_WARPS;
...@@ -270,14 +276,14 @@ __kernel void computeN2Energy( ...@@ -270,14 +276,14 @@ __kernel void computeN2Energy(
// box, then skip having to apply periodic boundary conditions later. // box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[get_local_id(0)], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[LOCAL_ID], blockCenterX)
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -286,17 +292,19 @@ __kernel void computeN2Energy( ...@@ -286,17 +292,19 @@ __kernel void computeN2Energy(
atom2 = atomIndices[tbx+tj]; atom2 = atomIndices[tbx+tj];
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f; const real interactionScale = 1;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
} }
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj; atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz; local_force[atom2] += delta;
RECORD_DERIVATIVE_2 RECORD_DERIVATIVE_2
} }
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
...@@ -311,8 +319,8 @@ __kernel void computeN2Energy( ...@@ -311,8 +319,8 @@ __kernel void computeN2Energy(
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -326,17 +334,19 @@ __kernel void computeN2Energy( ...@@ -326,17 +334,19 @@ __kernel void computeN2Energy(
atom2 = atomIndices[tbx+tj]; atom2 = atomIndices[tbx+tj];
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f; const real interactionScale = 1;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
} }
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj; atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz; local_force[atom2] += delta;
RECORD_DERIVATIVE_2 RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
...@@ -347,22 +357,22 @@ __kernel void computeN2Energy( ...@@ -347,22 +357,22 @@ __kernel void computeN2Energy(
} }
// Write results. // Write results.
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)]; unsigned int atom2 = atomIndices[LOCAL_ID];
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
unsigned int offset = atom1; unsigned int offset = atom1;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], (long) (local_force[get_local_id(0)].x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
offset = atom2; offset = atom2;
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
...@@ -373,7 +383,7 @@ __kernel void computeN2Energy( ...@@ -373,7 +383,7 @@ __kernel void computeN2Energy(
unsigned int offset = offset1; unsigned int offset = offset1;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f); forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
offset = offset2; offset = offset2;
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
...@@ -381,6 +391,6 @@ __kernel void computeN2Energy( ...@@ -381,6 +391,6 @@ __kernel void computeN2Energy(
} }
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
SAVE_PARAM_DERIVS SAVE_PARAM_DERIVS
} }
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000)); #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[tgx]*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[tgx]*0x100000000));
#else #else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1; #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx]; #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
...@@ -10,30 +9,33 @@ ...@@ -10,30 +9,33 @@
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
__kernel void computeN2Energy( KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers,
#else #else
__global real4* restrict forceBuffers, GLOBAL real4* RESTRICT forceBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __local real4* restrict local_force, GLOBAL mixed* RESTRICT energyBuffer,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
__global const ushort2* exclusionTiles, int needEnergy, GLOBAL const ushort2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS INIT_PARAM_DERIVS
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
...@@ -43,7 +45,7 @@ __kernel void computeN2Energy( ...@@ -43,7 +45,7 @@ __kernel void computeN2Energy(
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) { for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex; unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
if (x == y) { if (x == y) {
...@@ -56,15 +58,15 @@ __kernel void computeN2Energy( ...@@ -56,15 +58,15 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta, delta);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
...@@ -84,8 +86,10 @@ __kernel void computeN2Energy( ...@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
dEdR /= -r; dEdR /= -r;
} }
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -98,12 +102,12 @@ __kernel void computeN2Energy( ...@@ -98,12 +102,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#endif #endif
...@@ -123,11 +127,11 @@ __kernel void computeN2Energy( ...@@ -123,11 +127,11 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -153,8 +157,10 @@ __kernel void computeN2Energy( ...@@ -153,8 +157,10 @@ __kernel void computeN2Energy(
dEdR /= -r; dEdR /= -r;
} }
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = j; atom2 = j;
local_force[atom2].xyz += delta.xyz; local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2 RECORD_DERIVATIVE_2
...@@ -170,12 +176,12 @@ __kernel void computeN2Energy( ...@@ -170,12 +176,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#endif #endif
...@@ -186,12 +192,12 @@ __kernel void computeN2Energy( ...@@ -186,12 +192,12 @@ __kernel void computeN2Energy(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx; unsigned int offset = y*TILE_SIZE+tgx;
atom_add(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
#else #else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz; forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
#endif #endif
...@@ -206,15 +212,15 @@ __kernel void computeN2Energy( ...@@ -206,15 +212,15 @@ __kernel void computeN2Energy(
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else #else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0)); int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif #endif
int nextToSkip = -1; int nextToSkip = -1;
int currentSkipIndex = 0; int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE]; LOCAL int atomIndices[TILE_SIZE];
while (pos < end) { while (pos < end) {
const bool isExcluded = false; const bool isExcluded = false;
...@@ -261,7 +267,7 @@ __kernel void computeN2Energy( ...@@ -261,7 +267,7 @@ __kernel void computeN2Energy(
#endif #endif
atomIndices[localAtomIndex] = j; atomIndices[localAtomIndex] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex] = 0; local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
...@@ -274,17 +280,17 @@ __kernel void computeN2Energy( ...@@ -274,17 +280,17 @@ __kernel void computeN2Energy(
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[tgx], blockCenterX)
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -298,8 +304,10 @@ __kernel void computeN2Energy( ...@@ -298,8 +304,10 @@ __kernel void computeN2Energy(
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = j; atom2 = j;
local_force[atom2].xyz += delta.xyz; local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2 RECORD_DERIVATIVE_2
...@@ -310,12 +318,12 @@ __kernel void computeN2Energy( ...@@ -310,12 +318,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#endif #endif
...@@ -330,11 +338,11 @@ __kernel void computeN2Energy( ...@@ -330,11 +338,11 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -355,10 +363,12 @@ __kernel void computeN2Energy( ...@@ -355,10 +363,12 @@ __kernel void computeN2Energy(
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = j; atom2 = j;
local_force[atom2].xyz += delta.xyz; local_force[atom2] += delta;
RECORD_DERIVATIVE_2 RECORD_DERIVATIVE_2
} }
} }
...@@ -367,12 +377,12 @@ __kernel void computeN2Energy( ...@@ -367,12 +377,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#endif #endif
...@@ -389,13 +399,13 @@ __kernel void computeN2Energy( ...@@ -389,13 +399,13 @@ __kernel void computeN2Energy(
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], (long) (local_force[tgx].x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
unsigned int offset = atom2; unsigned int offset = atom2;
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
#else #else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz; forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
#endif #endif
...@@ -404,6 +414,6 @@ __kernel void computeN2Energy( ...@@ -404,6 +414,6 @@ __kernel void computeN2Energy(
} }
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
SAVE_PARAM_DERIVS SAVE_PARAM_DERIVS
} }
...@@ -9,24 +9,29 @@ ...@@ -9,24 +9,29 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms. * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/ */
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers, int bufferSize, int numBuffers
#endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS INIT_PARAM_DERIVS
unsigned int index = get_global_id(0); for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
while (index < NUM_ATOMS) {
// Reduce the derivatives // Reduce the derivatives
#ifndef SUPPORTS_64_BIT_ATOMICS
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
#endif
REDUCE_DERIVATIVES REDUCE_DERIVATIVES
// Now calculate the per-particle energy terms. // Now calculate the per-particle energy terms.
real4 pos = posq[index]; real4 pos = posq[index];
real4 force = (real4) 0; real3 force = make_real3(0, 0, 0);
COMPUTE_ENERGY COMPUTE_ENERGY
index += get_global_size(0);
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
SAVE_PARAM_DERIVS SAVE_PARAM_DERIVS
} }
...@@ -2,17 +2,30 @@ ...@@ -2,17 +2,30 @@
* Compute chain rule terms for computed values that depend explicitly on particle coordinates. * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/ */
extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers
#endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
INIT_PARAM_DERIVS INIT_PARAM_DERIVS
const real scale = RECIP((real) 0x100000000); const real scale = RECIP((real) 0x100000000);
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
real4 pos = posq[index]; real4 pos = posq[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]); real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
#else
real3 force = trimTo3(forceBuffers[index]);
#endif
COMPUTE_FORCES COMPUTE_FORCES
forceBuffers[index] = (long long) (force.x*0x100000000); #ifdef SUPPORTS_64_BIT_ATOMICS
forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0x100000000); forceBuffers[index] = (mm_long) (force.x*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0x100000000); forceBuffers[index+PADDED_NUM_ATOMS] = (mm_long) (force.y*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS*2] = (mm_long) (force.z*0x100000000);
#else
forceBuffers[index] = make_real4(force.x, force.y, force.z, 0);
#endif
} }
SAVE_PARAM_DERIVS SAVE_PARAM_DERIVS
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
/** /**
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
__global const ushort2* exclusionTiles, GLOBAL const ushort2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value, GLOBAL mm_ulong* RESTRICT global_value,
#else #else
__global real* restrict global_value, GLOBAL real* RESTRICT global_value,
#endif #endif
__local real* restrict local_value,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = LOCAL_ID - tgx;
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real local_value[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real value = 0; real value = 0;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx]; unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
...@@ -44,14 +42,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -44,14 +42,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = LOCAL_ID;
local_posq[localAtomIndex] = posq1; local_pos[localAtomIndex] = pos1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS; SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -87,9 +85,9 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -87,9 +85,9 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = LOCAL_ID;
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex] = 0; local_value[localAtomIndex] = 0;
SYNC_WARPS; SYNC_WARPS;
...@@ -99,8 +97,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -99,8 +97,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -141,11 +139,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -141,11 +139,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = x*TILE_SIZE + tgx; unsigned int offset1 = x*TILE_SIZE + tgx;
atom_add(&global_value[offset1], (long) (value*0x100000000)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
if (x != y) { if (x != y) {
unsigned int offset2 = y*TILE_SIZE + tgx; unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&global_value[offset2], (long) (local_value[get_local_id(0)]*0x100000000)); ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[LOCAL_ID]*0x100000000)));
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
#else #else
...@@ -154,7 +152,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -154,7 +152,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
global_value[offset1] += value; global_value[offset1] += value;
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
if (x != y) { if (x != y) {
global_value[offset2] += local_value[get_local_id(0)]; global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
#endif #endif
...@@ -167,17 +165,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -167,17 +165,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else #else
int pos = (int) (warp*(long)numTiles/totalWarps); int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps); int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE]; LOCAL int atomIndices[LOCAL_BUFFER_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE]; LOCAL volatile int skipTiles[LOCAL_BUFFER_SIZE];
skipTiles[get_local_id(0)] = -1; skipTiles[LOCAL_ID] = -1;
while (pos < end) { while (pos < end) {
real value = 0; real value = 0;
...@@ -208,10 +206,10 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -208,10 +206,10 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[get_local_id(0)] = end; skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = tbx;
SYNC_WARPS; SYNC_WARPS;
...@@ -225,17 +223,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -225,17 +223,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// Load atom data for this tile. // Load atom data for this tile.
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = LOCAL_ID;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx]; unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[get_local_id(0)] = j; atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex] = 0; local_value[localAtomIndex] = 0;
} }
...@@ -246,14 +244,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -246,14 +244,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// box, then skip having to apply periodic boundary conditions later. // box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[get_local_id(0)], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[LOCAL_ID], blockCenterX)
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -278,12 +276,12 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -278,12 +276,12 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif #endif
{ {
// We need to apply periodic boundary conditions separately for each interaction. // We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2]; real3 pos2 = local_pos[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -313,19 +311,19 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -313,19 +311,19 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
} }
// Write results. // Write results.
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)]; unsigned int atom2 = atomIndices[LOCAL_ID];
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
unsigned int offset2 = atom2; unsigned int offset2 = atom2;
atom_add(&global_value[offset2], (long) (local_value[get_local_id(0)]*0x100000000)); ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[LOCAL_ID]*0x100000000)));
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
#else #else
...@@ -334,7 +332,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -334,7 +332,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS; unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[get_local_id(0)]; global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
#endif #endif
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
/** /**
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
__global const ushort2* exclusionTiles, GLOBAL const ushort2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value, GLOBAL mm_ulong* RESTRICT global_value,
#else #else
__global real* restrict global_value, GLOBAL real* RESTRICT global_value,
#endif #endif
__local real* restrict local_value,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real local_value[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
...@@ -35,7 +33,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -35,7 +33,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) { for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex; unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
if (x == y) { if (x == y) {
...@@ -47,11 +45,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -47,11 +45,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif #endif
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0; real value = 0;
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -88,7 +86,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -88,7 +86,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else #else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value; global_value[offset1] += value;
...@@ -107,11 +105,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -107,11 +105,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif #endif
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0; real value = 0;
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -150,7 +148,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -150,7 +148,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else #else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value; global_value[offset1] += value;
...@@ -163,7 +161,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -163,7 +161,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = y*TILE_SIZE+tgx; unsigned int offset2 = y*TILE_SIZE+tgx;
atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000)); ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[tgx]*0x100000000)));
#else #else
unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx]; global_value[offset2] += local_value[tgx];
...@@ -180,15 +178,15 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -180,15 +178,15 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
#else #else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0)); int pos = (int) (get_group_id(0)*(mm_long)numTiles/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0)); int end = (int) ((get_group_id(0)+1)*(mm_long)numTiles/get_num_groups(0));
#endif #endif
int nextToSkip = -1; int nextToSkip = -1;
int currentSkipIndex = 0; int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE]; LOCAL int atomIndices[TILE_SIZE];
while (pos < end) { while (pos < end) {
bool includeTile = true; bool includeTile = true;
...@@ -234,7 +232,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -234,7 +232,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif #endif
atomIndices[localAtomIndex] = j; atomIndices[localAtomIndex] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j]; local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex] = 0; local_value[localAtomIndex] = 0;
} }
...@@ -246,16 +244,16 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -246,16 +244,16 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[tgx], blockCenterX)
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0; real value = 0;
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -277,7 +275,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -277,7 +275,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else #else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value; global_value[offset1] += value;
...@@ -293,11 +291,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -293,11 +291,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0; real value = 0;
real4 posq1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j]; real3 pos2 = local_pos[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -326,7 +324,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -326,7 +324,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else #else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value; global_value[offset1] += value;
...@@ -346,7 +344,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -346,7 +344,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = atom2; unsigned int offset2 = atom2;
atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000)); ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[tgx]*0x100000000)));
#else #else
unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx]; global_value[offset2] += local_value[tgx];
......
...@@ -2,19 +2,18 @@ ...@@ -2,19 +2,18 @@
* Reduce a pairwise computed value, and compute per-particle values. * Reduce a pairwise computed value, and compute per-particle values.
*/ */
__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global real4* posq, KERNEL void computePerParticleValues(GLOBAL real4* posq,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* valueBuffers GLOBAL mm_long* valueBuffers
#else #else
__global real* valueBuffers GLOBAL real* valueBuffers, int bufferSize, int numBuffers
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0); for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
while (index < NUM_ATOMS) {
// Reduce the pairwise value // Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
real sum = (1.0f/0x100000000)*valueBuffers[index]; real sum = valueBuffers[index]/(real) 0x100000000;
#else #else
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
real sum = valueBuffers[index]; real sum = valueBuffers[index];
...@@ -27,6 +26,5 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global ...@@ -27,6 +26,5 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global
real4 pos = posq[index]; real4 pos = posq[index];
COMPUTE_VALUES COMPUTE_VALUES
index += get_global_size(0);
} }
} }
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
* Compute the difference between two vectors, optionally taking periodic boundary conditions into account * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude. * and setting the fourth component to the squared magnitude.
*/ */
real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { inline DEVICE real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0); real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(result) APPLY_PERIODIC_TO_DELTA(result)
#endif #endif
...@@ -14,73 +14,79 @@ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxS ...@@ -14,73 +14,79 @@ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxS
/** /**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude. * Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/ */
real computeAngle(real4 vec1, real4 vec2) { inline DEVICE real computeAngle(real4 vec1, real4 vec2) {
real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z; real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
real cosine = dotProduct*RSQRT(vec1.w*vec2.w); real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
real angle; real angle;
if (cosine > 0.99f || cosine < -0.99f) { if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead. // We're close to the singularity in acos(), so take the cross product and use asin() instead.
real4 crossProduct = cross(vec1, vec2); real3 crossProduct = cross(trimTo3(vec1), trimTo3(vec2));
real scale = vec1.w*vec2.w; real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale)); angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f) if (cosine < 0)
angle = PI-angle; angle = M_PI-angle;
} }
else else
angle = acos(cosine); angle = ACOS(cosine);
return angle; return angle;
} }
/** /**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude. * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/ */
real4 computeCross(real4 vec1, real4 vec2) { inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
real4 result = cross(vec1, vec2); real3 cp = cross(trimTo3(vec1), trimTo3(vec2));
result.w = result.x*result.x + result.y*result.y + result.z*result.z; return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
return result;
} }
/** /**
* Compute forces on donors. * Compute forces on donors.
*/ */
__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions, KERNEL void computeDonorForces(
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize, #ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT donorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE];
mixed energy = 0; mixed energy = 0;
real4 f1 = (real4) 0; real3 f1 = make_real3(0);
real4 f2 = (real4) 0; real3 f2 = make_real3(0);
real4 f3 = (real4) 0; real3 f3 = make_real3(0);
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) { for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += GLOBAL_SIZE) {
// Load information about the donor this thread will compute forces on. // Load information about the donor this thread will compute forces on.
int donorIndex = donorStart+get_global_id(0); int donorIndex = donorStart+GLOBAL_ID;
int4 atoms, exclusionIndices; int4 atoms, exclusionIndices;
real4 d1, d2, d3; real4 d1, d2, d3;
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
atoms = donorAtoms[donorIndex]; atoms = donorAtoms[donorIndex];
d1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0); d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
d2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0); d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
d3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0); d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[donorIndex]; exclusionIndices = exclusions[donorIndex];
#endif #endif
} }
else else
atoms = (int4) (-1, -1, -1, -1); atoms = make_int4(-1, -1, -1, -1);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_local_size(0)) { for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += LOCAL_SIZE) {
// Load the next block of acceptors into local memory. // Load the next block of acceptors into local memory.
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart); int blockSize = min((int) LOCAL_SIZE, NUM_ACCEPTORS-acceptorStart);
if (get_local_id(0) < blockSize) { if (LOCAL_ID < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)]; int4 atoms2 = acceptorAtoms[acceptorStart+LOCAL_ID];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0); posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0); posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0); posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
} }
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
for (int index = 0; index < blockSize; index++) { for (int index = 0; index < blockSize; index++) {
int acceptorIndex = acceptorStart+index; int acceptorIndex = acceptorStart+index;
...@@ -108,6 +114,26 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global ...@@ -108,6 +114,26 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
// Write results // Write results
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) ((mm_long) (f1.x*0x100000000)));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.y*0x100000000)));
ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.z*0x100000000)));
MEM_FENCE;
}
if (atoms.y > -1) {
ATOMIC_ADD(&force[atoms.y], (mm_ulong) ((mm_long) (f2.x*0x100000000)));
ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.y*0x100000000)));
ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.z*0x100000000)));
MEM_FENCE;
}
if (atoms.z > -1) {
ATOMIC_ADD(&force[atoms.z], (mm_ulong) ((mm_long) (f3.x*0x100000000)));
ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.y*0x100000000)));
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.z*0x100000000)));
MEM_FENCE;
}
#else
int4 bufferIndices = donorBufferIndices[donorIndex]; int4 bufferIndices = donorBufferIndices[donorIndex];
if (atoms.x > -1) { if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS; unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
...@@ -127,49 +153,57 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global ...@@ -127,49 +153,57 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
force.xyz += f3.xyz; force.xyz += f3.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
#endif
} }
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
/** /**
* Compute forces on acceptors. * Compute forces on acceptors.
*/ */
__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions, KERNEL void computeAcceptorForces(
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize, #ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT acceptorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real4 f1 = (real4) 0; LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE];
real4 f2 = (real4) 0; real3 f1 = make_real3(0);
real4 f3 = (real4) 0; real3 f2 = make_real3(0);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) { real3 f3 = make_real3(0);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += GLOBAL_SIZE) {
// Load information about the acceptor this thread will compute forces on. // Load information about the acceptor this thread will compute forces on.
int acceptorIndex = acceptorStart+get_global_id(0); int acceptorIndex = acceptorStart+GLOBAL_ID;
int4 atoms, exclusionIndices; int4 atoms, exclusionIndices;
real4 a1, a2, a3; real4 a1, a2, a3;
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
atoms = acceptorAtoms[acceptorIndex]; atoms = acceptorAtoms[acceptorIndex];
a1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0); a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
a2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0); a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
a3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0); a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[acceptorIndex]; exclusionIndices = exclusions[acceptorIndex];
#endif #endif
} }
else else
atoms = (int4) (-1, -1, -1, -1); atoms = make_int4(-1, -1, -1, -1);
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_local_size(0)) { for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += LOCAL_SIZE) {
// Load the next block of donors into local memory. // Load the next block of donors into local memory.
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart); int blockSize = min((int) LOCAL_SIZE, NUM_DONORS-donorStart);
if (get_local_id(0) < blockSize) { if (LOCAL_ID < blockSize) {
int4 atoms2 = donorAtoms[donorStart+get_local_id(0)]; int4 atoms2 = donorAtoms[donorStart+LOCAL_ID];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0); posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0); posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0); posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
} }
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
for (int index = 0; index < blockSize; index++) { for (int index = 0; index < blockSize; index++) {
int donorIndex = donorStart+index; int donorIndex = donorStart+index;
...@@ -197,6 +231,26 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo ...@@ -197,6 +231,26 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
// Write results // Write results
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) ((mm_long) (f1.x*0x100000000)));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.y*0x100000000)));
ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.z*0x100000000)));
MEM_FENCE;
}
if (atoms.y > -1) {
ATOMIC_ADD(&force[atoms.y], (mm_ulong) ((mm_long) (f2.x*0x100000000)));
ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.y*0x100000000)));
ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.z*0x100000000)));
MEM_FENCE;
}
if (atoms.z > -1) {
ATOMIC_ADD(&force[atoms.z], (mm_ulong) ((mm_long) (f3.x*0x100000000)));
ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.y*0x100000000)));
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.z*0x100000000)));
MEM_FENCE;
}
#else
int4 bufferIndices = acceptorBufferIndices[acceptorIndex]; int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
if (atoms.x > -1) { if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS; unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
...@@ -216,6 +270,7 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo ...@@ -216,6 +270,7 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
force.xyz += f3.xyz; force.xyz += f3.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
#endif
} }
} }
} }
extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer, float* result) { KERNEL void computeFloatSum(GLOBAL const float* RESTRICT sumBuffer, GLOBAL float* result, int bufferSize) {
__shared__ float tempBuffer[WORK_GROUP_SIZE]; LOCAL float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = threadIdx.x; const unsigned int thread = LOCAL_ID;
float sum = 0; float sum = 0;
for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x) for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE)
sum += sumBuffer[index]; sum += sumBuffer[index];
tempBuffer[thread] = sum; tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) { for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
__syncthreads(); SYNC_THREADS;
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE) if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i]; tempBuffer[thread] += tempBuffer[thread+i];
} }
...@@ -14,24 +14,26 @@ extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer, ...@@ -14,24 +14,26 @@ extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer,
*result = tempBuffer[0]; *result = tempBuffer[0];
} }
extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer, double* result) { #ifdef SUPPORTS_DOUBLE_PRECISION
__shared__ double tempBuffer[WORK_GROUP_SIZE]; KERNEL void computeDoubleSum(GLOBAL const double* RESTRICT sumBuffer, GLOBAL double* result, int bufferSize) {
const unsigned int thread = threadIdx.x; LOCAL double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = LOCAL_ID;
double sum = 0; double sum = 0;
for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x) for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE)
sum += sumBuffer[index]; sum += sumBuffer[index];
tempBuffer[thread] = sum; tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) { for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
__syncthreads(); SYNC_THREADS;
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE) if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i]; tempBuffer[thread] += tempBuffer[thread+i];
} }
if (thread == 0) if (thread == 0)
*result = tempBuffer[0]; *result = tempBuffer[0];
} }
#endif
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) { KERNEL void applyPositionDeltas(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, GLOBAL mixed4* RESTRICT posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index]; real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index]; real4 pos2 = posqCorrection[index];
...@@ -48,14 +50,14 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* ...@@ -48,14 +50,14 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4*
#else #else
posq[index] = pos; posq[index] = pos;
#endif #endif
posDelta[index] = make_mixed4(0, 0, 0, 0); posDelta[index] = make_mixed4(0);
} }
} }
extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) { KERNEL void generateRandomNumbers(int numValues, GLOBAL float4* RESTRICT random, GLOBAL uint4* RESTRICT seed) {
uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x]; uint4 state = seed[GLOBAL_ID];
unsigned int carry = 0; unsigned int carry = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numValues; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < numValues; index += GLOBAL_SIZE) {
// Generate three uniform random numbers. // Generate three uniform random numbers.
state.x = state.x * 69069 + 1; state.x = state.x * 69069 + 1;
...@@ -93,5 +95,5 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -93,5 +95,5 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
random[index] = make_float4(x1, x2, x3, 0.0f); random[index] = make_float4(x1, x2, x3, 0.0f);
} }
seed[blockIdx.x*blockDim.x+threadIdx.x] = state; seed[GLOBAL_ID] = state;
} }
// Working-precision types for the per-DOF integration step.
// When SUPPORTS_DOUBLE_PRECISION is defined, intermediate values are held in
// double/double3/double4; otherwise they fall back to float/float3/float4.
// The make_* and convertTo* macros give both configurations the same spelling.
#ifdef SUPPORTS_DOUBLE_PRECISION
typedef double TempType;
typedef double3 TempType3;
typedef double4 TempType4;
// Forward all arguments unchanged to the matching vector constructor.
// NOTE(review): `a...` is the GNU named-variadic macro form - confirm every target compiler accepts it.
#define make_TempType3(a...) make_double3(a)
#define make_TempType4(a...) make_double4(a)
// Component-wise conversion of any vector with x/y/z (and w) members.
// The argument is expanded once per component, so it should be free of side effects.
#define convertToTempType3(a) make_double3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w)
// Narrow a double4 to mixed4. Only defined in the double-precision configuration.
inline DEVICE mixed4 convertFromDouble4(double4 a) {
return make_mixed4(a.x, a.y, a.z, a.w);
}
#else
typedef float TempType;
typedef float3 TempType3;
typedef float4 TempType4;
// Single-precision counterparts of the macros above.
#define make_TempType3(a...) make_float3(a)
#define make_TempType4(a...) make_float4(a)
#define convertToTempType3(a) make_float3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w)
#endif
/**
 * Read the position of particle `index` as a TempType4.
 *
 * With USE_MIXED_PRECISION the x, y and z components are the sum of the value
 * in posq and the matching value in posqCorrection (added in `mixed`
 * precision); w is taken from posq alone. Without it, the posq entry is simply
 * converted component by component and posqCorrection is not read.
 */
inline DEVICE TempType4 loadPos(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
    const real4 base = posq[index];
    const real4 correction = posqCorrection[index];
    return make_TempType4(base.x+(mixed)correction.x,
                          base.y+(mixed)correction.y,
                          base.z+(mixed)correction.z,
                          base.w);
#else
    const real4 stored = posq[index];
    return convertToTempType4(stored);
#endif
}
/**
 * Write the position of particle `index` back to global memory.
 *
 * With USE_MIXED_PRECISION each of x, y, z is split in two: the value narrowed
 * to `real` goes to posq, and what the narrowing dropped (original minus the
 * narrowed value) goes to posqCorrection, whose w is set to 0. Without it,
 * only posq is written.
 */
inline DEVICE void storePos(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, int index, TempType4 pos) {
#ifdef USE_MIXED_PRECISION
    // Narrow once and reuse, so the stored value and the residual agree.
    const real narrowX = (real) pos.x;
    const real narrowY = (real) pos.y;
    const real narrowZ = (real) pos.z;
    const real narrowW = (real) pos.w;
    posq[index] = make_real4(narrowX, narrowY, narrowZ, narrowW);
    posqCorrection[index] = make_real4(pos.x-narrowX, pos.y-narrowY, pos.z-narrowZ, 0);
#else
    posq[index] = make_real4(pos.x, pos.y, pos.z, pos.w);
#endif
}
/**
 * Run one per-degree-of-freedom computation of a custom integrator on every atom.
 *
 * Each thread walks the atoms starting at GLOBAL_ID in steps of GLOBAL_SIZE.
 * For each atom it loads the position (plus posDelta when LOAD_POS_AS_DELTA is
 * defined), the velocity and the force, then expands COMPUTE_STEP, which is
 * supplied at compile time. COMPUTE_STEP is skipped when velocity.w is zero.
 *
 * The locals stepSize, forceScale, position, velocity, f, mass, gaussianIndex
 * and uniformIndex, as well as the kernel arguments not referenced directly
 * below (globals, sum, gaussianValues, uniformValues, energy,
 * energyParamDerivs), are presumably consumed by the COMPUTE_STEP expansion;
 * their names must therefore stay unchanged.
 * NOTE(review): velocity.w looks like the inverse mass (mass = RECIP(velocity.w)) - confirm.
 *
 * @param force  fixed-point force components laid out as three consecutive
 *               arrays of PADDED_NUM_ATOMS entries (x, then y, then z)
 * @param dt     step size information; element 0's y component is used as the step size
 * @param gaussianBaseIndex  starting value of gaussianIndex for every atom
 */
KERNEL void computePerDof(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, GLOBAL mixed4* RESTRICT posDelta,
        GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed2* RESTRICT dt, GLOBAL const mixed* RESTRICT globals,
        GLOBAL mixed* RESTRICT sum, GLOBAL const float4* RESTRICT gaussianValues, unsigned int gaussianBaseIndex, GLOBAL const float4* RESTRICT uniformValues,
        const mixed energy, GLOBAL mixed* RESTRICT energyParamDerivs
        PARAMETER_ARGUMENTS) {
    TempType3 stepSize = make_TempType3(dt[0].y);
    // Forces are accumulated in 64-bit fixed point with a scale factor of
    // 0x100000000 (2^32), so decoding must divide by that same value.
    // Dividing by 0xFFFFFFFF would inflate every force by about 2.3e-10.
    const TempType forceScale = ((TempType) 1)/(TempType) 0x100000000;
    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
#ifdef LOAD_POS_AS_DELTA
        TempType4 position = loadPos(posq, posqCorrection, index) + convertToTempType4(posDelta[index]);
#else
        TempType4 position = loadPos(posq, posqCorrection, index);
#endif
        TempType4 velocity = convertToTempType4(velm[index]);
        TempType3 f = make_TempType3(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2]);
        // Computed before the zero check; it is only used inside the guarded
        // COMPUTE_STEP, so an infinite value for velocity.w == 0 is never consumed here.
        TempType3 mass = make_TempType3(RECIP(velocity.w));
        if (velocity.w != 0.0) {
            int gaussianIndex = gaussianBaseIndex;
            int uniformIndex = 0;
            COMPUTE_STEP
        }
    }
}
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
/** /**
* Record the force on an atom to global memory. * Record the force on an atom to global memory.
*/ */
inline void storeForce(int atom, real4 force, __global long* restrict forceBuffers) { inline DEVICE void storeForce(int atom, real3 force, GLOBAL mm_ulong* RESTRICT forceBuffers) {
atom_add(&forceBuffers[atom], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[atom+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
} }
/** /**
* Compute the difference between two vectors, taking periodic boundary conditions into account * Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude. * and setting the fourth component to the squared magnitude.
*/ */
inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { inline DEVICE real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f); real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(result) APPLY_PERIODIC_TO_DELTA(result)
#endif #endif
...@@ -26,36 +23,36 @@ inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPerio ...@@ -26,36 +23,36 @@ inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPerio
/** /**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude. * Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/ */
real computeAngle(real4 vec1, real4 vec2) { DEVICE real computeAngle(real4 vec1, real4 vec2) {
real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z; real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
real cosine = dotProduct*RSQRT(vec1.w*vec2.w); real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
real angle; real angle;
if (cosine > 0.99f || cosine < -0.99f) { if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead. // We're close to the singularity in acos(), so take the cross product and use asin() instead.
real4 crossProduct = cross(vec1, vec2); real3 crossProduct = trimTo3(cross(vec1, vec2));
real scale = vec1.w*vec2.w; real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale)); angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f) if (cosine < 0.0f)
angle = M_PI-angle; angle = M_PI-angle;
} }
else else
angle = acos(cosine); angle = ACOS(cosine);
return angle; return angle;
} }
/** /**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude. * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/ */
inline real4 computeCross(real4 vec1, real4 vec2) { inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
real4 cp = cross(vec1, vec2); real3 cp = trimTo3(cross(vec1, vec2));
return (real4) (cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z); return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
} }
/** /**
* Determine whether a particular interaction is in the list of exclusions. * Determine whether a particular interaction is in the list of exclusions.
*/ */
inline bool isInteractionExcluded(int atom1, int atom2, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) { inline DEVICE bool isInteractionExcluded(int atom1, int atom2, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
if (atom1 > atom2) { if (atom1 > atom2) {
int temp = atom1; int temp = atom1;
atom1 = atom2; atom1 = atom2;
...@@ -76,24 +73,24 @@ inline bool isInteractionExcluded(int atom1, int atom2, __global const int* rest ...@@ -76,24 +73,24 @@ inline bool isInteractionExcluded(int atom1, int atom2, __global const int* rest
/** /**
* Compute the interaction. * Compute the interaction.
*/ */
__kernel void computeInteraction( KERNEL void computeInteraction(
__global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, __global const int* restrict neighbors, __global const int* restrict neighborStartIndex , GLOBAL const int* RESTRICT neighbors, GLOBAL const int* RESTRICT neighborStartIndex
#endif #endif
#ifdef USE_FILTERS #ifdef USE_FILTERS
, __global int* restrict particleTypes, __global int* restrict orderIndex, __global int* restrict particleOrder , GLOBAL int* RESTRICT particleTypes, GLOBAL int* RESTRICT orderIndex, GLOBAL int* RESTRICT particleOrder
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
, __global int* restrict exclusions, __global int* restrict exclusionStartIndex , GLOBAL int* RESTRICT exclusions, GLOBAL int* RESTRICT exclusionStartIndex
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed energy = 0; mixed energy = 0;
// Loop over particles to be the first one in the set. // Loop over particles to be the first one in the set.
for (int p1 = get_group_id(0); p1 < NUM_ATOMS; p1 += get_num_groups(0)) { for (int p1 = GROUP_ID; p1 < NUM_ATOMS; p1 += NUM_GROUPS) {
#ifdef USE_CENTRAL_PARTICLE #ifdef USE_CENTRAL_PARTICLE
const int a1 = p1; const int a1 = p1;
#else #else
...@@ -110,7 +107,7 @@ __kernel void computeInteraction( ...@@ -110,7 +107,7 @@ __kernel void computeInteraction(
#endif #endif
#endif #endif
int numCombinations = NUM_CANDIDATE_COMBINATIONS; int numCombinations = NUM_CANDIDATE_COMBINATIONS;
for (int index = get_local_id(0); index < numCombinations; index += get_local_size(0)) { for (int index = LOCAL_ID; index < numCombinations; index += LOCAL_SIZE) {
FIND_ATOMS_FOR_COMBINATION_INDEX; FIND_ATOMS_FOR_COMBINATION_INDEX;
bool includeInteraction = IS_VALID_COMBINATION; bool includeInteraction = IS_VALID_COMBINATION;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -135,15 +132,15 @@ __kernel void computeInteraction( ...@@ -135,15 +132,15 @@ __kernel void computeInteraction(
} }
} }
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
*/ */
__kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, KERNEL void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict numNeighborPairs) { GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT numNeighborPairs) {
int index = get_global_id(0); int index = GLOBAL_ID;
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
while (base < NUM_ATOMS) { while (base < NUM_ATOMS) {
real4 pos = posq[base]; real4 pos = posq[base];
...@@ -159,37 +156,39 @@ __kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, r ...@@ -159,37 +156,39 @@ __kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, r
real4 center = 0.5f*(maxPos+minPos); real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center) APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif #endif
minPos = (real4) (min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0); minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = (real4) (max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0); maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
} }
real4 blockSize = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos); blockCenter[index] = 0.5f*(maxPos+minPos);
index += get_global_size(0); index += GLOBAL_SIZE;
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
if (get_group_id(0) == 0 && get_local_id(0) == 0) if (GROUP_ID == 0 && LOCAL_ID == 0)
*numNeighborPairs = 0; *numNeighborPairs = 0;
} }
/** /**
* Find a list of neighbors for each atom. * Find a list of neighbors for each atom.
*/ */
__kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global const real4* restrict blockCenter, __global const real4* restrict blockBoundingBox, __global int2* restrict neighborPairs, GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT blockCenter, GLOBAL const real4* RESTRICT blockBoundingBox, GLOBAL int2* RESTRICT neighborPairs,
__global int* restrict numNeighborPairs, __global int* restrict numNeighborsForAtom, int maxNeighborPairs GLOBAL int* RESTRICT numNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, int maxNeighborPairs
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex , GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex
#endif #endif
) { ) {
__local real4 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE]; LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
__local bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE]; int indexInWarp = LOCAL_ID%32;
int indexInWarp = get_local_id(0)%32; #ifndef __CUDA_ARCH__
int warpStart = get_local_id(0)-indexInWarp; LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
for (int atom1 = get_global_id(0); atom1 < PADDED_NUM_ATOMS; atom1 += get_global_size(0)) { int warpStart = LOCAL_ID-indexInWarp;
#endif
for (int atom1 = GLOBAL_ID; atom1 < PADDED_NUM_ATOMS; atom1 += GLOBAL_SIZE) {
// Load data for this atom. Note that all threads in a warp are processing atoms from the same block. // Load data for this atom. Note that all threads in a warp are processing atoms from the same block.
real4 pos1 = posq[atom1]; real3 pos1 = trimTo3(posq[atom1]);
int block1 = atom1/TILE_SIZE; int block1 = atom1/TILE_SIZE;
real4 blockCenter1 = blockCenter[block1]; real4 blockCenter1 = blockCenter[block1];
real4 blockSize1 = blockBoundingBox[block1]; real4 blockSize1 = blockBoundingBox[block1];
...@@ -221,10 +220,18 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea ...@@ -221,10 +220,18 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
// Loop over any blocks we identified as potentially containing neighbors. // Loop over any blocks we identified as potentially containing neighbors.
includeBlockFlags[get_local_id(0)] = includeBlock2; #ifdef __CUDA_ARCH__
int includeBlockFlags = BALLOT(includeBlock2);
while (includeBlockFlags != 0) {
int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1;
{
#else
includeBlockFlags[LOCAL_ID] = includeBlock2;
SYNC_WARPS; SYNC_WARPS;
for (int i = 0; i < TILE_SIZE; i++) { for (int i = 0; i < TILE_SIZE; i++) {
if (includeBlockFlags[warpStart+i]) { if (includeBlockFlags[warpStart+i]) {
#endif
int block2 = block2Base+i; int block2 = block2Base+i;
// Loop over atoms in this block. // Loop over atoms in this block.
...@@ -233,12 +240,12 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea ...@@ -233,12 +240,12 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
int included[TILE_SIZE]; int included[TILE_SIZE];
int numIncluded = 0; int numIncluded = 0;
SYNC_WARPS; SYNC_WARPS;
positionCache[get_local_id(0)] = posq[start+indexInWarp]; positionCache[LOCAL_ID] = trimTo3(posq[start+indexInWarp]);
SYNC_WARPS; SYNC_WARPS;
if (atom1 < NUM_ATOMS) { if (atom1 < NUM_ATOMS) {
for (int j = 0; j < 32; j++) { for (int j = 0; j < 32; j++) {
int atom2 = start+j; int atom2 = start+j;
real4 pos2 = positionCache[get_local_id(0)-indexInWarp+j]; real3 pos2 = positionCache[LOCAL_ID-indexInWarp+j];
// Decide whether to include this atom pair in the neighbor list. // Decide whether to include this atom pair in the neighbor list.
...@@ -260,10 +267,10 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea ...@@ -260,10 +267,10 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
// If we found any neighbors, store them to the neighbor list. // If we found any neighbors, store them to the neighbor list.
if (numIncluded > 0) { if (numIncluded > 0) {
int baseIndex = atom_add(numNeighborPairs, numIncluded); int baseIndex = ATOMIC_ADD(numNeighborPairs, numIncluded);
if (baseIndex+numIncluded <= maxNeighborPairs) if (baseIndex+numIncluded <= maxNeighborPairs)
for (int j = 0; j < numIncluded; j++) for (int j = 0; j < numIncluded; j++)
neighborPairs[baseIndex+j] = (int2) (atom1, included[j]); neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
totalNeighborsForAtom1 += numIncluded; totalNeighborsForAtom1 += numIncluded;
} }
} }
...@@ -279,59 +286,59 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea ...@@ -279,59 +286,59 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
* Sum the neighbor counts to compute the start position of each atom. This kernel * Sum the neighbor counts to compute the start position of each atom. This kernel
* is executed as a single work group. * is executed as a single work group.
*/ */
__kernel void computeNeighborStartIndices(__global int* restrict numNeighborsForAtom, __global int* restrict neighborStartIndex, KERNEL void computeNeighborStartIndices(GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL int* RESTRICT neighborStartIndex,
__global int* restrict numNeighborPairs, int maxNeighborPairs) { GLOBAL int* RESTRICT numNeighborPairs, int maxNeighborPairs) {
__local unsigned int posBuffer[256]; LOCAL unsigned int posBuffer[256];
if (*numNeighborPairs > maxNeighborPairs) { if (*numNeighborPairs > maxNeighborPairs) {
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it. Set the neighbor start // There wasn't enough memory for the neighbor list, so we'll need to rebuild it. Set the neighbor start
// indices to indicate no neighbors for any atom. // indices to indicate no neighbors for any atom.
for (int i = get_local_id(0); i <= NUM_ATOMS; i += get_local_size(0)) for (int i = LOCAL_ID; i <= NUM_ATOMS; i += LOCAL_SIZE)
neighborStartIndex[i] = 0; neighborStartIndex[i] = 0;
return; return;
} }
unsigned int globalOffset = 0; unsigned int globalOffset = 0;
for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += get_local_size(0)) { for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += LOCAL_SIZE) {
// Load the neighbor counts into local memory. // Load the neighbor counts into local memory.
unsigned int globalIndex = startAtom+get_local_id(0); unsigned int globalIndex = startAtom+LOCAL_ID;
posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0); posBuffer[LOCAL_ID] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
// Perform a parallel prefix sum. // Perform a parallel prefix sum.
for (unsigned int step = 1; step < get_local_size(0); step *= 2) { for (unsigned int step = 1; step < LOCAL_SIZE; step *= 2) {
unsigned int add = (get_local_id(0) >= step ? posBuffer[get_local_id(0)-step] : 0); unsigned int add = (LOCAL_ID >= step ? posBuffer[LOCAL_ID-step] : 0);
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
posBuffer[get_local_id(0)] += add; posBuffer[LOCAL_ID] += add;
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
} }
// Write the results back to global memory. // Write the results back to global memory.
if (globalIndex < NUM_ATOMS) { if (globalIndex < NUM_ATOMS) {
neighborStartIndex[globalIndex+1] = posBuffer[get_local_id(0)]+globalOffset; neighborStartIndex[globalIndex+1] = posBuffer[LOCAL_ID]+globalOffset;
numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
} }
globalOffset += posBuffer[get_local_size(0)-1]; globalOffset += posBuffer[LOCAL_SIZE-1];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
} }
if (get_local_id(0) == 0) if (LOCAL_ID == 0)
neighborStartIndex[0] = 0; neighborStartIndex[0] = 0;
} }
/** /**
* Assemble the final neighbor list. * Assemble the final neighbor list.
*/ */
__kernel void copyPairsToNeighborList(__global const int2* restrict neighborPairs, __global int* restrict neighbors, __global int* restrict numNeighborPairs, KERNEL void copyPairsToNeighborList(GLOBAL const int2* RESTRICT neighborPairs, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT numNeighborPairs,
int maxNeighborPairs, __global int* restrict numNeighborsForAtom, __global const int* restrict neighborStartIndex) { int maxNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL const int* RESTRICT neighborStartIndex) {
int actualPairs = *numNeighborPairs; int actualPairs = *numNeighborPairs;
if (actualPairs > maxNeighborPairs) if (actualPairs > maxNeighborPairs)
return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it. return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
for (unsigned int index = get_global_id(0); index < actualPairs; index += get_global_size(0)) { for (unsigned int index = GLOBAL_ID; index < actualPairs; index += GLOBAL_SIZE) {
int2 pair = neighborPairs[index]; int2 pair = neighborPairs[index];
int startIndex = neighborStartIndex[pair.x]; int startIndex = neighborStartIndex[pair.x];
int offset = atom_add(numNeighborsForAtom+pair.x, 1); int offset = ATOMIC_ADD(numNeighborsForAtom+pair.x, 1);
neighbors[startIndex+offset] = pair.y; neighbors[startIndex+offset] = pair.y;
} }
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
...@@ -16,60 +12,69 @@ typedef struct { ...@@ -16,60 +12,69 @@ typedef struct {
* Find the maximum of a value across all threads in a warp, and return that to * Find the maximum of a value across all threads in a warp, and return that to
* every thread. * every thread.
*/ */
int reduceMax(int val, __local int* temp) { DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
int indexInWarp = get_local_id(0)%32; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
temp[get_local_id(0)] = val; // CUDA lets us do this slightly more efficiently by using shuffle operations.
for (int mask = 16; mask > 0; mask /= 2)
val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
return val;
#else
int indexInWarp = LOCAL_ID%32;
temp[LOCAL_ID] = val;
SYNC_WARPS; SYNC_WARPS;
for (int offset = 16; offset > 0; offset /= 2) { for (int offset = 16; offset > 0; offset /= 2) {
if (offset < indexInWarp) if (offset < indexInWarp)
temp[get_local_id(0)] = max(temp[get_local_id(0)], temp[get_local_id(0)+offset]); temp[LOCAL_ID] = max(temp[LOCAL_ID], temp[LOCAL_ID+offset]);
SYNC_WARPS; SYNC_WARPS;
} }
return temp[get_local_id(0)-indexInWarp]; return temp[LOCAL_ID-indexInWarp];
#endif
} }
#ifndef SUPPORTS_64_BIT_ATOMICS
/** /**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within * This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure * a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom. * that only one thread updates the force on any given atom.
*/ */
void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) { void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
localData[get_local_id(0)].x = atomIndex; localData[LOCAL_ID].x = atomIndex;
SYNC_WARPS; SYNC_WARPS;
real4 forceSum = (real4) 0; real4 forceSum = make_real4(0);
int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE; int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
int end = start+32; int end = start+32;
bool isFirst = true; bool isFirst = true;
for (int i = start; i < end; i++) for (int i = start; i < end; i++)
if (localData[i].x == atomIndex) { if (localData[i].x == atomIndex) {
forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0); forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
isFirst &= (i >= get_local_id(0)); isFirst &= (i >= LOCAL_ID);
} }
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS; unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
if (isFirst) if (isFirst)
forceBuffers[offset] += forceSum; forceBuffers[offset] += forceSum;
SYNC_WARPS; SYNC_WARPS;
} }
#endif
__kernel void computeInteractionGroups( KERNEL void computeInteractionGroups(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers,
#else #else
__global real4* restrict forceBuffers, GLOBAL real4* RESTRICT forceBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
__global int* restrict numGroupTiles, int useNeighborList, GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex const unsigned int tbx = LOCAL_ID - tgx; // block warpIndex
mixed energy = 0; mixed energy = 0;
INIT_DERIVATIVES INIT_DERIVATIVES
__local AtomData localData[LOCAL_MEMORY_SIZE]; LOCAL AtomData localData[LOCAL_MEMORY_SIZE];
__local int reductionBuffer[LOCAL_MEMORY_SIZE]; LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];
const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps); const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps); const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
...@@ -82,16 +87,16 @@ __kernel void computeInteractionGroups( ...@@ -82,16 +87,16 @@ __kernel void computeInteractionGroups(
const int exclusions = atomData.w; const int exclusions = atomData.w;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
real4 force = (real4) (0); real3 force = make_real3(0);
real4 posq2 = posq[atom2]; real4 posq2 = posq[atom2];
localData[get_local_id(0)].x = posq2.x; localData[LOCAL_ID].x = posq2.x;
localData[get_local_id(0)].y = posq2.y; localData[LOCAL_ID].y = posq2.y;
localData[get_local_id(0)].z = posq2.z; localData[LOCAL_ID].z = posq2.z;
localData[get_local_id(0)].q = posq2.w; localData[LOCAL_ID].q = posq2.w;
LOAD_LOCAL_PARAMETERS LOAD_LOCAL_PARAMETERS
localData[get_local_id(0)].fx = 0.0f; localData[LOCAL_ID].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f; localData[LOCAL_ID].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[LOCAL_ID].fz = 0.0f;
int tj = tgx; int tj = tgx;
int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer); int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
SYNC_WARPS; SYNC_WARPS;
...@@ -99,8 +104,8 @@ __kernel void computeInteractionGroups( ...@@ -99,8 +104,8 @@ __kernel void computeInteractionGroups(
if (j < rangeEnd) { if (j < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0); bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj; int localIndex = tbx+tj;
posq2 = (real4) (localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q); posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -117,35 +122,38 @@ __kernel void computeInteractionGroups( ...@@ -117,35 +122,38 @@ __kernel void computeInteractionGroups(
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
delta *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[localIndex].fx += delta.x; localData[localIndex].fx += delta.x;
localData[localIndex].fy += delta.y; localData[localIndex].fy += delta.y;
localData[localIndex].fz += delta.z; localData[localIndex].fz += delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
} }
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS; SYNC_WARPS;
} }
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
if (exclusions != 0) { if (exclusions != 0) {
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
} }
atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
SYNC_WARPS;
#else #else
writeForces(forceBuffers, localData, atom2); writeForces(forceBuffers, localData, atom2);
localData[get_local_id(0)].fx = force.x; localData[LOCAL_ID].fx = force.x;
localData[get_local_id(0)].fy = force.y; localData[LOCAL_ID].fy = force.y;
localData[get_local_id(0)].fz = force.z; localData[LOCAL_ID].fz = force.z;
writeForces(forceBuffers, localData, atom1); writeForces(forceBuffers, localData, atom1);
#endif #endif
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
SAVE_DERIVATIVES SAVE_DERIVATIVES
} }
...@@ -153,7 +161,7 @@ __kernel void computeInteractionGroups( ...@@ -153,7 +161,7 @@ __kernel void computeInteractionGroups(
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is * If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread. * executed by a single thread.
*/ */
__kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles) { KERNEL void prepareToBuildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles) {
if (rebuildNeighborList[0] == 1) if (rebuildNeighborList[0] == 1)
numGroupTiles[0] = 0; numGroupTiles[0] = 0;
} }
...@@ -162,8 +170,8 @@ __kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborL ...@@ -162,8 +170,8 @@ __kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborL
* Filter the list of tiles to include only ones that have interactions within the * Filter the list of tiles to include only ones that have interactions within the
* padded cutoff. * padded cutoff.
*/ */
__kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles, KERNEL void buildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles,
__global const real4* restrict posq, __global const int4* restrict groupData, __global int4* restrict filteredGroupData, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData, GLOBAL int4* RESTRICT filteredGroupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// If the neighbor list doesn't need to be rebuilt on this step, return immediately. // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
...@@ -171,15 +179,15 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -171,15 +179,15 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if (rebuildNeighborList[0] == 0) if (rebuildNeighborList[0] == 0)
return; return;
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
const unsigned int local_warp = get_local_id(0)/TILE_SIZE; // local warpIndex const unsigned int local_warp = LOCAL_ID/TILE_SIZE; // local warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex const unsigned int tbx = LOCAL_ID - tgx; // block warpIndex
__local real4 localPos[LOCAL_MEMORY_SIZE]; LOCAL real4 localPos[LOCAL_MEMORY_SIZE];
__local volatile bool anyInteraction[WARPS_IN_BLOCK]; LOCAL volatile bool anyInteraction[WARPS_IN_BLOCK];
__local volatile int tileIndex[WARPS_IN_BLOCK]; LOCAL volatile int tileIndex[WARPS_IN_BLOCK];
__local int reductionBuffer[LOCAL_MEMORY_SIZE]; LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];
const unsigned int startTile = warp*NUM_TILES/totalWarps; const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps; const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
...@@ -191,7 +199,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -191,7 +199,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
const int rangeEnd = (atomData.z>>16)&0xFFFF; const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w; const int exclusions = atomData.w;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
localPos[get_local_id(0)] = posq[atom2]; localPos[LOCAL_ID] = posq[atom2];
if (tgx == 0) if (tgx == 0)
anyInteraction[local_warp] = false; anyInteraction[local_warp] = false;
int tj = tgx; int tj = tgx;
...@@ -199,10 +207,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -199,10 +207,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
SYNC_WARPS; SYNC_WARPS;
for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) { for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
SYNC_WARPS; SYNC_WARPS;
if (j < rangeEnd) { if (j < rangeEnd && tj < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0); bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj; int localIndex = tbx+tj;
real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0); real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -216,7 +224,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -216,7 +224,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if (anyInteraction[local_warp]) { if (anyInteraction[local_warp]) {
SYNC_WARPS; SYNC_WARPS;
if (tgx == 0) if (tgx == 0)
tileIndex[local_warp] = atomic_add(numGroupTiles, 1); tileIndex[local_warp] = ATOMIC_ADD(numGroupTiles, 1);
SYNC_WARPS; SYNC_WARPS;
filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData; filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
} }
......
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
/** /**
* Calculate the ellipsoid coordinate frames and associated matrices. * Calculate the ellipsoid coordinate frames and associated matrices.
*/ */
extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices, KERNEL void computeEllipsoidFrames(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
const float4* __restrict__ sigParams, const float4* __restrict__ scale, real* __restrict__ aMatrix, GLOBAL const float4* RESTRICT sigParams, GLOBAL const float4* RESTRICT scale, GLOBAL real* RESTRICT aMatrix,
real* __restrict__ bMatrix, real* __restrict__ gMatrix, const int* sortedParticles) { GLOBAL real* RESTRICT bMatrix, GLOBAL real* RESTRICT gMatrix, GLOBAL const int* sortedParticles) {
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) { for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
// Compute the local coordinate system of the ellipsoid; // Compute the local coordinate system of the ellipsoid;
int originalIndex = sortedParticles[sortedIndex]; int originalIndex = sortedParticles[sortedIndex];
...@@ -36,9 +36,9 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* ...@@ -36,9 +36,9 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
// Compute matrices we will need later. // Compute matrices we will need later.
real (*a)[3] = (real (*)[3]) (aMatrix+sortedIndex*9); GLOBAL real (*a)[3] = (GLOBAL real (*)[3]) (aMatrix+sortedIndex*9);
real (*b)[3] = (real (*)[3]) (bMatrix+sortedIndex*9); GLOBAL real (*b)[3] = (GLOBAL real (*)[3]) (bMatrix+sortedIndex*9);
real (*g)[3] = (real (*)[3]) (gMatrix+sortedIndex*9); GLOBAL real (*g)[3] = (GLOBAL real (*)[3]) (gMatrix+sortedIndex*9);
a[0][0] = xdir.x; a[0][0] = xdir.x;
a[0][1] = xdir.y; a[0][1] = xdir.y;
a[0][2] = xdir.z; a[0][2] = xdir.z;
...@@ -62,10 +62,10 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* ...@@ -62,10 +62,10 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
*/ */
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, KERNEL void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const int* sortedAtoms, const real4* __restrict__ posq, real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, GLOBAL const int* sortedAtoms, GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter,
real4* __restrict__ blockBoundingBox, int* __restrict__ neighborBlockCount) { GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighborBlockCount) {
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = GLOBAL_ID;
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
while (base < numAtoms) { while (base < numAtoms) {
real4 pos = posq[sortedAtoms[base]]; real4 pos = posq[sortedAtoms[base]];
...@@ -89,19 +89,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -89,19 +89,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
real4 blockSize = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos); blockCenter[index] = 0.5f*(maxPos+minPos);
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
if (blockIdx.x*blockDim.x+threadIdx.x == 0) if (GLOBAL_ID == 0)
*neighborBlockCount = 0; *neighborBlockCount = 0;
} }
/** /**
* This is called by findNeighbors() to write a block to the neighbor list. * This is called by findNeighbors() to write a block to the neighbor list.
*/ */
__device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors, DEVICE void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) { GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount) {
int blockIndex = atomicAdd(neighborBlockCount, 1); int blockIndex = ATOMIC_ADD(neighborBlockCount, 1);
if (blockIndex >= maxNeighborBlocks) if (blockIndex >= maxNeighborBlocks)
return; // We don't have enough room for the neighbor list. return; // We don't have enough room for the neighbor list.
neighborIndex[blockIndex] = atom1; neighborIndex[blockIndex] = atom1;
...@@ -115,12 +115,12 @@ __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuf ...@@ -115,12 +115,12 @@ __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuf
/** /**
* Build a list of neighbors for each atom. * Build a list of neighbors for each atom.
*/ */
extern "C" __global__ void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, KERNEL void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ neighbors, GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) { GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE; const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
int neighborBuffer[NEIGHBOR_BLOCK_SIZE]; int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) { for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
int nextExclusion = exclusionStartIndex[atom1]; int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1]; int lastExclusion = exclusionStartIndex[atom1+1];
real4 pos = sortedPos[atom1]; real4 pos = sortedPos[atom1];
...@@ -178,8 +178,8 @@ typedef struct { ...@@ -178,8 +178,8 @@ typedef struct {
real a[3][3], b[3][3], g[3][3]; real a[3][3], b[3][3], g[3][3];
} AtomData; } AtomData;
__device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams, DEVICE void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, GLOBAL const real4* RESTRICT pos, GLOBAL const float4* RESTRICT sigParams,
const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) { GLOBAL const float2* RESTRICT epsParams, GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix) {
data->sig = sigParams[originalIndex]; data->sig = sigParams[originalIndex];
data->eps = epsParams[originalIndex]; data->eps = epsParams[originalIndex];
data->pos = trimTo3(pos[sortedIndex]); data->pos = trimTo3(pos[sortedIndex]);
...@@ -192,19 +192,19 @@ __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, ...@@ -192,19 +192,19 @@ __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex,
} }
} }
inline __device__ real3 matrixVectorProduct(real (*m)[3], real3 v) { inline DEVICE real3 matrixVectorProduct(real (*m)[3], real3 v) {
return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z, return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z, m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z); m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
} }
inline __device__ real3 vectorMatrixProduct(real3 v, real (*m)[3]) { inline DEVICE real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z, return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z, m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z); m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
} }
inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) { inline DEVICE void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[0][0] = a[0][0]+b[0][0]; result[0][0] = a[0][0]+b[0][0];
result[0][1] = a[0][1]+b[0][1]; result[0][1] = a[0][1]+b[0][1];
result[0][2] = a[0][2]+b[0][2]; result[0][2] = a[0][2]+b[0][2];
...@@ -216,12 +216,12 @@ inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) ...@@ -216,12 +216,12 @@ inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3])
result[2][2] = a[2][2]+b[2][2]; result[2][2] = a[2][2]+b[2][2];
} }
inline __device__ real determinant(real (*m)[3]) { inline DEVICE real determinant(real (*m)[3]) {
return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] - return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]); m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
} }
inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) { inline DEVICE void matrixInverse(real (*result)[3], real (*m)[3]) {
real invDet = RECIP(determinant(m)); real invDet = RECIP(determinant(m));
result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]); result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]); result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
...@@ -234,7 +234,7 @@ inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) { ...@@ -234,7 +234,7 @@ inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]); result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
} }
__device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) { DEVICE void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
real rInv = RSQRT(r2); real rInv = RSQRT(r2);
real r = r2*rInv; real r = r2*rInv;
real3 drUnit = dr*rInv; real3 drUnit = dr*rInv;
...@@ -335,25 +335,25 @@ __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sig ...@@ -335,25 +335,25 @@ __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sig
/** /**
* Compute the interactions. * Compute the interactions.
*/ */
extern "C" __global__ void computeForce( KERNEL void computeForce(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT torqueBuffers,
int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos, int numAtoms, int numExceptions, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT pos,
const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms, GLOBAL const float4* RESTRICT sigParams, GLOBAL const float2* RESTRICT epsParams, GLOBAL const int* RESTRICT sortedAtoms,
const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix, GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix,
const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex,
const int4* __restrict__ exceptionParticles, const float2* __restrict__ exceptionParams GLOBAL const int4* RESTRICT exceptionParticles, GLOBAL const float2* RESTRICT exceptionParams
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, int maxNeighborBlocks, int* __restrict__ neighbors, int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, , int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif #endif
) { ) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
mixed energy = 0; mixed energy = 0;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int numBlocks = *neighborBlockCount; const int numBlocks = *neighborBlockCount;
if (numBlocks > maxNeighborBlocks) if (numBlocks > maxNeighborBlocks)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
for (int block = blockIdx.x*blockDim.x+threadIdx.x; block < numBlocks; block += blockDim.x*gridDim.x) { for (int block = GLOBAL_ID; block < numBlocks; block += GLOBAL_SIZE) {
// Load parameters for atom1. // Load parameters for atom1.
int atom1 = neighborIndex[block]; int atom1 = neighborIndex[block];
...@@ -384,22 +384,22 @@ extern "C" __global__ void computeForce( ...@@ -384,22 +384,22 @@ extern "C" __global__ void computeForce(
real sigma = data1.sig.x+data2.sig.x; real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x; real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
} }
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
} }
#else #else
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) { for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
// Load parameters for atom1. // Load parameters for atom1.
int index1 = sortedAtoms[atom1]; int index1 = sortedAtoms[atom1];
...@@ -432,25 +432,25 @@ extern "C" __global__ void computeForce( ...@@ -432,25 +432,25 @@ extern "C" __global__ void computeForce(
real sigma = data1.sig.x+data2.sig.x; real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x; real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
} }
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
} }
#endif #endif
// Now compute exceptions. // Now compute exceptions.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numExceptions; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < numExceptions; index += GLOBAL_SIZE) {
int4 atomIndices = exceptionParticles[index]; int4 atomIndices = exceptionParticles[index];
float2 params = exceptionParams[index]; float2 params = exceptionParams[index];
int index1 = atomIndices.x, index2 = atomIndices.y; int index1 = atomIndices.x, index2 = atomIndices.y;
...@@ -466,34 +466,34 @@ extern "C" __global__ void computeForce( ...@@ -466,34 +466,34 @@ extern "C" __global__ void computeForce(
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
/** /**
* Convert the torques to forces on the connected particles. * Convert the torques to forces on the connected particles.
*/ */
extern "C" __global__ void applyTorques( KERNEL void applyTorques(
unsigned long long* __restrict__ forceBuffers, long long* __restrict__ torqueBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const mm_long* RESTRICT torqueBuffers,
int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices, int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
const int* sortedParticles) { GLOBAL const int* sortedParticles) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) { for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
int originalIndex = sortedParticles[sortedIndex]; int originalIndex = sortedParticles[sortedIndex];
real3 pos = trimTo3(posq[originalIndex]); real3 pos = trimTo3(posq[originalIndex]);
int2 axisParticles = axisParticleIndices[originalIndex]; int2 axisParticles = axisParticleIndices[originalIndex];
...@@ -522,16 +522,16 @@ extern "C" __global__ void applyTorques( ...@@ -522,16 +522,16 @@ extern "C" __global__ void applyTorques(
yforce += f; yforce += f;
force -= f; force -= f;
} }
atomicAdd(&forceBuffers[originalIndex], static_cast<unsigned long long>((long long) (force.x*0x100000000))); ATOMIC_ADD(&forceBuffers[originalIndex], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); ATOMIC_ADD(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); ATOMIC_ADD(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x], static_cast<unsigned long long>((long long) (xforce.x*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.x], (mm_ulong) ((mm_long) (xforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.y*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.z*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.z*0x100000000)));
if (axisParticles.y != -1) { if (axisParticles.y != -1) {
atomicAdd(&forceBuffers[axisParticles.y], static_cast<unsigned long long>((long long) (yforce.x*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.y], (mm_ulong) ((mm_long) (yforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.y*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.z*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.z*0x100000000)));
} }
} }
} }
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { typedef struct {
...@@ -13,26 +10,26 @@ typedef struct { ...@@ -13,26 +10,26 @@ typedef struct {
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum( KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum, GLOBAL mm_ulong* RESTRICT global_bornSum,
#else #else
__global real* restrict global_bornSum, GLOBAL real* RESTRICT global_bornSum,
#endif #endif
__global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = LOCAL_ID - tgx;
__local AtomData1 localData[FORCE_WORK_GROUP_SIZE]; LOCAL AtomData1 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -42,7 +39,7 @@ __kernel void computeBornSum( ...@@ -42,7 +39,7 @@ __kernel void computeBornSum(
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real bornSum = 0.0f; real bornSum = 0;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
...@@ -50,15 +47,15 @@ __kernel void computeBornSum( ...@@ -50,15 +47,15 @@ __kernel void computeBornSum(
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x; localData[LOCAL_ID].x = posq1.x;
localData[get_local_id(0)].y = posq1.y; localData[LOCAL_ID].y = posq1.y;
localData[get_local_id(0)].z = posq1.z; localData[LOCAL_ID].z = posq1.z;
localData[get_local_id(0)].q = charge1; localData[LOCAL_ID].q = charge1;
localData[get_local_id(0)].radius = params1.x; localData[LOCAL_ID].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y; localData[LOCAL_ID].scaledRadius = params1.y;
SYNC_WARPS; SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -70,7 +67,7 @@ __kernel void computeBornSum( ...@@ -70,7 +67,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius); float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -91,21 +88,21 @@ __kernel void computeBornSum( ...@@ -91,21 +88,21 @@ __kernel void computeBornSum(
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
float2 tempParams = global_params[j]; float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x; localData[LOCAL_ID].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y; localData[LOCAL_ID].scaledRadius = tempParams.y;
localData[get_local_id(0)].bornSum = 0.0f; localData[LOCAL_ID].bornSum = 0.0f;
SYNC_WARPS; SYNC_WARPS;
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -117,7 +114,7 @@ __kernel void computeBornSum( ...@@ -117,7 +114,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -151,17 +148,17 @@ __kernel void computeBornSum( ...@@ -151,17 +148,17 @@ __kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; offset = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
} }
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum; global_bornSum[offset1] += bornSum;
if (x != y) if (x != y)
global_bornSum[offset2] += localData[get_local_id(0)].bornSum; global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif #endif
} }
...@@ -172,17 +169,17 @@ __kernel void computeBornSum( ...@@ -172,17 +169,17 @@ __kernel void computeBornSum(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else #else
int pos = (int) (warp*(long)numTiles/totalWarps); int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps); int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE]; LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE]; LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1; skipTiles[LOCAL_ID] = -1;
while (pos < end) { while (pos < end) {
real bornSum = 0; real bornSum = 0;
...@@ -213,10 +210,10 @@ __kernel void computeBornSum( ...@@ -213,10 +210,10 @@ __kernel void computeBornSum(
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[get_local_id(0)] = end; skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = tbx;
SYNC_WARPS; SYNC_WARPS;
...@@ -238,17 +235,17 @@ __kernel void computeBornSum( ...@@ -238,17 +235,17 @@ __kernel void computeBornSum(
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[get_local_id(0)] = j; atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
float2 tempParams = global_params[j]; float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x; localData[LOCAL_ID].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y; localData[LOCAL_ID].scaledRadius = tempParams.y;
localData[get_local_id(0)].bornSum = 0.0f; localData[LOCAL_ID].bornSum = 0.0f;
} }
SYNC_WARPS; SYNC_WARPS;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -258,17 +255,17 @@ __kernel void computeBornSum( ...@@ -258,17 +255,17 @@ __kernel void computeBornSum(
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -304,7 +301,7 @@ __kernel void computeBornSum( ...@@ -304,7 +301,7 @@ __kernel void computeBornSum(
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -317,7 +314,7 @@ __kernel void computeBornSum( ...@@ -317,7 +314,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -350,20 +347,20 @@ __kernel void computeBornSum( ...@@ -350,20 +347,20 @@ __kernel void computeBornSum(
// Write results. // Write results.
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)]; unsigned int atom2 = atomIndices[LOCAL_ID];
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
if (atom2 < PADDED_NUM_ATOMS) if (atom2 < PADDED_NUM_ATOMS)
atom_add(&global_bornSum[atom2], (long) (localData[get_local_id(0)].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
#else #else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS; unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum; global_bornSum[offset1] += bornSum;
if (atom2 < PADDED_NUM_ATOMS) if (atom2 < PADDED_NUM_ATOMS)
global_bornSum[offset2] += localData[get_local_id(0)].bornSum; global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif #endif
} }
pos++; pos++;
...@@ -381,28 +378,28 @@ typedef struct { ...@@ -381,28 +378,28 @@ typedef struct {
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel void computeGBSAForce1( KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
#else #else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce, GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
__global const real* restrict global_bornRadii, int needEnergy, GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = LOCAL_ID - tgx;
mixed energy = 0; mixed energy = 0;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE]; LOCAL AtomData2 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -412,7 +409,7 @@ __kernel void computeGBSAForce1( ...@@ -412,7 +409,7 @@ __kernel void computeGBSAForce1(
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real4 force = 0.0f; real4 force = make_real4(0);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
...@@ -420,18 +417,17 @@ __kernel void computeGBSAForce1( ...@@ -420,18 +417,17 @@ __kernel void computeGBSAForce1(
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0); localData[LOCAL_ID].x = posq1.x;
localData[localAtomIndex].x = posq1.x; localData[LOCAL_ID].y = posq1.y;
localData[localAtomIndex].y = posq1.y; localData[LOCAL_ID].z = posq1.z;
localData[localAtomIndex].z = posq1.z; localData[LOCAL_ID].q = charge1;
localData[localAtomIndex].q = charge1; localData[LOCAL_ID].bornRadius = bornRadius1;
localData[get_local_id(0)].bornRadius = bornRadius1;
SYNC_WARPS; SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z); real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
real charge2 = localData[tbx+j].q; real charge2 = localData[tbx+j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -459,8 +455,10 @@ __kernel void computeGBSAForce1( ...@@ -459,8 +455,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -473,22 +471,22 @@ __kernel void computeGBSAForce1( ...@@ -473,22 +471,22 @@ __kernel void computeGBSAForce1(
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
localData[get_local_id(0)].bornRadius = global_bornRadii[j]; localData[LOCAL_ID].bornRadius = global_bornRadii[j];
localData[get_local_id(0)].fx = 0.0f; localData[LOCAL_ID].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f; localData[LOCAL_ID].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[LOCAL_ID].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localData[LOCAL_ID].fw = 0.0f;
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z); real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q; real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -515,8 +513,10 @@ __kernel void computeGBSAForce1( ...@@ -515,8 +513,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
...@@ -534,25 +534,25 @@ __kernel void computeGBSAForce1( ...@@ -534,25 +534,25 @@ __kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
atom_add(&global_bornForce[offset], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (force.w*0x100000000)));
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
} }
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w; global_bornForce[offset1] += force.w;
if (x != y) { if (x != y) {
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f); forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw; global_bornForce[offset2] += localData[LOCAL_ID].fw;
} }
#endif #endif
} }
...@@ -564,20 +564,20 @@ __kernel void computeGBSAForce1( ...@@ -564,20 +564,20 @@ __kernel void computeGBSAForce1(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else #else
int pos = (int) (warp*(long)numTiles/totalWarps); int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps); int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE]; LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE]; LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1; skipTiles[LOCAL_ID] = -1;
while (pos < end) { while (pos < end) {
real4 force = 0; real4 force = make_real4(0);
bool includeTile = true; bool includeTile = true;
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
...@@ -605,10 +605,10 @@ __kernel void computeGBSAForce1( ...@@ -605,10 +605,10 @@ __kernel void computeGBSAForce1(
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[get_local_id(0)] = end; skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = tbx;
SYNC_WARPS; SYNC_WARPS;
...@@ -630,18 +630,18 @@ __kernel void computeGBSAForce1( ...@@ -630,18 +630,18 @@ __kernel void computeGBSAForce1(
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[get_local_id(0)] = j; atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
localData[get_local_id(0)].bornRadius = global_bornRadii[j]; localData[LOCAL_ID].bornRadius = global_bornRadii[j];
localData[get_local_id(0)].fx = 0.0f; localData[LOCAL_ID].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f; localData[LOCAL_ID].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[LOCAL_ID].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localData[LOCAL_ID].fw = 0.0f;
} }
SYNC_WARPS; SYNC_WARPS;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -651,15 +651,15 @@ __kernel void computeGBSAForce1( ...@@ -651,15 +651,15 @@ __kernel void computeGBSAForce1(
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z); real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q; real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -681,8 +681,10 @@ __kernel void computeGBSAForce1( ...@@ -681,8 +681,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
...@@ -702,9 +704,9 @@ __kernel void computeGBSAForce1( ...@@ -702,9 +704,9 @@ __kernel void computeGBSAForce1(
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z); real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q; real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -731,8 +733,10 @@ __kernel void computeGBSAForce1( ...@@ -731,8 +733,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
...@@ -745,37 +749,37 @@ __kernel void computeGBSAForce1( ...@@ -745,37 +749,37 @@ __kernel void computeGBSAForce1(
SYNC_WARPS; SYNC_WARPS;
} }
} }
// Write results. // Write results.
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)]; unsigned int atom2 = atomIndices[LOCAL_ID];
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_ulong) ((mm_long) (force.w*0x100000000)));
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
atom_add(&global_bornForce[atom2], (long) (localData[get_local_id(0)].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
} }
#else #else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS; unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w; global_bornForce[offset1] += force.w;
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f); forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw; global_bornForce[offset2] += localData[LOCAL_ID].fw;
} }
#endif #endif
} }
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
real invRSquaredOver4 = 0.25f*invR*invR; real invRSquaredOver4 = 0.25f*invR*invR;
real rScaledRadiusJ = r+OBC_PARAMS2.y; real rScaledRadiusJ = r+OBC_PARAMS2.y;
real rScaledRadiusI = r+OBC_PARAMS1.y; real rScaledRadiusI = r+OBC_PARAMS1.y;
real l_ijJ = RECIP(max(OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y))); real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
real l_ijI = RECIP(max(OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y))); real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
real u_ijJ = RECIP(rScaledRadiusJ); real u_ijJ = RECIP(rScaledRadiusJ);
real u_ijI = RECIP(rScaledRadiusI); real u_ijI = RECIP(rScaledRadiusI);
real l_ij2J = l_ijJ*l_ijJ; real l_ij2J = l_ijJ*l_ijJ;
...@@ -16,12 +16,17 @@ ...@@ -16,12 +16,17 @@
real t2I = (l_ij2I-u_ij2I); real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR; real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR; real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
#ifdef SUPPORTS_64_BIT_ATOMICS
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0); real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0); tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
#else
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED); unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2); unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif #endif
dEdR += (includeInteraction ? tempdEdR : 0); dEdR += (includeInteraction ? tempdEdR : (real) 0);
} }
...@@ -5,22 +5,21 @@ ...@@ -5,22 +5,21 @@
* Reduce the Born sums to compute the Born radii. * Reduce the Born sums to compute the Born radii.
*/ */
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma, KERNEL void reduceBornSum(float alpha, float beta, float gamma,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornSum, GLOBAL const mm_long* RESTRICT bornSum,
#else #else
__global const real* restrict bornSum, GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
#endif #endif
__global const float2* restrict params, __global real* restrict bornRadii, __global real* restrict obcChain) { GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
unsigned int index = get_global_id(0); for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
while (index < NUM_ATOMS) {
// Get summed Born data // Get summed Born data
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
real sum = (1/(real) 0x100000000)*bornSum[index]; real sum = RECIP((real) 0x100000000)*bornSum[index];
#else #else
real sum = bornSum[index]; real sum = bornSum[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i]; sum += bornSum[i];
#endif #endif
...@@ -33,12 +32,11 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -33,12 +32,11 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
real sum3 = sum*sum2; real sum3 = sum*sum2;
real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3); real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET; real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
real radius = 1/(1/offsetRadius - tanhSum/nonOffsetRadius); real radius = RECIP(RECIP(offsetRadius) - tanhSum/nonOffsetRadius);
real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2); real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2);
chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius; chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius; bornRadii[index] = radius;
obcChain[index] = chain; obcChain[index] = chain;
index += get_global_size(0);
} }
} }
...@@ -46,21 +44,22 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -46,21 +44,22 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
* Reduce the Born force. * Reduce the Born force.
*/ */
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bornForce, KERNEL void reduceBornForce(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornForceIn, GLOBAL mm_long* RESTRICT bornForce,
#else
GLOBAL real* bornForce, int bufferSize, int numBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) { GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
mixed energy = 0; mixed energy = 0;
unsigned int index = get_global_id(0); for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
while (index < NUM_ATOMS) { // Get summed Born force
// Sum the Born force
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
real force = (1/(real) 0x100000000)*bornForceIn[index]; real force = RECIP((real) 0x100000000)*bornForce[index];
#else #else
real force = bornForce[index]; real force = bornForce[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i]; force += bornForce[i];
#endif #endif
...@@ -69,13 +68,16 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor ...@@ -69,13 +68,16 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor
float offsetRadius = params[index].x; float offsetRadius = params[index].x;
real bornRadius = bornRadii[index]; real bornRadius = bornRadii[index];
real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS; real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
real ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6); real ratio6 = POW((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6; real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
force += saTerm/bornRadius; force += saTerm/bornRadius;
energy += saTerm; energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index]; force *= bornRadius*bornRadius*obcChain[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
bornForce[index] = (mm_long) (force*0x100000000);
#else
bornForce[index] = force; bornForce[index] = force;
index += get_global_size(0); #endif
} }
energyBuffer[get_global_id(0)] += energy/-6.0f; energyBuffer[GLOBAL_ID] += energy/-6;
} }
\ No newline at end of file
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
...@@ -12,27 +8,27 @@ typedef struct { ...@@ -12,27 +8,27 @@ typedef struct {
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum( KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum, GLOBAL mm_long* RESTRICT global_bornSum,
#else #else
__global real* restrict global_bornSum, GLOBAL real* RESTRICT global_bornSum,
#endif #endif
__global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* exclusionTiles) {
__local AtomData1 localData[TILE_SIZE]; LOCAL AtomData1 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
...@@ -56,17 +52,17 @@ __kernel void computeBornSum( ...@@ -56,17 +52,17 @@ __kernel void computeBornSum(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0.0f; real bornSum = 0;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(trimTo3(delta), trimTo3(delta));
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
...@@ -74,7 +70,7 @@ __kernel void computeBornSum( ...@@ -74,7 +70,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -92,9 +88,9 @@ __kernel void computeBornSum( ...@@ -92,9 +88,9 @@ __kernel void computeBornSum(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -110,9 +106,9 @@ __kernel void computeBornSum( ...@@ -110,9 +106,9 @@ __kernel void computeBornSum(
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -124,7 +120,7 @@ __kernel void computeBornSum( ...@@ -124,7 +120,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -154,9 +150,9 @@ __kernel void computeBornSum( ...@@ -154,9 +150,9 @@ __kernel void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -166,9 +162,9 @@ __kernel void computeBornSum( ...@@ -166,9 +162,9 @@ __kernel void computeBornSum(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (localData[tgx].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[offset], (mm_long) (localData[tgx].bornSum*0x100000000));
#else #else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum; global_bornSum[offset] += localData[tgx].bornSum;
#endif #endif
} }
...@@ -182,15 +178,15 @@ __kernel void computeBornSum( ...@@ -182,15 +178,15 @@ __kernel void computeBornSum(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else #else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0)); int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif #endif
int nextToSkip = -1; int nextToSkip = -1;
int currentSkipIndex = 0; int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE]; LOCAL int atomIndices[TILE_SIZE];
while (pos < end) { while (pos < end) {
bool includeTile = true; bool includeTile = true;
...@@ -263,15 +259,15 @@ __kernel void computeBornSum( ...@@ -263,15 +259,15 @@ __kernel void computeBornSum(
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j]; int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -301,9 +297,9 @@ __kernel void computeBornSum( ...@@ -301,9 +297,9 @@ __kernel void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -319,9 +315,9 @@ __kernel void computeBornSum( ...@@ -319,9 +315,9 @@ __kernel void computeBornSum(
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -334,7 +330,7 @@ __kernel void computeBornSum( ...@@ -334,7 +330,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -364,9 +360,9 @@ __kernel void computeBornSum( ...@@ -364,9 +360,9 @@ __kernel void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -382,9 +378,9 @@ __kernel void computeBornSum( ...@@ -382,9 +378,9 @@ __kernel void computeBornSum(
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom2], (mm_long) (localData[tgx].bornSum*0x100000000));
#else #else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum; global_bornSum[offset] += localData[tgx].bornSum;
#endif #endif
} }
...@@ -405,29 +401,29 @@ typedef struct { ...@@ -405,29 +401,29 @@ typedef struct {
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel void computeGBSAForce1( KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce, GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
#else #else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce, GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
__global const real* restrict global_bornRadii, int needEnergy, GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* exclusionTiles) {
mixed energy = 0; mixed energy = 0;
__local AtomData2 localData[TILE_SIZE]; LOCAL AtomData2 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
...@@ -449,14 +445,14 @@ __kernel void computeGBSAForce1( ...@@ -449,14 +445,14 @@ __kernel void computeGBSAForce1(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -485,21 +481,23 @@ __kernel void computeGBSAForce1( ...@@ -485,21 +481,23 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
} }
} }
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -515,14 +513,14 @@ __kernel void computeGBSAForce1( ...@@ -515,14 +513,14 @@ __kernel void computeGBSAForce1(
} }
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -550,8 +548,10 @@ __kernel void computeGBSAForce1( ...@@ -550,8 +548,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
...@@ -562,13 +562,13 @@ __kernel void computeGBSAForce1( ...@@ -562,13 +562,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -578,12 +578,12 @@ __kernel void computeGBSAForce1( ...@@ -578,12 +578,12 @@ __kernel void computeGBSAForce1(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[offset], (mm_long) (localData[tgx].fw*0x100000000));
#else #else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
f.x += localData[tgx].fx; f.x += localData[tgx].fx;
f.y += localData[tgx].fy; f.y += localData[tgx].fy;
...@@ -602,15 +602,15 @@ __kernel void computeGBSAForce1( ...@@ -602,15 +602,15 @@ __kernel void computeGBSAForce1(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else #else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0)); int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif #endif
int nextToSkip = -1; int nextToSkip = -1;
int currentSkipIndex = 0; int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE]; LOCAL int atomIndices[TILE_SIZE];
while (pos < end) { while (pos < end) {
bool includeTile = true; bool includeTile = true;
...@@ -679,15 +679,15 @@ __kernel void computeGBSAForce1( ...@@ -679,15 +679,15 @@ __kernel void computeGBSAForce1(
} }
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float bornRadius1 = global_bornRadii[atom1]; float bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j]; int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
...@@ -709,8 +709,10 @@ __kernel void computeGBSAForce1( ...@@ -709,8 +709,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
...@@ -721,13 +723,13 @@ __kernel void computeGBSAForce1( ...@@ -721,13 +723,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -739,14 +741,14 @@ __kernel void computeGBSAForce1( ...@@ -739,14 +741,14 @@ __kernel void computeGBSAForce1(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
float bornRadius1 = global_bornRadii[atom1]; float bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -775,8 +777,10 @@ __kernel void computeGBSAForce1( ...@@ -775,8 +777,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
...@@ -787,13 +791,13 @@ __kernel void computeGBSAForce1( ...@@ -787,13 +791,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -809,12 +813,12 @@ __kernel void computeGBSAForce1( ...@@ -809,12 +813,12 @@ __kernel void computeGBSAForce1(
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], (long) (localData[tgx].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[atom2], (long) (localData[tgx].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[atom2], (mm_long) (localData[tgx].fw*0x100000000));
#else #else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
f.x += localData[tgx].fx; f.x += localData[tgx].fx;
f.y += localData[tgx].fy; f.y += localData[tgx].fy;
...@@ -827,5 +831,5 @@ __kernel void computeGBSAForce1( ...@@ -827,5 +831,5 @@ __kernel void computeGBSAForce1(
} }
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment