"_gfortran_utils.sh" did not exist on "d72e231df44227be2d9f10602627593c3c1ea0df"
Unverified Commit edbc8407 authored by peastman's avatar peastman Committed by GitHub
Browse files

Common compute framework to unify CUDA and OpenCL code (#2488)

* Began creating common compute framework to unify code between CUDA and OpenCL

* Began OpenCL implementation of common compute framework

* Common implementation of CMMotionRemover

* CUDA implementation of common compute interface

* Converted HarmonicBondForce to common compute API

* Converted standard bonded forces to common compute API

* Converted ExpressionUtilities to common compute API

* Created ComputeParameterSet

* Converted custom bonded forces to common compute API

* Converted CustomCentroidBondForce to common compute API

* Converted CustomManyParticleForce to common compute API

* Moved lots of duplicate code from CudaContext and OpenCLContext to ComputeContext

* Converted GayBerneForce to common compute API

* Removed obsolete kernels

* Converted verlet integrators to common compute API

* Converted Langevin and Brownian integrators to common compute API

* Converted CustomIntegrator to common compute API

* Converted CustomNonbondedForce to common compute API

* Removed uses of a deprecated API

* Fixed failing test cases

* Converted GBSAOBCForce to common compute API

* Began converting CustomGBForce to common compute API

* Finished converting CustomGBForce to common compute API

* Merged duplicated code in CudaIntegrationUtilities and OpenCLIntegrationUtilities

* Converted RMSDForce and AndersenThermostat to common compute API

* Converted CustomHbondForce to common compute API

* Merged scripts for encoding kernel sources

* Converted Drude plugin to common compute API

* Fixed errors in CMake scripts

* Attempt at fixing errors on Windows

* Added discussion of common compute API to developer guide

* Added Windows export macro for common classes

* Fixed error in CMMotionRemover

* Updated travis to newer Ubuntu version

* Fixed errors on CPU OpenCL

* Fixed Windows linking errors

* Added missing pragma for 32 bit atomics

* Replaced long long with mm_long

* More fixes to Windows linking

* Bug fix
parent 38beeefe
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[LOCAL_ID]*0x100000000)));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
#endif
/**
* Compute a force based on pair interactions.
*/
__kernel void computeN2Energy(
KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
__global real4* restrict forceBuffers,
GLOBAL real4* RESTRICT forceBuffers,
#endif
__global mixed* restrict energyBuffer, __local real4* restrict local_force,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const ushort2* exclusionTiles, int needEnergy,
GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = LOCAL_ID - tgx;
mixed energy = 0;
INIT_PARAM_DERIVS
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = 0;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
......@@ -53,14 +55,14 @@ __kernel void computeN2Energy(
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
const unsigned int localAtomIndex = LOCAL_ID;
local_pos[localAtomIndex] = pos1;
LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
}
if (needEnergy)
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
......@@ -98,11 +102,11 @@ __kernel void computeN2Energy(
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = LOCAL_ID;
unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex] = 0;
local_force[localAtomIndex] = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
SYNC_WARPS;
#ifdef USE_EXCLUSIONS
......@@ -111,8 +115,8 @@ __kernel void computeN2Energy(
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -126,7 +130,7 @@ __kernel void computeN2Energy(
atom2 = y*TILE_SIZE+tj;
real dEdR = 0;
real tempEnergy = 0;
const real interactionScale = 1.0f;
const real interactionScale = 1;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
......@@ -136,10 +140,12 @@ __kernel void computeN2Energy(
}
if (needEnergy)
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
local_force[atom2] += delta;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
......@@ -151,20 +157,20 @@ __kernel void computeN2Energy(
SYNC_WARPS;
}
}
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1
if (x != y) {
offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
STORE_DERIVATIVES_2
}
#else
......@@ -175,7 +181,7 @@ __kernel void computeN2Energy(
STORE_DERIVATIVES_1
if (x != y) {
offset = offset2;
forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
STORE_DERIVATIVES_2
}
#endif
......@@ -188,21 +194,21 @@ __kernel void computeN2Energy(
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1;
LOCAL int atomIndices[LOCAL_BUFFER_SIZE];
LOCAL volatile int skipTiles[LOCAL_BUFFER_SIZE];
skipTiles[LOCAL_ID] = -1;
while (pos < end) {
const bool isExcluded = false;
real4 force = 0;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
bool includeTile = true;
......@@ -231,10 +237,10 @@ __kernel void computeN2Energy(
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[get_local_id(0)] = end;
skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
SYNC_WARPS;
......@@ -247,20 +253,20 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = LOCAL_ID;
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[get_local_id(0)] = j;
atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex] = 0;
local_force[localAtomIndex] = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
}
SYNC_WARPS;
......@@ -270,14 +276,14 @@ __kernel void computeN2Energy(
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[get_local_id(0)], blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[LOCAL_ID], blockCenterX)
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
......@@ -286,17 +292,19 @@ __kernel void computeN2Energy(
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
const real interactionScale = 1.0f;
const real interactionScale = 1;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
if (needEnergy)
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
local_force[atom2] += delta;
RECORD_DERIVATIVE_2
}
tj = (tj + 1) & (TILE_SIZE - 1);
......@@ -311,8 +319,8 @@ __kernel void computeN2Energy(
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -326,17 +334,19 @@ __kernel void computeN2Energy(
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
const real interactionScale = 1.0f;
const real interactionScale = 1;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
if (needEnergy)
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
local_force[atom2] += delta;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
......@@ -347,22 +357,22 @@ __kernel void computeN2Energy(
}
// Write results.
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)];
unsigned int atom2 = atomIndices[LOCAL_ID];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
unsigned int offset = atom1;
STORE_DERIVATIVES_1
if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], (long) (local_force[get_local_id(0)].x*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
offset = atom2;
STORE_DERIVATIVES_2
}
......@@ -373,7 +383,7 @@ __kernel void computeN2Energy(
unsigned int offset = offset1;
STORE_DERIVATIVES_1
if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
offset = offset2;
STORE_DERIVATIVES_2
}
......@@ -381,6 +391,6 @@ __kernel void computeN2Energy(
}
pos++;
}
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
SAVE_PARAM_DERIVS
}
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[tgx]*0x100000000));
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[tgx]*0x100000000)));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
......@@ -10,30 +9,33 @@
/**
* Compute a force based on pair interactions.
*/
__kernel void computeN2Energy(
KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
__global real4* restrict forceBuffers,
GLOBAL real4* RESTRICT forceBuffers,
#endif
__global mixed* restrict energyBuffer, __local real4* restrict local_force,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const ushort2* exclusionTiles, int needEnergy,
GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles, int needEnergy,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
mixed energy = 0;
INIT_PARAM_DERIVS
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
......@@ -43,7 +45,7 @@ __kernel void computeN2Energy(
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
if (x == y) {
......@@ -56,15 +58,15 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta, delta);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
......@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
......@@ -98,12 +102,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
......@@ -123,11 +127,11 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -153,8 +157,10 @@ __kernel void computeN2Energy(
dEdR /= -r;
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = j;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
......@@ -170,12 +176,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
......@@ -186,12 +192,12 @@ __kernel void computeN2Energy(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx;
atom_add(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
STORE_DERIVATIVES_2
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
......@@ -206,15 +212,15 @@ __kernel void computeN2Energy(
const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
LOCAL int atomIndices[TILE_SIZE];
while (pos < end) {
const bool isExcluded = false;
......@@ -261,7 +267,7 @@ __kernel void computeN2Energy(
#endif
atomIndices[localAtomIndex] = j;
if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES
......@@ -274,17 +280,17 @@ __kernel void computeN2Energy(
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[tgx], blockCenterX)
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
real3 pos1 = trimTo3(posq[atom1]);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = dot(delta.xyz, delta.xyz);
if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
......@@ -298,8 +304,10 @@ __kernel void computeN2Energy(
COMPUTE_INTERACTION
dEdR /= -r;
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = j;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
......@@ -310,12 +318,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
......@@ -330,11 +338,11 @@ __kernel void computeN2Energy(
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -355,10 +363,12 @@ __kernel void computeN2Energy(
COMPUTE_INTERACTION
dEdR /= -r;
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = j;
local_force[atom2].xyz += delta.xyz;
local_force[atom2] += delta;
RECORD_DERIVATIVE_2
}
}
......@@ -367,12 +377,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
......@@ -389,13 +399,13 @@ __kernel void computeN2Energy(
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], (long) (local_force[tgx].x*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
unsigned int offset = atom2;
STORE_DERIVATIVES_2
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
......@@ -404,6 +414,6 @@ __kernel void computeN2Energy(
}
pos++;
}
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
SAVE_PARAM_DERIVS
}
......@@ -9,24 +9,29 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
// NOTE(review): this span looks like a rendered diff in which the pre-change lines
// (__kernel / get_global_id) and the post-change lines (KERNEL / GLOBAL_ID) are interleaved
// without +/- markers -- confirm which lines are live before compiling.
//
// Per-particle energy kernel.  For every particle index assigned to this work item it expands
// REDUCE_DERIVATIVES and COMPUTE_ENERGY (macros defined outside this view), then adds the local
// `energy` variable to this work item's own slot of energyBuffer.
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq
KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
// The element type of forceBuffers, and whether bufferSize/numBuffers are passed at all,
// depends on SUPPORTS_64_BIT_ATOMICS.
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers, int bufferSize, int numBuffers
#endif
PARAMETER_ARGUMENTS) {
// Starts at zero; presumably accumulated by COMPUTE_ENERGY -- the macro body is not visible here.
mixed energy = 0;
INIT_PARAM_DERIVS
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Visits index = GLOBAL_ID, GLOBAL_ID+GLOBAL_SIZE, ... for as long as index < NUM_ATOMS.
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Reduce the derivatives
// totalSize is declared only when 64-bit atomics are unavailable; presumably it is consumed
// by REDUCE_DERIVATIVES to walk the per-buffer copies -- TODO confirm against the macro.
#ifndef SUPPORTS_64_BIT_ATOMICS
int totalSize = bufferSize*numBuffers;
#endif
REDUCE_DERIVATIVES
// Now calculate the per-particle energy terms.
// `pos` and a zero-initialised `force` are set up as locals immediately before COMPUTE_ENERGY
// is expanded; presumably the macro reads/writes them -- verify.
real4 pos = posq[index];
real4 force = (real4) 0;
real3 force = make_real3(0, 0, 0);
COMPUTE_ENERGY
index += get_global_size(0);
}
// Each work item adds into the energyBuffer element selected by its own global id.
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
SAVE_PARAM_DERIVS
}
......@@ -2,17 +2,30 @@
* Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/
// NOTE(review): this span looks like a rendered diff in which the pre-change lines
// (extern "C" __global__ / blockIdx / long long) and the post-change lines (KERNEL / GLOBAL_ID /
// mm_long) are interleaved without +/- markers -- confirm which lines are live before compiling.
//
// For every particle index assigned to this thread: load the particle's position and current
// force, expand COMPUTE_FORCES (macro defined outside this view; presumably it updates `force`),
// and store the force back into forceBuffers.
extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq,
// forceBuffers is declared as mm_long elements with 64-bit atomics, and as real4 elements otherwise.
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers
#endif
PARAMETER_ARGUMENTS) {
INIT_PARAM_DERIVS
// 0x100000000 is 2^32; presumably RECIP yields its reciprocal, i.e. the factor that undoes the
// 2^32 multiplication applied when forces are stored below -- TODO confirm RECIP's definition.
const real scale = RECIP((real) 0x100000000);
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Visits index = GLOBAL_ID, GLOBAL_ID+GLOBAL_SIZE, ... for as long as index < NUM_ATOMS.
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
real4 pos = posq[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
// The x, y and z components are read from three consecutive sections of forceBuffers, each
// PADDED_NUM_ATOMS elements apart, and multiplied by `scale`.
real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
#else
// One real4 per particle; presumably trimTo3 keeps only x, y, z -- verify.
real3 force = trimTo3(forceBuffers[index]);
#endif
COMPUTE_FORCES
forceBuffers[index] = (long long) (force.x*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0x100000000);
#ifdef SUPPORTS_64_BIT_ATOMICS
// Overwrite (not add to) the stored force: each component is multiplied by 2^32 and cast to mm_long.
forceBuffers[index] = (mm_long) (force.x*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS] = (mm_long) (force.y*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS*2] = (mm_long) (force.z*0x100000000);
#else
// Overwrite the stored real4; its fourth component is set to 0.
forceBuffers[index] = make_real4(force.x, force.y, force.z, 0);
#endif
}
SAVE_PARAM_DERIVS
}
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
/**
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const ushort2* exclusionTiles,
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value,
GLOBAL mm_ulong* RESTRICT global_value,
#else
__global real* restrict global_value,
GLOBAL real* RESTRICT global_value,
#endif
__local real* restrict local_value,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = LOCAL_ID - tgx;
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real local_value[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real value = 0;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
......@@ -44,14 +42,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
const unsigned int localAtomIndex = LOCAL_ID;
local_pos[localAtomIndex] = pos1;
LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -87,9 +85,9 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = LOCAL_ID;
unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex] = 0;
SYNC_WARPS;
......@@ -99,8 +97,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -141,11 +139,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = x*TILE_SIZE + tgx;
atom_add(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
STORE_PARAM_DERIVS1
if (x != y) {
unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&global_value[offset2], (long) (local_value[get_local_id(0)]*0x100000000));
ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[LOCAL_ID]*0x100000000)));
STORE_PARAM_DERIVS2
}
#else
......@@ -154,7 +152,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
global_value[offset1] += value;
STORE_PARAM_DERIVS1
if (x != y) {
global_value[offset2] += local_value[get_local_id(0)];
global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2
}
#endif
......@@ -167,17 +165,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1;
LOCAL int atomIndices[LOCAL_BUFFER_SIZE];
LOCAL volatile int skipTiles[LOCAL_BUFFER_SIZE];
skipTiles[LOCAL_ID] = -1;
while (pos < end) {
real value = 0;
......@@ -208,10 +206,10 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[get_local_id(0)] = end;
skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
SYNC_WARPS;
......@@ -225,17 +223,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = LOCAL_ID;
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[get_local_id(0)] = j;
atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex] = 0;
}
......@@ -246,14 +244,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[get_local_id(0)], blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[LOCAL_ID], blockCenterX)
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
......@@ -278,12 +276,12 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[atom2];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -313,19 +311,19 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
}
// Write results.
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)];
unsigned int atom2 = atomIndices[LOCAL_ID];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
STORE_PARAM_DERIVS1
if (atom2 < PADDED_NUM_ATOMS) {
unsigned int offset2 = atom2;
atom_add(&global_value[offset2], (long) (local_value[get_local_id(0)]*0x100000000));
ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[LOCAL_ID]*0x100000000)));
STORE_PARAM_DERIVS2
}
#else
......@@ -334,7 +332,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
STORE_PARAM_DERIVS1
if (atom2 < PADDED_NUM_ATOMS) {
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[get_local_id(0)];
global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2
}
#endif
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
/**
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const ushort2* exclusionTiles,
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const ushort2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value,
GLOBAL mm_ulong* RESTRICT global_value,
#else
__global real* restrict global_value,
GLOBAL real* RESTRICT global_value,
#endif
__local real* restrict local_value,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
LOCAL real local_value[LOCAL_BUFFER_SIZE];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
......@@ -35,7 +33,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
if (x == y) {
......@@ -47,11 +45,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0;
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -88,7 +86,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
......@@ -107,11 +105,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0;
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -150,7 +148,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
......@@ -163,7 +161,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = y*TILE_SIZE+tgx;
atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000));
ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[tgx]*0x100000000)));
#else
unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx];
......@@ -180,15 +178,15 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
#else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
int pos = (int) (get_group_id(0)*(mm_long)numTiles/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(mm_long)numTiles/get_num_groups(0));
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
LOCAL int atomIndices[TILE_SIZE];
while (pos < end) {
bool includeTile = true;
......@@ -234,7 +232,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
atomIndices[localAtomIndex] = j;
if (j < PADDED_NUM_ATOMS) {
local_posq[localAtomIndex] = posq[j];
local_pos[localAtomIndex] = trimTo3(posq[j]);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex] = 0;
}
......@@ -246,16 +244,16 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[tgx], blockCenterX)
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0;
real4 posq1 = posq[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
real3 pos1 = trimTo3(posq[atom1]);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = dot(delta.xyz, delta.xyz);
if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
......@@ -277,7 +275,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
......@@ -293,11 +291,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0;
real4 posq1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real3 pos2 = local_pos[j];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -326,7 +324,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
atom_add(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
......@@ -346,7 +344,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = atom2;
atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000));
ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[tgx]*0x100000000)));
#else
unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx];
......
......@@ -2,19 +2,18 @@
* Reduce a pairwise computed value, and compute per-particle values.
*/
__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global real4* posq,
KERNEL void computePerParticleValues(GLOBAL real4* posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* valueBuffers
GLOBAL mm_long* valueBuffers
#else
__global real* valueBuffers
GLOBAL real* valueBuffers, int bufferSize, int numBuffers
#endif
PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS
real sum = (1.0f/0x100000000)*valueBuffers[index];
real sum = valueBuffers[index]/(real) 0x100000000;
#else
int totalSize = bufferSize*numBuffers;
real sum = valueBuffers[index];
......@@ -27,6 +26,5 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global
real4 pos = posq[index];
COMPUTE_VALUES
index += get_global_size(0);
}
}
......@@ -2,8 +2,8 @@
* Compute the difference between two vectors, optionally taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
inline DEVICE real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(result)
#endif
......@@ -14,73 +14,79 @@ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxS
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*
* The cosine is the dot product of the x, y, z components multiplied by RSQRT(vec1.w*vec2.w).
* When the cosine is above 0.99 or below -0.99 the angle is instead derived from the magnitude
* of the cross product (an arcsine), and replaced by pi minus that value when the cosine is negative.
* Otherwise the angle is the arccosine of the cosine.
*
* NOTE(review): this span looks like a rendered diff in which the pre-change lines
* (asin / acos / PI, real4 cross product) and the post-change lines (ASIN / ACOS / M_PI,
* real3 cross product) are interleaved without +/- markers -- confirm which lines are live.
*/
real computeAngle(real4 vec1, real4 vec2) {
inline DEVICE real computeAngle(real4 vec1, real4 vec2) {
real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
// vec1.w*vec2.w is the product of the squared magnitudes (see the contract above), so this
// presumably divides by |vec1|*|vec2| -- assumes RSQRT is a reciprocal square root; confirm.
real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
real angle;
if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real4 crossProduct = cross(vec1, vec2);
real3 crossProduct = cross(trimTo3(vec1), trimTo3(vec2));
real scale = vec1.w*vec2.w;
// dot(crossProduct, crossProduct)/scale is the squared cross-product magnitude divided by the
// product of the squared input magnitudes; its square root is fed to the arcsine.
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
// The arcsine of a non-negative argument is at most pi/2, so an obtuse angle (negative cosine)
// is recovered as pi minus that value.
if (cosine < 0.0f)
angle = PI-angle;
angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0)
angle = M_PI-angle;
}
else
angle = acos(cosine);
angle = ACOS(cosine);
return angle;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*
* The returned real4 holds the cross product in x, y, z, and in w the sum of the squares of
* those three components.
*
* NOTE(review): this span looks like a rendered diff in which the pre-change body (real4
* `result` from cross(vec1, vec2)) and the post-change body (real3 `cp` from
* cross(trimTo3(vec1), trimTo3(vec2))) are interleaved without +/- markers -- confirm which
* lines are live before compiling.
*/
real4 computeCross(real4 vec1, real4 vec2) {
real4 result = cross(vec1, vec2);
// Overwrite w with the squared length of the x, y, z part of the product.
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
// Presumably trimTo3 drops the w component so that cross() operates on 3-vectors -- verify.
real3 cp = cross(trimTo3(vec1), trimTo3(vec2));
// Pack the product into a real4 whose w is the squared length of the product.
return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
}
/**
* Compute forces on donors.
*/
__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
KERNEL void computeDonorForces(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT donorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE];
mixed energy = 0;
real4 f1 = (real4) 0;
real4 f2 = (real4) 0;
real4 f3 = (real4) 0;
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) {
real3 f1 = make_real3(0);
real3 f2 = make_real3(0);
real3 f3 = make_real3(0);
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += GLOBAL_SIZE) {
// Load information about the donor this thread will compute forces on.
int donorIndex = donorStart+get_global_id(0);
int donorIndex = donorStart+GLOBAL_ID;
int4 atoms, exclusionIndices;
real4 d1, d2, d3;
if (donorIndex < NUM_DONORS) {
atoms = donorAtoms[donorIndex];
d1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
d2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
d3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
#ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[donorIndex];
#endif
}
else
atoms = (int4) (-1, -1, -1, -1);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_local_size(0)) {
atoms = make_int4(-1, -1, -1, -1);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += LOCAL_SIZE) {
// Load the next block of acceptors into local memory.
barrier(CLK_LOCAL_MEM_FENCE);
int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart);
if (get_local_id(0) < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
SYNC_THREADS;
int blockSize = min((int) LOCAL_SIZE, NUM_ACCEPTORS-acceptorStart);
if (LOCAL_ID < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+LOCAL_ID];
posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
}
SYNC_THREADS;
if (donorIndex < NUM_DONORS) {
for (int index = 0; index < blockSize; index++) {
int acceptorIndex = acceptorStart+index;
......@@ -108,6 +114,26 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
// Write results
if (donorIndex < NUM_DONORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) ((mm_long) (f1.x*0x100000000)));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.y*0x100000000)));
ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.z*0x100000000)));
MEM_FENCE;
}
if (atoms.y > -1) {
ATOMIC_ADD(&force[atoms.y], (mm_ulong) ((mm_long) (f2.x*0x100000000)));
ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.y*0x100000000)));
ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.z*0x100000000)));
MEM_FENCE;
}
if (atoms.z > -1) {
ATOMIC_ADD(&force[atoms.z], (mm_ulong) ((mm_long) (f3.x*0x100000000)));
ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.y*0x100000000)));
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.z*0x100000000)));
MEM_FENCE;
}
#else
int4 bufferIndices = donorBufferIndices[donorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
......@@ -127,49 +153,57 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
#endif
}
}
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
}
/**
* Compute forces on acceptors.
*/
__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
KERNEL void computeAcceptorForces(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT acceptorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
real4 f1 = (real4) 0;
real4 f2 = (real4) 0;
real4 f3 = (real4) 0;
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) {
LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE];
real3 f1 = make_real3(0);
real3 f2 = make_real3(0);
real3 f3 = make_real3(0);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += GLOBAL_SIZE) {
// Load information about the acceptor this thread will compute forces on.
int acceptorIndex = acceptorStart+get_global_id(0);
int acceptorIndex = acceptorStart+GLOBAL_ID;
int4 atoms, exclusionIndices;
real4 a1, a2, a3;
if (acceptorIndex < NUM_ACCEPTORS) {
atoms = acceptorAtoms[acceptorIndex];
a1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
a2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
a3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
#ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[acceptorIndex];
#endif
}
else
atoms = (int4) (-1, -1, -1, -1);
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_local_size(0)) {
atoms = make_int4(-1, -1, -1, -1);
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += LOCAL_SIZE) {
// Load the next block of donors into local memory.
barrier(CLK_LOCAL_MEM_FENCE);
int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart);
if (get_local_id(0) < blockSize) {
int4 atoms2 = donorAtoms[donorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
SYNC_THREADS;
int blockSize = min((int) LOCAL_SIZE, NUM_DONORS-donorStart);
if (LOCAL_ID < blockSize) {
int4 atoms2 = donorAtoms[donorStart+LOCAL_ID];
posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
}
SYNC_THREADS;
if (acceptorIndex < NUM_ACCEPTORS) {
for (int index = 0; index < blockSize; index++) {
int donorIndex = donorStart+index;
......@@ -197,6 +231,26 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
// Write results
if (acceptorIndex < NUM_ACCEPTORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) ((mm_long) (f1.x*0x100000000)));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.y*0x100000000)));
ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.z*0x100000000)));
MEM_FENCE;
}
if (atoms.y > -1) {
ATOMIC_ADD(&force[atoms.y], (mm_ulong) ((mm_long) (f2.x*0x100000000)));
ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.y*0x100000000)));
ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.z*0x100000000)));
MEM_FENCE;
}
if (atoms.z > -1) {
ATOMIC_ADD(&force[atoms.z], (mm_ulong) ((mm_long) (f3.x*0x100000000)));
ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.y*0x100000000)));
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.z*0x100000000)));
MEM_FENCE;
}
#else
int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
......@@ -216,6 +270,7 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
#endif
}
}
}
extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer, float* result) {
__shared__ float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = threadIdx.x;
KERNEL void computeFloatSum(GLOBAL const float* RESTRICT sumBuffer, GLOBAL float* result, int bufferSize) {
LOCAL float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = LOCAL_ID;
float sum = 0;
for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE)
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
__syncthreads();
SYNC_THREADS;
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
......@@ -14,24 +14,26 @@ extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer,
*result = tempBuffer[0];
}
/**
* Add up the elements of sumBuffer in double precision and store the total in *result.
* The post-change version takes the element count as the bufferSize argument and is only
* compiled when SUPPORTS_DOUBLE_PRECISION is defined.
* NOTE(review): only the thread whose local index is 0 writes *result, so this presumably
* expects to be launched as a single work group -- confirm against the caller.
*/
extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer, double* result) {
__shared__ double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = threadIdx.x;
#ifdef SUPPORTS_DOUBLE_PRECISION
KERNEL void computeDoubleSum(GLOBAL const double* RESTRICT sumBuffer, GLOBAL double* result, int bufferSize) {
LOCAL double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = LOCAL_ID;
// Each thread accumulates the elements at thread, thread+LOCAL_SIZE, thread+2*LOCAL_SIZE, ...
double sum = 0;
for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE)
sum += sumBuffer[index];
tempBuffer[thread] = sum;
// Pairwise reduction of the per-thread partial sums: on the pass with stride i, every entry whose
// index is a multiple of 2*i absorbs the entry i slots above it, so tempBuffer[0] ends up with the total.
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
__syncthreads();
SYNC_THREADS;
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
// A single thread publishes the result.
if (thread == 0)
*result = tempBuffer[0];
}
#endif
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
KERNEL void applyPositionDeltas(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, GLOBAL mixed4* RESTRICT posDelta) {
for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
......@@ -48,14 +50,14 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4*
#else
posq[index] = pos;
#endif
posDelta[index] = make_mixed4(0, 0, 0, 0);
posDelta[index] = make_mixed4(0);
}
}
extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x];
KERNEL void generateRandomNumbers(int numValues, GLOBAL float4* RESTRICT random, GLOBAL uint4* RESTRICT seed) {
uint4 state = seed[GLOBAL_ID];
unsigned int carry = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numValues; index += blockDim.x*gridDim.x) {
for (int index = GLOBAL_ID; index < numValues; index += GLOBAL_SIZE) {
// Generate three uniform random numbers.
state.x = state.x * 69069 + 1;
......@@ -93,5 +95,5 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
random[index] = make_float4(x1, x2, x3, 0.0f);
}
seed[blockIdx.x*blockDim.x+threadIdx.x] = state;
seed[GLOBAL_ID] = state;
}
// Working types for the per-DOF integration code: double based when SUPPORTS_DOUBLE_PRECISION
// is defined, float based otherwise.  Both branches expose the same set of names.
#ifdef SUPPORTS_DOUBLE_PRECISION
typedef double TempType;
typedef double3 TempType3;
typedef double4 TempType4;
// Constructors: forward all arguments to the matching double vector constructor.
#define make_TempType3(a...) make_double3(a)
#define make_TempType4(a...) make_double4(a)
// Component-wise conversion of a value with .x/.y/.z(/.w) members.  The argument is expanded
// once per component, so it should not have side effects.
#define convertToTempType3(a) make_double3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w)
/**
* Convert a double4 to a mixed4, component by component.  Only defined in the
* double precision branch.
*/
inline DEVICE mixed4 convertFromDouble4(double4 a) {
return make_mixed4(a.x, a.y, a.z, a.w);
}
#else
typedef float TempType;
typedef float3 TempType3;
typedef float4 TempType4;
// Constructors: forward all arguments to the matching float vector constructor.
#define make_TempType3(a...) make_float3(a)
#define make_TempType4(a...) make_float4(a)
// Component-wise conversion; the argument is expanded once per component.
#define convertToTempType3(a) make_float3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w)
#endif
/**
 * Read one particle's position as a TempType4.
 *
 * With USE_MIXED_PRECISION the x, y and z components are the sum of the value in posq and
 * the matching value in posqCorrection, and w is taken from posq alone.  Otherwise the
 * entry of posq is simply converted component by component.
 */
inline DEVICE TempType4 loadPos(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, int index) {
#ifndef USE_MIXED_PRECISION
    return convertToTempType4(posq[index]);
#else
    const real4 base = posq[index];
    const real4 correction = posqCorrection[index];
    return make_TempType4(base.x+(mixed)correction.x,
                          base.y+(mixed)correction.y,
                          base.z+(mixed)correction.z,
                          base.w);
#endif
}
/**
 * Write one particle's position back to global memory.
 *
 * With USE_MIXED_PRECISION, posq receives each component cast to real and posqCorrection
 * receives what the cast lost for x, y and z (its w is set to 0).  Otherwise pos is stored
 * to posq directly and posqCorrection is not touched.
 */
inline DEVICE void storePos(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, int index, TempType4 pos) {
#ifndef USE_MIXED_PRECISION
    posq[index] = make_real4(pos.x, pos.y, pos.z, pos.w);
#else
    const real4 rounded = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
    posq[index] = rounded;
    // The remainder is whatever did not fit into the real-precision value.
    posqCorrection[index] = make_real4(pos.x-rounded.x, pos.y-rounded.y, pos.z-rounded.z, 0);
#endif
}
/**
 * Run one per-degree-of-freedom step of a custom integrator for every particle.
 *
 * For each particle handled by this thread the kernel loads the position (optionally offset
 * by posDelta when LOAD_POS_AS_DELTA is defined), the velocity and the force, and then
 * executes the generated COMPUTE_STEP code.  Particles whose velm.w is zero are skipped.
 *
 * The local names used below (position, velocity, f, mass, stepSize, gaussianIndex,
 * uniformIndex, index) are left unchanged because COMPUTE_STEP is expanded in this scope
 * and may refer to any of them.
 *
 * @param force  fixed-point forces laid out as three consecutive arrays of
 *               PADDED_NUM_ATOMS entries (x, then y, then z)
 * @param dt     dt[0].y is used as the step size
 */
KERNEL void computePerDof(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, GLOBAL mixed4* RESTRICT posDelta,
        GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed2* RESTRICT dt, GLOBAL const mixed* RESTRICT globals,
        GLOBAL mixed* RESTRICT sum, GLOBAL const float4* RESTRICT gaussianValues, unsigned int gaussianBaseIndex, GLOBAL const float4* RESTRICT uniformValues,
        const mixed energy, GLOBAL mixed* RESTRICT energyParamDerivs
        PARAMETER_ARGUMENTS) {
    TempType3 stepSize = make_TempType3(dt[0].y);
    int index = GLOBAL_ID;
    // Forces are accumulated as 64-bit fixed point values scaled by 0x100000000 (2^32), so the
    // decoding factor must be the exact reciprocal of that scale, not 1/0xFFFFFFFF.
    const TempType forceScale = ((TempType) 1)/0x100000000;
    while (index < NUM_ATOMS) {
#ifdef LOAD_POS_AS_DELTA
        TempType4 position = loadPos(posq, posqCorrection, index) + convertToTempType4(posDelta[index]);
#else
        TempType4 position = loadPos(posq, posqCorrection, index);
#endif
        TempType4 velocity = convertToTempType4(velm[index]);
        TempType3 f = make_TempType3(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2]);
        // velm.w is inverted to obtain the value exposed as "mass"; it is only consumed inside
        // the guard below, so a zero w never reaches COMPUTE_STEP.
        TempType3 mass = make_TempType3(RECIP(velocity.w));
        if (velocity.w != 0.0) {
            int gaussianIndex = gaussianBaseIndex;
            int uniformIndex = 0;
            COMPUTE_STEP
        }
        index += GLOBAL_SIZE;
    }
}
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
/**
* Record the force on an atom to global memory.
*/
inline void storeForce(int atom, real4 force, __global long* restrict forceBuffers) {
atom_add(&forceBuffers[atom], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
// Adds a force to the entries for one atom in forceBuffers.  The buffer holds three
// consecutive arrays of PADDED_NUM_ATOMS entries (x, then y, then z).
inline DEVICE void storeForce(int atom, real3 force, GLOBAL mm_ulong* RESTRICT forceBuffers) {
    // Scale each component by 0x100000000, truncate to a signed 64-bit integer, then
    // reinterpret it as unsigned for the atomic add.
    const mm_ulong fixedX = (mm_ulong) ((mm_long) (force.x*0x100000000));
    const mm_ulong fixedY = (mm_ulong) ((mm_long) (force.y*0x100000000));
    const mm_ulong fixedZ = (mm_ulong) ((mm_long) (force.z*0x100000000));
    ATOMIC_ADD(&forceBuffers[atom], fixedX);
    ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], fixedY);
    ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], fixedZ);
}
/**
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
inline DEVICE real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(result)
#endif
......@@ -26,36 +23,36 @@ inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPerio
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*
* vec1, vec2: x, y, z hold the vector components; w holds x*x+y*y+z*z.
* Returns the angle between the two vectors (presumably in radians, given the use of M_PI).
*/
real computeAngle(real4 vec1, real4 vec2) {
DEVICE real computeAngle(real4 vec1, real4 vec2) {
real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
// cos(angle) = (v1 . v2)/(|v1| |v2|); the product of the squared magnitudes is |v1|^2 |v2|^2.
// NOTE(review): RSQRT is presumably the reciprocal square root -- confirm its definition.
real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
real angle;
if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real4 crossProduct = cross(vec1, vec2);
real3 crossProduct = trimTo3(cross(vec1, vec2));
real scale = vec1.w*vec2.w;
// |v1 x v2|^2/(|v1|^2 |v2|^2) is sin^2(angle).
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
// The sine alone cannot distinguish an angle from its supplement, so use the sign of the cosine.
if (cosine < 0.0f)
angle = M_PI-angle;
}
else
angle = acos(cosine);
angle = ACOS(cosine);
return angle;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
inline real4 computeCross(real4 vec1, real4 vec2) {
real4 cp = cross(vec1, vec2);
return (real4) (cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
// Returns vec1 x vec2 in x, y, z with the squared length of that product in w.
inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
    const real3 product = trimTo3(cross(vec1, vec2));
    const real lengthSquared = product.x*product.x+product.y*product.y+product.z*product.z;
    return make_real4(product.x, product.y, product.z, lengthSquared);
}
/**
* Determine whether a particular interaction is in the list of exclusions.
*/
inline bool isInteractionExcluded(int atom1, int atom2, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) {
inline DEVICE bool isInteractionExcluded(int atom1, int atom2, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
if (atom1 > atom2) {
int temp = atom1;
atom1 = atom2;
......@@ -76,24 +73,24 @@ inline bool isInteractionExcluded(int atom1, int atom2, __global const int* rest
/**
* Compute the interaction.
*/
__kernel void computeInteraction(
__global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq,
KERNEL void computeInteraction(
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF
, __global const int* restrict neighbors, __global const int* restrict neighborStartIndex
, GLOBAL const int* RESTRICT neighbors, GLOBAL const int* RESTRICT neighborStartIndex
#endif
#ifdef USE_FILTERS
, __global int* restrict particleTypes, __global int* restrict orderIndex, __global int* restrict particleOrder
, GLOBAL int* RESTRICT particleTypes, GLOBAL int* RESTRICT orderIndex, GLOBAL int* RESTRICT particleOrder
#endif
#ifdef USE_EXCLUSIONS
, __global int* restrict exclusions, __global int* restrict exclusionStartIndex
, GLOBAL int* RESTRICT exclusions, GLOBAL int* RESTRICT exclusionStartIndex
#endif
PARAMETER_ARGUMENTS) {
mixed energy = 0;
// Loop over particles to be the first one in the set.
for (int p1 = get_group_id(0); p1 < NUM_ATOMS; p1 += get_num_groups(0)) {
for (int p1 = GROUP_ID; p1 < NUM_ATOMS; p1 += NUM_GROUPS) {
#ifdef USE_CENTRAL_PARTICLE
const int a1 = p1;
#else
......@@ -110,7 +107,7 @@ __kernel void computeInteraction(
#endif
#endif
int numCombinations = NUM_CANDIDATE_COMBINATIONS;
for (int index = get_local_id(0); index < numCombinations; index += get_local_size(0)) {
for (int index = LOCAL_ID; index < numCombinations; index += LOCAL_SIZE) {
FIND_ATOMS_FOR_COMBINATION_INDEX;
bool includeInteraction = IS_VALID_COMBINATION;
#ifdef USE_CUTOFF
......@@ -135,15 +132,15 @@ __kernel void computeInteraction(
}
}
}
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
}
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict numNeighborPairs) {
int index = get_global_id(0);
KERNEL void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT numNeighborPairs) {
int index = GLOBAL_ID;
int base = index*TILE_SIZE;
while (base < NUM_ATOMS) {
real4 pos = posq[base];
......@@ -159,37 +156,39 @@ __kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, r
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = (real4) (min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = (real4) (max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
}
real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
index += get_global_size(0);
index += GLOBAL_SIZE;
base = index*TILE_SIZE;
}
if (get_group_id(0) == 0 && get_local_id(0) == 0)
if (GROUP_ID == 0 && LOCAL_ID == 0)
*numNeighborPairs = 0;
}
/**
* Find a list of neighbors for each atom.
*/
__kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global const real4* restrict blockCenter, __global const real4* restrict blockBoundingBox, __global int2* restrict neighborPairs,
__global int* restrict numNeighborPairs, __global int* restrict numNeighborsForAtom, int maxNeighborPairs
KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT blockCenter, GLOBAL const real4* RESTRICT blockBoundingBox, GLOBAL int2* RESTRICT neighborPairs,
GLOBAL int* RESTRICT numNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, int maxNeighborPairs
#ifdef USE_EXCLUSIONS
, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex
, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex
#endif
) {
__local real4 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
__local bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
int indexInWarp = get_local_id(0)%32;
int warpStart = get_local_id(0)-indexInWarp;
for (int atom1 = get_global_id(0); atom1 < PADDED_NUM_ATOMS; atom1 += get_global_size(0)) {
LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
int indexInWarp = LOCAL_ID%32;
#ifndef __CUDA_ARCH__
LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
int warpStart = LOCAL_ID-indexInWarp;
#endif
for (int atom1 = GLOBAL_ID; atom1 < PADDED_NUM_ATOMS; atom1 += GLOBAL_SIZE) {
// Load data for this atom. Note that all threads in a warp are processing atoms from the same block.
real4 pos1 = posq[atom1];
real3 pos1 = trimTo3(posq[atom1]);
int block1 = atom1/TILE_SIZE;
real4 blockCenter1 = blockCenter[block1];
real4 blockSize1 = blockBoundingBox[block1];
......@@ -221,10 +220,18 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
// Loop over any blocks we identified as potentially containing neighbors.
includeBlockFlags[get_local_id(0)] = includeBlock2;
#ifdef __CUDA_ARCH__
int includeBlockFlags = BALLOT(includeBlock2);
while (includeBlockFlags != 0) {
int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1;
{
#else
includeBlockFlags[LOCAL_ID] = includeBlock2;
SYNC_WARPS;
for (int i = 0; i < TILE_SIZE; i++) {
if (includeBlockFlags[warpStart+i]) {
#endif
int block2 = block2Base+i;
// Loop over atoms in this block.
......@@ -233,12 +240,12 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
int included[TILE_SIZE];
int numIncluded = 0;
SYNC_WARPS;
positionCache[get_local_id(0)] = posq[start+indexInWarp];
positionCache[LOCAL_ID] = trimTo3(posq[start+indexInWarp]);
SYNC_WARPS;
if (atom1 < NUM_ATOMS) {
for (int j = 0; j < 32; j++) {
int atom2 = start+j;
real4 pos2 = positionCache[get_local_id(0)-indexInWarp+j];
real3 pos2 = positionCache[LOCAL_ID-indexInWarp+j];
// Decide whether to include this atom pair in the neighbor list.
......@@ -260,10 +267,10 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
// If we found any neighbors, store them to the neighbor list.
if (numIncluded > 0) {
int baseIndex = atom_add(numNeighborPairs, numIncluded);
int baseIndex = ATOMIC_ADD(numNeighborPairs, numIncluded);
if (baseIndex+numIncluded <= maxNeighborPairs)
for (int j = 0; j < numIncluded; j++)
neighborPairs[baseIndex+j] = (int2) (atom1, included[j]);
neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
totalNeighborsForAtom1 += numIncluded;
}
}
......@@ -279,59 +286,59 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
* Sum the neighbor counts to compute the start position of each atom. This kernel
* is executed as a single work group.
*/
__kernel void computeNeighborStartIndices(__global int* restrict numNeighborsForAtom, __global int* restrict neighborStartIndex,
__global int* restrict numNeighborPairs, int maxNeighborPairs) {
__local unsigned int posBuffer[256];
KERNEL void computeNeighborStartIndices(GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL int* RESTRICT neighborStartIndex,
GLOBAL int* RESTRICT numNeighborPairs, int maxNeighborPairs) {
// numNeighborsForAtom: per-atom neighbor counts on entry; every entry is reset to 0 on exit.
// neighborStartIndex:  receives NUM_ATOMS+1 values; entry i+1 is the sum of the counts of atoms 0..i
//                      and entry 0 is 0.
// numNeighborPairs:    number of pairs found; if it exceeds maxNeighborPairs all start indices are zeroed.
// NOTE(review): posBuffer is indexed by LOCAL_ID and has 256 entries, so the work group size
// presumably must not exceed 256 -- confirm against the launch code.
LOCAL unsigned int posBuffer[256];
if (*numNeighborPairs > maxNeighborPairs) {
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it. Set the neighbor start
// indices to indicate no neighbors for any atom.
for (int i = get_local_id(0); i <= NUM_ATOMS; i += get_local_size(0))
for (int i = LOCAL_ID; i <= NUM_ATOMS; i += LOCAL_SIZE)
neighborStartIndex[i] = 0;
return;
}
// Running total of the counts of all atoms handled in earlier passes of the loop below.
unsigned int globalOffset = 0;
for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += get_local_size(0)) {
for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += LOCAL_SIZE) {
// Load the neighbor counts into local memory.
unsigned int globalIndex = startAtom+get_local_id(0);
posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int globalIndex = startAtom+LOCAL_ID;
posBuffer[LOCAL_ID] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
SYNC_THREADS;
// Perform a parallel prefix sum.
// Each pass reads the value `step` slots below, synchronizes, and only then adds it, so no
// thread overwrites an entry another thread still has to read.
for (unsigned int step = 1; step < get_local_size(0); step *= 2) {
unsigned int add = (get_local_id(0) >= step ? posBuffer[get_local_id(0)-step] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
posBuffer[get_local_id(0)] += add;
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int step = 1; step < LOCAL_SIZE; step *= 2) {
unsigned int add = (LOCAL_ID >= step ? posBuffer[LOCAL_ID-step] : 0);
SYNC_THREADS;
posBuffer[LOCAL_ID] += add;
SYNC_THREADS;
}
// Write the results back to global memory.
if (globalIndex < NUM_ATOMS) {
neighborStartIndex[globalIndex+1] = posBuffer[get_local_id(0)]+globalOffset;
neighborStartIndex[globalIndex+1] = posBuffer[LOCAL_ID]+globalOffset;
numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
}
// The last slot holds the total for this pass; carry it into the next pass.
globalOffset += posBuffer[get_local_size(0)-1];
barrier(CLK_LOCAL_MEM_FENCE);
globalOffset += posBuffer[LOCAL_SIZE-1];
SYNC_THREADS;
}
if (get_local_id(0) == 0)
if (LOCAL_ID == 0)
neighborStartIndex[0] = 0;
}
/**
* Assemble the final neighbor list.
*
* Each pair (x, y) in neighborPairs is written as y into the segment of `neighbors` that
* starts at neighborStartIndex[x].  numNeighborsForAtom[x] is incremented atomically for
* every pair and supplies the position within that segment.
* Nothing is written if *numNeighborPairs exceeds maxNeighborPairs.
*/
__kernel void copyPairsToNeighborList(__global const int2* restrict neighborPairs, __global int* restrict neighbors, __global int* restrict numNeighborPairs,
int maxNeighborPairs, __global int* restrict numNeighborsForAtom, __global const int* restrict neighborStartIndex) {
KERNEL void copyPairsToNeighborList(GLOBAL const int2* RESTRICT neighborPairs, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT numNeighborPairs,
int maxNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL const int* RESTRICT neighborStartIndex) {
int actualPairs = *numNeighborPairs;
if (actualPairs > maxNeighborPairs)
return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
for (unsigned int index = get_global_id(0); index < actualPairs; index += get_global_size(0)) {
for (unsigned int index = GLOBAL_ID; index < actualPairs; index += GLOBAL_SIZE) {
int2 pair = neighborPairs[index];
int startIndex = neighborStartIndex[pair.x];
// NOTE(review): this relies on the atomic add returning the counter's value from before
// the increment, so that each pair gets a distinct slot -- confirm for ATOMIC_ADD.
int offset = atom_add(numNeighborsForAtom+pair.x, 1);
int offset = ATOMIC_ADD(numNeighborsForAtom+pair.x, 1);
neighbors[startIndex+offset] = pair.y;
}
}
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct {
real x, y, z;
real q;
......@@ -16,60 +12,69 @@ typedef struct {
* Find the maximum of a value across all threads in a warp, and return that to
* every thread.
*/
int reduceMax(int val, __local int* temp) {
int indexInWarp = get_local_id(0)%32;
temp[get_local_id(0)] = val;
// val:  the value contributed by the calling thread.
// temp: local scratch array indexed by LOCAL_ID; it is only used on the fallback path and
//       is overwritten by this call.
// Returns the largest val among the 32 threads that share the caller's warp.
DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
    // CUDA lets us do this slightly more efficiently by using shuffle operations.
    for (int mask = 16; mask > 0; mask /= 2)
        val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
    return val;
#else
    int indexInWarp = LOCAL_ID%32;
    temp[LOCAL_ID] = val;
    SYNC_WARPS;
    for (int offset = 16; offset > 0; offset /= 2) {
        // Only the lower `offset` lanes fold in the value `offset` slots above them.  This keeps
        // every read inside the warp's own 32 slots and funnels the maximum down to lane 0,
        // which is the slot returned below.
        if (indexInWarp < offset)
            temp[LOCAL_ID] = max(temp[LOCAL_ID], temp[LOCAL_ID+offset]);
        SYNC_WARPS;
    }
    // LOCAL_ID-indexInWarp is the slot of lane 0 of this warp.
    return temp[LOCAL_ID-indexInWarp];
#endif
}
#ifndef SUPPORTS_64_BIT_ATOMICS
/**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom.
*
* forceBuffers: force accumulation buffer; the entry written is atomIndex + warp*PADDED_NUM_ATOMS.
* localData:    per-thread scratch; the fx/fy/fz fields must already hold each thread's force.
*               The x field is overwritten with the atom index.
* atomIndex:    the atom the calling thread computed a force for.
*/
void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) {
localData[get_local_id(0)].x = atomIndex;
void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
// Publish which atom this thread wants to update so the other threads can compare against it.
localData[LOCAL_ID].x = atomIndex;
SYNC_WARPS;
real4 forceSum = (real4) 0;
int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE;
real4 forceSum = make_real4(0);
int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
// NOTE(review): start is aligned to TILE_SIZE but the scan covers 32 entries, so this
// presumably assumes TILE_SIZE is 32 -- confirm.
int end = start+32;
// Sum the forces of every thread targeting the same atom.  isFirst stays true only if no
// matching thread has a lower index than this one.
bool isFirst = true;
for (int i = start; i < end; i++)
if (localData[i].x == atomIndex) {
forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
isFirst &= (i >= get_local_id(0));
isFirst &= (i >= LOCAL_ID);
}
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
// Exactly one thread per distinct atom writes the combined force.
if (isFirst)
forceBuffers[offset] += forceSum;
SYNC_WARPS;
}
#endif
__kernel void computeInteractionGroups(
KERNEL void computeInteractionGroups(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
__global real4* restrict forceBuffers,
GLOBAL real4* RESTRICT forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
__global int* restrict numGroupTiles, int useNeighborList,
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = LOCAL_ID - tgx; // block warpIndex
mixed energy = 0;
INIT_DERIVATIVES
__local AtomData localData[LOCAL_MEMORY_SIZE];
__local int reductionBuffer[LOCAL_MEMORY_SIZE];
LOCAL AtomData localData[LOCAL_MEMORY_SIZE];
LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];
const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
......@@ -82,16 +87,16 @@ __kernel void computeInteractionGroups(
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
real4 force = (real4) (0);
real3 force = make_real3(0);
real4 posq2 = posq[atom2];
localData[get_local_id(0)].x = posq2.x;
localData[get_local_id(0)].y = posq2.y;
localData[get_local_id(0)].z = posq2.z;
localData[get_local_id(0)].q = posq2.w;
localData[LOCAL_ID].x = posq2.x;
localData[LOCAL_ID].y = posq2.y;
localData[LOCAL_ID].z = posq2.z;
localData[LOCAL_ID].q = posq2.w;
LOAD_LOCAL_PARAMETERS
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[LOCAL_ID].fx = 0.0f;
localData[LOCAL_ID].fy = 0.0f;
localData[LOCAL_ID].fz = 0.0f;
int tj = tgx;
int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
SYNC_WARPS;
......@@ -99,8 +104,8 @@ __kernel void computeInteractionGroups(
if (j < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
posq2 = (real4) (localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -117,35 +122,38 @@ __kernel void computeInteractionGroups(
COMPUTE_INTERACTION
energy += tempEnergy;
delta *= dEdR;
force.xyz -= delta.xyz;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[localIndex].fx += delta.x;
localData[localIndex].fy += delta.y;
localData[localIndex].fz += delta.z;
#ifdef USE_CUTOFF
}
#endif
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
}
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS;
}
#ifdef SUPPORTS_64_BIT_ATOMICS
if (exclusions != 0) {
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
}
atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
SYNC_WARPS;
#else
writeForces(forceBuffers, localData, atom2);
localData[get_local_id(0)].fx = force.x;
localData[get_local_id(0)].fy = force.y;
localData[get_local_id(0)].fz = force.z;
localData[LOCAL_ID].fx = force.x;
localData[LOCAL_ID].fy = force.y;
localData[LOCAL_ID].fz = force.z;
writeForces(forceBuffers, localData, atom1);
#endif
}
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
SAVE_DERIVATIVES
}
......@@ -153,7 +161,7 @@ __kernel void computeInteractionGroups(
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread.
*/
__kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles) {
// Clears the tile count when rebuildNeighborList[0] is 1; otherwise leaves it unchanged.
KERNEL void prepareToBuildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles) {
    // Nothing to do unless a rebuild has been requested.
    if (rebuildNeighborList[0] != 1)
        return;
    numGroupTiles[0] = 0;
}
......@@ -162,8 +170,8 @@ __kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborL
* Filter the list of tiles to include only ones that have interactions within the
* padded cutoff.
*/
__kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles,
__global const real4* restrict posq, __global const int4* restrict groupData, __global int4* restrict filteredGroupData,
KERNEL void buildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles,
GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData, GLOBAL int4* RESTRICT filteredGroupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// If the neighbor list doesn't need to be rebuilt on this step, return immediately.
......@@ -171,15 +179,15 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if (rebuildNeighborList[0] == 0)
return;
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
const unsigned int local_warp = get_local_id(0)/TILE_SIZE; // local warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex
__local real4 localPos[LOCAL_MEMORY_SIZE];
__local volatile bool anyInteraction[WARPS_IN_BLOCK];
__local volatile int tileIndex[WARPS_IN_BLOCK];
__local int reductionBuffer[LOCAL_MEMORY_SIZE];
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
const unsigned int local_warp = LOCAL_ID/TILE_SIZE; // local warpIndex
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = LOCAL_ID - tgx; // block warpIndex
LOCAL real4 localPos[LOCAL_MEMORY_SIZE];
LOCAL volatile bool anyInteraction[WARPS_IN_BLOCK];
LOCAL volatile int tileIndex[WARPS_IN_BLOCK];
LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];
const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
......@@ -191,7 +199,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
localPos[get_local_id(0)] = posq[atom2];
localPos[LOCAL_ID] = posq[atom2];
if (tgx == 0)
anyInteraction[local_warp] = false;
int tj = tgx;
......@@ -199,10 +207,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
SYNC_WARPS;
for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
SYNC_WARPS;
if (j < rangeEnd) {
if (j < rangeEnd && tj < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0);
real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -216,7 +224,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if (anyInteraction[local_warp]) {
SYNC_WARPS;
if (tgx == 0)
tileIndex[local_warp] = atomic_add(numGroupTiles, 1);
tileIndex[local_warp] = ATOMIC_ADD(numGroupTiles, 1);
SYNC_WARPS;
filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
}
......
......@@ -4,10 +4,10 @@
/**
* Calculate the ellipsoid coordinate frames and associated matrices.
*/
extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
const float4* __restrict__ sigParams, const float4* __restrict__ scale, real* __restrict__ aMatrix,
real* __restrict__ bMatrix, real* __restrict__ gMatrix, const int* sortedParticles) {
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
KERNEL void computeEllipsoidFrames(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
GLOBAL const float4* RESTRICT sigParams, GLOBAL const float4* RESTRICT scale, GLOBAL real* RESTRICT aMatrix,
GLOBAL real* RESTRICT bMatrix, GLOBAL real* RESTRICT gMatrix, GLOBAL const int* sortedParticles) {
for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
// Compute the local coordinate system of the ellipsoid;
int originalIndex = sortedParticles[sortedIndex];
......@@ -36,9 +36,9 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
// Compute matrices we will need later.
real (*a)[3] = (real (*)[3]) (aMatrix+sortedIndex*9);
real (*b)[3] = (real (*)[3]) (bMatrix+sortedIndex*9);
real (*g)[3] = (real (*)[3]) (gMatrix+sortedIndex*9);
GLOBAL real (*a)[3] = (GLOBAL real (*)[3]) (aMatrix+sortedIndex*9);
GLOBAL real (*b)[3] = (GLOBAL real (*)[3]) (bMatrix+sortedIndex*9);
GLOBAL real (*g)[3] = (GLOBAL real (*)[3]) (gMatrix+sortedIndex*9);
a[0][0] = xdir.x;
a[0][1] = xdir.y;
a[0][2] = xdir.z;
......@@ -62,10 +62,10 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const int* sortedAtoms, const real4* __restrict__ posq, real4* __restrict__ sortedPos, real4* __restrict__ blockCenter,
real4* __restrict__ blockBoundingBox, int* __restrict__ neighborBlockCount) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
KERNEL void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
GLOBAL const int* sortedAtoms, GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter,
GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighborBlockCount) {
int index = GLOBAL_ID;
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[sortedAtoms[base]];
......@@ -89,19 +89,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
index += blockDim.x*gridDim.x;
index += GLOBAL_SIZE;
base = index*TILE_SIZE;
}
if (blockIdx.x*blockDim.x+threadIdx.x == 0)
if (GLOBAL_ID == 0)
*neighborBlockCount = 0;
}
/**
* This is called by findNeighbors() to write a block to the neighbor list.
*/
__device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) {
int blockIndex = atomicAdd(neighborBlockCount, 1);
DEVICE void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors,
GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount) {
int blockIndex = ATOMIC_ADD(neighborBlockCount, 1);
if (blockIndex >= maxNeighborBlocks)
return; // We don't have enough room for the neighbor list.
neighborIndex[blockIndex] = atom1;
......@@ -115,12 +115,12 @@ __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuf
/**
* Build a list of neighbors for each atom.
*/
extern "C" __global__ void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
KERNEL void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighbors,
GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1];
real4 pos = sortedPos[atom1];
......@@ -178,8 +178,8 @@ typedef struct {
real a[3][3], b[3][3], g[3][3];
} AtomData;
__device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams,
const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) {
DEVICE void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, GLOBAL const real4* RESTRICT pos, GLOBAL const float4* RESTRICT sigParams,
GLOBAL const float2* RESTRICT epsParams, GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix) {
data->sig = sigParams[originalIndex];
data->eps = epsParams[originalIndex];
data->pos = trimTo3(pos[sortedIndex]);
......@@ -192,19 +192,19 @@ __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex,
}
}
/**
 * Multiply a 3x3 matrix by a column vector: component i of the result is the
 * dot product of row i of m with v.
 */
inline __device__ real3 matrixVectorProduct(real (*m)[3], real3 v) {
inline DEVICE real3 matrixVectorProduct(real (*m)[3], real3 v) {
return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
}
/**
 * Multiply a row vector by a 3x3 matrix: component j of the result is the
 * dot product of v with column j of m (equivalently, the transpose of m
 * applied to v).
 */
inline __device__ real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
inline DEVICE real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
}
inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
inline DEVICE void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[0][0] = a[0][0]+b[0][0];
result[0][1] = a[0][1]+b[0][1];
result[0][2] = a[0][2]+b[0][2];
......@@ -216,12 +216,12 @@ inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3])
result[2][2] = a[2][2]+b[2][2];
}
/**
 * Compute the determinant of a 3x3 matrix as the sum of its three positive
 * triple products minus its three negative triple products.
 */
inline __device__ real determinant(real (*m)[3]) {
inline DEVICE real determinant(real (*m)[3]) {
return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
}
inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
inline DEVICE void matrixInverse(real (*result)[3], real (*m)[3]) {
real invDet = RECIP(determinant(m));
result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
......@@ -234,7 +234,7 @@ inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
}
__device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
DEVICE void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
real rInv = RSQRT(r2);
real r = r2*rInv;
real3 drUnit = dr*rInv;
......@@ -335,25 +335,25 @@ __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sig
/**
* Compute the interactions.
*/
extern "C" __global__ void computeForce(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers,
int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos,
const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms,
const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix,
const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex,
const int4* __restrict__ exceptionParticles, const float2* __restrict__ exceptionParams
KERNEL void computeForce(
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT torqueBuffers,
int numAtoms, int numExceptions, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT pos,
GLOBAL const float4* RESTRICT sigParams, GLOBAL const float2* RESTRICT epsParams, GLOBAL const int* RESTRICT sortedAtoms,
GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix,
GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex,
GLOBAL const int4* RESTRICT exceptionParticles, GLOBAL const float2* RESTRICT exceptionParams
#ifdef USE_CUTOFF
, int maxNeighborBlocks, int* __restrict__ neighbors, int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount,
, int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
mixed energy = 0;
#ifdef USE_CUTOFF
const int numBlocks = *neighborBlockCount;
if (numBlocks > maxNeighborBlocks)
return; // There wasn't enough memory for the neighbor list.
for (int block = blockIdx.x*blockDim.x+threadIdx.x; block < numBlocks; block += blockDim.x*gridDim.x) {
for (int block = GLOBAL_ID; block < numBlocks; block += GLOBAL_SIZE) {
// Load parameters for atom1.
int atom1 = neighborIndex[block];
......@@ -384,22 +384,22 @@ extern "C" __global__ void computeForce(
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
}
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
}
#else
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
// Load parameters for atom1.
int index1 = sortedAtoms[atom1];
......@@ -432,25 +432,25 @@ extern "C" __global__ void computeForce(
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
}
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
}
#endif
// Now compute exceptions.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numExceptions; index += blockDim.x*gridDim.x) {
for (int index = GLOBAL_ID; index < numExceptions; index += GLOBAL_SIZE) {
int4 atomIndices = exceptionParticles[index];
float2 params = exceptionParams[index];
int index1 = atomIndices.x, index2 = atomIndices.y;
......@@ -466,34 +466,34 @@ extern "C" __global__ void computeForce(
if (r2 < CUTOFF_SQUARED) {
#endif
computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
#ifdef USE_CUTOFF
}
#endif
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
energyBuffer[GLOBAL_ID] += energy;
}
/**
* Convert the torques to forces on the connected particles.
*/
extern "C" __global__ void applyTorques(
unsigned long long* __restrict__ forceBuffers, long long* __restrict__ torqueBuffers,
int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
const int* sortedParticles) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
KERNEL void applyTorques(
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const mm_long* RESTRICT torqueBuffers,
int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
GLOBAL const int* sortedParticles) {
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
int originalIndex = sortedParticles[sortedIndex];
real3 pos = trimTo3(posq[originalIndex]);
int2 axisParticles = axisParticleIndices[originalIndex];
......@@ -522,16 +522,16 @@ extern "C" __global__ void applyTorques(
yforce += f;
force -= f;
}
atomicAdd(&forceBuffers[originalIndex], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x], static_cast<unsigned long long>((long long) (xforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[originalIndex], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[axisParticles.x], (mm_ulong) ((mm_long) (xforce.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.z*0x100000000)));
if (axisParticles.y != -1) {
atomicAdd(&forceBuffers[axisParticles.y], static_cast<unsigned long long>((long long) (yforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[axisParticles.y], (mm_ulong) ((mm_long) (yforce.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.z*0x100000000)));
}
}
}
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct {
......@@ -13,26 +10,26 @@ typedef struct {
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(
KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum,
GLOBAL mm_ulong* RESTRICT global_bornSum,
#else
__global real* restrict global_bornSum,
GLOBAL real* RESTRICT global_bornSum,
#endif
__global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params,
GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
__local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
GLOBAL const ushort2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = LOCAL_ID - tgx;
LOCAL AtomData1 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
......@@ -42,7 +39,7 @@ __kernel void computeBornSum(
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real bornSum = 0.0f;
real bornSum = 0;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
......@@ -50,15 +47,15 @@ __kernel void computeBornSum(
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = charge1;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
localData[LOCAL_ID].x = posq1.x;
localData[LOCAL_ID].y = posq1.y;
localData[LOCAL_ID].z = posq1.z;
localData[LOCAL_ID].q = charge1;
localData[LOCAL_ID].radius = params1.x;
localData[LOCAL_ID].scaledRadius = params1.y;
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -70,7 +67,7 @@ __kernel void computeBornSum(
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -91,21 +88,21 @@ __kernel void computeBornSum(
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j];
localData[LOCAL_ID].x = tempPosq.x;
localData[LOCAL_ID].y = tempPosq.y;
localData[LOCAL_ID].z = tempPosq.z;
localData[LOCAL_ID].q = charge[j];
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
localData[get_local_id(0)].bornSum = 0.0f;
localData[LOCAL_ID].radius = tempParams.x;
localData[LOCAL_ID].scaledRadius = tempParams.y;
localData[LOCAL_ID].bornSum = 0.0f;
SYNC_WARPS;
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -117,7 +114,7 @@ __kernel void computeBornSum(
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -151,17 +148,17 @@ __kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
if (x != y)
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif
}
......@@ -172,17 +169,17 @@ __kernel void computeBornSum(
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1;
LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[LOCAL_ID] = -1;
while (pos < end) {
real bornSum = 0;
......@@ -213,10 +210,10 @@ __kernel void computeBornSum(
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[get_local_id(0)] = end;
skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
SYNC_WARPS;
......@@ -238,17 +235,17 @@ __kernel void computeBornSum(
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[get_local_id(0)] = j;
atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j];
localData[LOCAL_ID].x = tempPosq.x;
localData[LOCAL_ID].y = tempPosq.y;
localData[LOCAL_ID].z = tempPosq.z;
localData[LOCAL_ID].q = charge[j];
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
localData[get_local_id(0)].bornSum = 0.0f;
localData[LOCAL_ID].radius = tempParams.x;
localData[LOCAL_ID].scaledRadius = tempParams.y;
localData[LOCAL_ID].bornSum = 0.0f;
}
SYNC_WARPS;
#ifdef USE_PERIODIC
......@@ -258,17 +255,17 @@ __kernel void computeBornSum(
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -304,7 +301,7 @@ __kernel void computeBornSum(
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -317,7 +314,7 @@ __kernel void computeBornSum(
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -350,20 +347,20 @@ __kernel void computeBornSum(
// Write results.
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)];
unsigned int atom2 = atomIndices[LOCAL_ID];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
if (atom2 < PADDED_NUM_ATOMS)
atom_add(&global_bornSum[atom2], (long) (localData[get_local_id(0)].bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
if (atom2 < PADDED_NUM_ATOMS)
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif
}
pos++;
......@@ -381,28 +378,28 @@ typedef struct {
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(
KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
#else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge,
__global const real* restrict global_bornRadii, int needEnergy,
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
GLOBAL const ushort2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = LOCAL_ID - tgx;
mixed energy = 0;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
LOCAL AtomData2 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
......@@ -412,7 +409,7 @@ __kernel void computeGBSAForce1(
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = 0.0f;
real4 force = make_real4(0);
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
......@@ -420,18 +417,17 @@ __kernel void computeGBSAForce1(
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = charge1;
localData[get_local_id(0)].bornRadius = bornRadius1;
localData[LOCAL_ID].x = posq1.x;
localData[LOCAL_ID].y = posq1.y;
localData[LOCAL_ID].z = posq1.z;
localData[LOCAL_ID].q = charge1;
localData[LOCAL_ID].bornRadius = bornRadius1;
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
real charge2 = localData[tbx+j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -459,8 +455,10 @@ __kernel void computeGBSAForce1(
#endif
if (needEnergy)
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
......@@ -473,22 +471,22 @@ __kernel void computeGBSAForce1(
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j];
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
localData[LOCAL_ID].x = tempPosq.x;
localData[LOCAL_ID].y = tempPosq.y;
localData[LOCAL_ID].z = tempPosq.z;
localData[LOCAL_ID].q = charge[j];
localData[LOCAL_ID].bornRadius = global_bornRadii[j];
localData[LOCAL_ID].fx = 0.0f;
localData[LOCAL_ID].fy = 0.0f;
localData[LOCAL_ID].fz = 0.0f;
localData[LOCAL_ID].fw = 0.0f;
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -515,8 +513,10 @@ __kernel void computeGBSAForce1(
#endif
if (needEnergy)
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
......@@ -534,25 +534,25 @@ __kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[offset], (long) (force.w*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (force.w*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w;
if (x != y) {
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw;
forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[LOCAL_ID].fw;
}
#endif
}
......@@ -564,20 +564,20 @@ __kernel void computeGBSAForce1(
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1;
LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[LOCAL_ID] = -1;
while (pos < end) {
real4 force = 0;
real4 force = make_real4(0);
bool includeTile = true;
// Extract the coordinates of this tile.
......@@ -605,10 +605,10 @@ __kernel void computeGBSAForce1(
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[get_local_id(0)] = end;
skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
SYNC_WARPS;
......@@ -630,18 +630,18 @@ __kernel void computeGBSAForce1(
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[get_local_id(0)] = j;
atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j];
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
localData[LOCAL_ID].x = tempPosq.x;
localData[LOCAL_ID].y = tempPosq.y;
localData[LOCAL_ID].z = tempPosq.z;
localData[LOCAL_ID].q = charge[j];
localData[LOCAL_ID].bornRadius = global_bornRadii[j];
localData[LOCAL_ID].fx = 0.0f;
localData[LOCAL_ID].fy = 0.0f;
localData[LOCAL_ID].fz = 0.0f;
localData[LOCAL_ID].fw = 0.0f;
}
SYNC_WARPS;
#ifdef USE_PERIODIC
......@@ -651,15 +651,15 @@ __kernel void computeGBSAForce1(
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
......@@ -681,8 +681,10 @@ __kernel void computeGBSAForce1(
#endif
if (needEnergy)
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
......@@ -702,9 +704,9 @@ __kernel void computeGBSAForce1(
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -731,8 +733,10 @@ __kernel void computeGBSAForce1(
#endif
if (needEnergy)
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
......@@ -745,37 +749,37 @@ __kernel void computeGBSAForce1(
SYNC_WARPS;
}
}
// Write results.
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)];
unsigned int atom2 = atomIndices[LOCAL_ID];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
ATOMIC_ADD(&global_bornForce[atom1], (mm_ulong) ((mm_long) (force.w*0x100000000)));
if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
atom_add(&global_bornForce[atom2], (long) (localData[get_local_id(0)].fw*0x100000000));
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
}
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w;
if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw;
forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[LOCAL_ID].fw;
}
#endif
}
pos++;
}
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
}
......@@ -2,8 +2,8 @@
real invRSquaredOver4 = 0.25f*invR*invR;
real rScaledRadiusJ = r+OBC_PARAMS2.y;
real rScaledRadiusI = r+OBC_PARAMS1.y;
real l_ijJ = RECIP(max(OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
real l_ijI = RECIP(max(OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
real u_ijJ = RECIP(rScaledRadiusJ);
real u_ijI = RECIP(rScaledRadiusI);
real l_ij2J = l_ijJ*l_ijJ;
......@@ -16,12 +16,17 @@
real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
#ifdef SUPPORTS_64_BIT_ATOMICS
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
#else
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
#endif
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif
dEdR += (includeInteraction ? tempdEdR : 0);
dEdR += (includeInteraction ? tempdEdR : (real) 0);
}
......@@ -5,22 +5,21 @@
* Reduce the Born sums to compute the Born radii.
*/
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
KERNEL void reduceBornSum(float alpha, float beta, float gamma,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornSum,
GLOBAL const mm_long* RESTRICT bornSum,
#else
__global const real* restrict bornSum,
GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
#endif
__global const float2* restrict params, __global real* restrict bornRadii, __global real* restrict obcChain) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Get summed Born data
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS
real sum = (1/(real) 0x100000000)*bornSum[index];
real sum = RECIP((real) 0x100000000)*bornSum[index];
#else
real sum = bornSum[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i];
#endif
......@@ -33,12 +32,11 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
real sum3 = sum*sum2;
real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
real radius = 1/(1/offsetRadius - tanhSum/nonOffsetRadius);
real radius = RECIP(RECIP(offsetRadius) - tanhSum/nonOffsetRadius);
real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2);
chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius;
obcChain[index] = chain;
index += get_global_size(0);
}
}
......@@ -46,21 +44,22 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
* Reduce the Born force.
*/
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bornForce,
KERNEL void reduceBornForce(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornForceIn,
GLOBAL mm_long* RESTRICT bornForce,
#else
GLOBAL real* bornForce, int bufferSize, int numBuffers,
#endif
__global mixed* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) {
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
mixed energy = 0;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Sum the Born force
for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Get summed Born force
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS
real force = (1/(real) 0x100000000)*bornForceIn[index];
real force = RECIP((real) 0x100000000)*bornForce[index];
#else
real force = bornForce[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i];
#endif
......@@ -69,13 +68,16 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor
float offsetRadius = params[index].x;
real bornRadius = bornRadii[index];
real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
real ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
real ratio6 = POW((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
force += saTerm/bornRadius;
energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
bornForce[index] = (mm_long) (force*0x100000000);
#else
bornForce[index] = force;
index += get_global_size(0);
#endif
}
energyBuffer[get_global_id(0)] += energy/-6.0f;
energyBuffer[GLOBAL_ID] += energy/-6;
}
\ No newline at end of file
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct {
real x, y, z;
real q;
......@@ -12,27 +8,27 @@ typedef struct {
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(
KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum,
GLOBAL mm_long* RESTRICT global_bornSum,
#else
__global real* restrict global_bornSum,
GLOBAL real* RESTRICT global_bornSum,
#endif
__global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params,
GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
__local AtomData1 localData[TILE_SIZE];
GLOBAL const ushort2* exclusionTiles) {
LOCAL AtomData1 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
......@@ -56,17 +52,17 @@ __kernel void computeBornSum(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0.0f;
real bornSum = 0;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(trimTo3(delta), trimTo3(delta));
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
......@@ -74,7 +70,7 @@ __kernel void computeBornSum(
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -92,9 +88,9 @@ __kernel void computeBornSum(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
......@@ -110,9 +106,9 @@ __kernel void computeBornSum(
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -124,7 +120,7 @@ __kernel void computeBornSum(
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -154,9 +150,9 @@ __kernel void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
......@@ -166,9 +162,9 @@ __kernel void computeBornSum(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (localData[tgx].bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[offset], (mm_long) (localData[tgx].bornSum*0x100000000));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
}
......@@ -182,15 +178,15 @@ __kernel void computeBornSum(
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
LOCAL int atomIndices[TILE_SIZE];
while (pos < end) {
bool includeTile = true;
......@@ -263,15 +259,15 @@ __kernel void computeBornSum(
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -301,9 +297,9 @@ __kernel void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
......@@ -319,9 +315,9 @@ __kernel void computeBornSum(
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -334,7 +330,7 @@ __kernel void computeBornSum(
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
......@@ -364,9 +360,9 @@ __kernel void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
......@@ -382,9 +378,9 @@ __kernel void computeBornSum(
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000));
ATOMIC_ADD(&global_bornSum[atom2], (mm_long) (localData[tgx].bornSum*0x100000000));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
}
......@@ -405,29 +401,29 @@ typedef struct {
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(
KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
#else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge,
__global const real* restrict global_bornRadii, int needEnergy,
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
GLOBAL const ushort2* exclusionTiles) {
mixed energy = 0;
__local AtomData2 localData[TILE_SIZE];
LOCAL AtomData2 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
......@@ -449,14 +445,14 @@ __kernel void computeGBSAForce1(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 force = make_real4(0);
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -485,21 +481,23 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
}
}
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
......@@ -515,14 +513,14 @@ __kernel void computeGBSAForce1(
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 force = make_real4(0);
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -550,8 +548,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x;
localData[j].fy += delta.y;
localData[j].fz += delta.z;
......@@ -562,13 +562,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
......@@ -578,12 +578,12 @@ __kernel void computeGBSAForce1(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000));
ATOMIC_ADD(&forceBuffers[offset], (mm_long) (localData[tgx].fx*0x100000000));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
ATOMIC_ADD(&global_bornForce[offset], (mm_long) (localData[tgx].fw*0x100000000));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
......@@ -602,15 +602,15 @@ __kernel void computeGBSAForce1(
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
LOCAL int atomIndices[TILE_SIZE];
while (pos < end) {
bool includeTile = true;
......@@ -679,15 +679,15 @@ __kernel void computeGBSAForce1(
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 force = make_real4(0);
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
......@@ -709,8 +709,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x;
localData[j].fy += delta.y;
localData[j].fz += delta.z;
......@@ -721,13 +723,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
......@@ -739,14 +741,14 @@ __kernel void computeGBSAForce1(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 force = make_real4(0);
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
float bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0);
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
......@@ -775,8 +777,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x;
localData[j].fy += delta.y;
localData[j].fz += delta.z;
......@@ -787,13 +791,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
......@@ -809,12 +813,12 @@ __kernel void computeGBSAForce1(
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], (long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[atom2], (long) (localData[tgx].fw*0x100000000));
ATOMIC_ADD(&forceBuffers[atom2], (mm_long) (localData[tgx].fx*0x100000000));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
ATOMIC_ADD(&global_bornForce[atom2], (mm_long) (localData[tgx].fw*0x100000000));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
......@@ -827,5 +831,5 @@ __kernel void computeGBSAForce1(
}
pos++;
}
energyBuffer[get_global_id(0)] += energy;
energyBuffer[GLOBAL_ID] += energy;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment