Unverified Commit f55abcaa authored by Evan Pretti's avatar Evan Pretti Committed by GitHub
Browse files

Add constant potential method (#4870)



* Initial implementation of C++ API

* Add kernel interface and information for API generation

* API updates for updating electrode parameters

* Add serialization proxy for ConstantPotentialForce

* Update file headers

* Add CG error tolerance and fix units on getCharges() return value

* Initial implementation of matrix solver

* Fixes and conjugate gradient solver

* Try to fix Linux and Windows builds

* Make sure charge constraint target is on total charge

* Restore handling of exceptions like NonbondedForce since they won't involve electrode atoms

* Ameliorate numerical instability in constrained conjugate gradient

* Fix uninitialized pointers, memory leak, and style

* Set CG tolerance units in Python API

* Test ConstantPotentialForce serialization

* Read/write ExceptionsUsePeriodicBoundaryConditions as bool

* Improve constrained conjugate gradient robustness to roundoff error accumulation

* Recompute matrix if electrode atoms move due to setPositions()

* Tolerance is now in gradient (potential) units again

* Add neutralizing background correction

* Add Python API tests

* Fixes for CG and nonbonded exceptions

* Add initial tests checking against existing NonbondedForce behavior

* Expand test suite and fix some implementation issues

* Add additional tests using larger reference system

* Add Gaussian test

* Finish test against reference computation

* CPU platform implementation

* Fixes for compilation on some platforms

* Fixes for constant potential with AVX/AVX2

* Test linking CPU PME library to constant potential test directly

* Older SWIG versions don't support Python set to C++ set conversion

* Add user guide entry

* Increase speed of reference test

* Conditional building constant potential CPU test is unreliable

* Debugging

* Miscellaneous fixes and improvements for CI

* Cache charges so solver will not run if system and coordinates have not changed

* Preconditioner flag, stability, and automatic detection improvements

* Add GPU platform-specific constant potential kernel classes

* PME and device-host I/O changes to support constant potential

* Initial common constant potential implementation

* Constant potential fixes:

* Fix preconditioner PME position/charge save/restore logic

* Fix reduction synchronization in constant potential solver kernels

* Add double-float accumulation for conjugate gradient solver when
  double unsupported by hardware

* Improve conditioning of a test system, and make sure particles are in or
out of cutoff for consistency and ease of comparing between platforms

* Reorder guess charges for CG when atom reordering changes positions

* Remove PME queue for now

* Trying to debug optimized direct space derivative kernel

* Remove extraneous debugging lines

* Style updates; just make CPU preconditioner double precision

* Debugging updated optimized direct derivatives kernel for all but OpenCL CPU

* OpenCL CPU implementation of direct space derivatives, and cleanup

* Try to make test even shorter to not time out on CI

* Temporary - Debugging

* Debugging

* Debugging

* Debugging

* Debugging

* Remove debugging code and fix reduction synchronization

* Fix other reductions

* Debugging - are tests hanging or just slow on CI?

* Debugging

* Debugging

* Fix macro for case when double precision is available on hardware

* Remove changes for debugging again

* Try to improve matrix solver cache locality by uploading transpose

* Fixes for atom ordering and periodic images

* Can't rely on reorder listener for cell offset updates

* Test reducing number of contexts and timing for CI

* Debugging

* Remove timing code and revert debugging changes

* Matrix solver and plasma term optimizations

* Reduce CG solver kernel calls and downloads

* Don't read back convergence flag from global memory

* Update PME due to refactoring in master branch

* Faster matrix solver (1st step)

* Faster matrix solver for CUDA

* Faster matrix solver compatibility with non-CUDA platforms

* Matrix solver fixes

* Use warp shuffle reductions when possible

* Attempt to work around intermittent compiler crash in Intel CPU OpenCL

* Optimize CG solver kernel 1

* Rework CG solver so some kernels can use more than 1 block

* Don't run out of shared memory

* Asynchronously download convergence flag while clearing buffers

---------
Co-authored-by: default avatarEvan Pretti <pretti@sh03-17n15.int>
parent 0ad62341
#define WARP_SIZE 32
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down_sync(0xffffffff, local, offset)
#elif defined(USE_HIP)
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down(local, offset)
#endif
#ifdef WARP_SHUFFLE_DOWN
#define TEMP_SIZE WARP_SIZE
#else
#define TEMP_SIZE THREAD_BLOCK_SIZE
#endif
#ifdef USE_HIP
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real x, y, z, q, width, derivative;
int ii;
} AtomData;
DEVICE real reduceValue(real value, LOCAL_ARG volatile real* temp) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value = lane < warpCount ? temp[lane] : 0;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_THREADS;
}
#endif
return temp[0];
}
KERNEL void updateNonElectrodeCharges(
GLOBAL real4* RESTRICT posq,
GLOBAL real* RESTRICT charges,
GLOBAL real* RESTRICT nonElectrodeCharges,
GLOBAL int* RESTRICT sysElec
) {
for (int i = GLOBAL_ID; i < NUM_PARTICLES; i += GLOBAL_SIZE) {
if (sysElec[i] == -1) {
#ifdef USE_POSQ_CHARGES
posq[i].w = nonElectrodeCharges[i];
#else
charges[i] = nonElectrodeCharges[i];
#endif
}
}
}
KERNEL void updateElectrodeCharges(
GLOBAL real4* RESTRICT posq,
GLOBAL real* RESTRICT charges,
GLOBAL real* RESTRICT electrodeCharges,
GLOBAL int* RESTRICT elecToSys
) {
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
#ifdef USE_POSQ_CHARGES
posq[elecToSys[ii]].w = electrodeCharges[ii];
#else
charges[elecToSys[ii]] = electrodeCharges[ii];
#endif
}
}
KERNEL void getTotalCharge(
GLOBAL real4* RESTRICT posq,
GLOBAL real* RESTRICT charges,
GLOBAL real* RESTRICT totalChargeResult
) {
// This kernel expects to be executed in a single thread block.
LOCAL volatile real temp[TEMP_SIZE];
real totalCharge = 0;
for (int i = LOCAL_ID; i < NUM_PARTICLES; i += LOCAL_SIZE) {
#ifdef USE_POSQ_CHARGES
totalCharge += posq[i].w;
#else
totalCharge += charges[i];
#endif
}
totalCharge = reduceValue(totalCharge, temp);
if (LOCAL_ID == 0) {
totalChargeResult[0] = totalCharge;
}
}
KERNEL void evaluateSelfEnergyForces(
GLOBAL real4* RESTRICT posq,
GLOBAL real* RESTRICT charges,
GLOBAL int* RESTRICT sysElec,
GLOBAL real4* RESTRICT electrodeParams,
GLOBAL real* RESTRICT totalCharge,
GLOBAL int4* RESTRICT posCellOffsets,
real4 periodicBoxVecX,
real4 periodicBoxVecY,
real4 periodicBoxVecZ,
real4 externalField,
GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL mm_ulong* RESTRICT forceBuffers
) {
if (GLOBAL_ID == 0) {
energyBuffer[0] -= PLASMA_SCALE * totalCharge[0] * totalCharge[0] / (periodicBoxVecX.x * periodicBoxVecY.y * periodicBoxVecZ.z * EWALD_ALPHA * EWALD_ALPHA);
}
for (int i = GLOBAL_ID; i < NUM_PARTICLES; i += GLOBAL_SIZE) {
const real4 pos = posq[i];
const int4 offset = posCellOffsets[i];
#ifdef USE_POSQ_CHARGES
const real charge = posq[i].w;
#else
const real charge = charges[i];
#endif
const real4 params = electrodeParams[sysElec[i] + 1];
const real4 posOffset = pos - offset.x * periodicBoxVecX - offset.y * periodicBoxVecY - offset.z * periodicBoxVecZ;
const real fieldTerm = posOffset.x * externalField.x + posOffset.y * externalField.y + posOffset.z * externalField.z;
energyBuffer[GLOBAL_ID] += charge * (charge * params.w - params.x - fieldTerm);
forceBuffers[i] += (mm_ulong) realToFixedPoint(charge * externalField.x);
forceBuffers[i + PADDED_NUM_ATOMS] += (mm_ulong) realToFixedPoint(charge * externalField.y);
forceBuffers[i + 2 * PADDED_NUM_ATOMS] += (mm_ulong) realToFixedPoint(charge * externalField.z);
}
}
DEVICE real evaluateDirectDerivative(real r2, real width1, real width2) {
const real r = SQRT(r2);
const real alphaR = EWALD_ALPHA * r;
const real etaR = r / SQRT(width1 * width1 + width2 * width2);
#ifdef USE_DOUBLE_PRECISION
const real erfcAlphaR = erfc(alphaR);
const real erfcEtaR = erfc(etaR);
#else
const real expAlphaRSqr = EXP(-alphaR * alphaR);
const real expEtaRSqr = EXP(-etaR * etaR);
const real tAlpha = RECIP(1.0f+0.3275911f*alphaR);
const real tEta = RECIP(1.0f+0.3275911f*etaR);
const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*tAlpha)*tAlpha)*tAlpha)*tAlpha)*tAlpha*expAlphaRSqr;
const real erfcEtaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*tEta)*tEta)*tEta)*tEta)*tEta*expEtaRSqr;
#endif
return ONE_4PI_EPS0 * (erfcAlphaR - erfcEtaR) / r;
}
KERNEL void evaluateDirectDerivatives(
GLOBAL const real4* RESTRICT posq,
GLOBAL real* RESTRICT charges,
GLOBAL int* RESTRICT sysToElec,
GLOBAL int* RESTRICT sysElec,
GLOBAL real4* RESTRICT electrodeParams,
GLOBAL mm_ulong* RESTRICT chargeDerivativesFixed,
real4 periodicBoxSize,
real4 invPeriodicBoxSize,
real4 periodicBoxVecX,
real4 periodicBoxVecY,
real4 periodicBoxVecZ,
GLOBAL const int2* RESTRICT exclusionTiles,
GLOBAL const int* RESTRICT tiles,
GLOBAL const unsigned int* RESTRICT interactionCount,
GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize,
GLOBAL const int* RESTRICT interactingAtoms,
unsigned int maxTiles
) {
#ifndef DEVICE_IS_CPU
// GPU-specific direct derivative calculation kernel.
const unsigned int totalWarps = GLOBAL_SIZE / TILE_SIZE;
const unsigned int warp = GLOBAL_ID / TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE - 1);
const unsigned int tbx = LOCAL_ID - tgx;
LOCAL AtomData localData[WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions. Exclusions cannot
// involve electrode particles, so only self-interactions need be excluded.
const unsigned int firstExclusionTile = warp * NUM_EXCLUSION_TILES / totalWarps;
const unsigned int lastExclusionTile = (warp + 1) * NUM_EXCLUSION_TILES / totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
const unsigned int atom1 = x * TILE_SIZE + tgx;
const real4 posq1 = posq[atom1];
#ifdef USE_POSQ_CHARGES
const real charge1 = posq1.w;
#else
const real charge1 = charges[atom1];
#endif
const real width1 = electrodeParams[sysElec[atom1] + 1].y;
const int ii1 = sysToElec[atom1];
real derivative = 0;
if (x == y) {
// This tile is on the diagonal.
localData[LOCAL_ID].x = posq1.x;
localData[LOCAL_ID].y = posq1.y;
localData[LOCAL_ID].z = posq1.z;
localData[LOCAL_ID].q = charge1;
localData[LOCAL_ID].width = width1;
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (ii1 != -1 && atom1 < NUM_PARTICLES && y * TILE_SIZE + j < NUM_PARTICLES && atom1 != y * TILE_SIZE + j) {
const real3 pos2 = make_real3(localData[tbx + j].x, localData[tbx + j].y, localData[tbx + j].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
APPLY_PERIODIC_TO_DELTA(delta)
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
derivative += localData[tbx + j].q * evaluateDirectDerivative(r2, width1, localData[tbx + j].width);
}
}
SYNC_WARPS;
}
}
else {
// This is an off-diagonal tile.
unsigned int j = y * TILE_SIZE + tgx;
const real4 posq2 = posq[j];
localData[LOCAL_ID].x = posq2.x;
localData[LOCAL_ID].y = posq2.y;
localData[LOCAL_ID].z = posq2.z;
#ifdef USE_POSQ_CHARGES
localData[LOCAL_ID].q = posq2.w;
#else
localData[LOCAL_ID].q = charges[j];
#endif
localData[LOCAL_ID].width = electrodeParams[sysElec[j] + 1].y;
localData[LOCAL_ID].derivative = 0;
localData[LOCAL_ID].ii = sysToElec[j];
SYNC_WARPS;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_PARTICLES && y * TILE_SIZE + tj < NUM_PARTICLES && (ii1 != -1 || localData[tbx + tj].ii != -1)) {
const real3 pos2 = make_real3(localData[tbx + tj].x, localData[tbx + tj].y, localData[tbx + tj].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
APPLY_PERIODIC_TO_DELTA(delta)
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
const real derivativeScale = evaluateDirectDerivative(r2, width1, localData[tbx + tj].width);
derivative += localData[tbx + tj].q * derivativeScale;
localData[tbx + tj].derivative += charge1 * derivativeScale;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
// Write results.
if (ii1 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii1], (mm_ulong) realToFixedPoint(derivative));
}
if (x != y) {
const int ii2 = localData[LOCAL_ID].ii;
if (ii2 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].derivative));
}
}
}
// Second loop: process tiles without exclusions.
const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) {
// There wasn't enough memory for the neighbor list.
return;
}
int pos = (int) (warp * ((mm_long) numTiles) / totalWarps);
const int end = (int) ((warp + 1) * ((mm_long) numTiles) / totalWarps);
LOCAL int atomIndices[WORK_GROUP_SIZE];
while (pos < end) {
real derivative = 0;
// Extract the coordinates of this tile.
const int x = tiles[pos];
const real4 blockSizeX = blockSize[x];
const bool singlePeriodicCopy = (0.5f * periodicBoxSize.x - blockSizeX.x >= CUTOFF &&
0.5f * periodicBoxSize.y - blockSizeX.y >= CUTOFF &&
0.5f * periodicBoxSize.z - blockSizeX.z >= CUTOFF);
const unsigned int atom1 = x * TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
#ifdef USE_POSQ_CHARGES
const real charge1 = posq1.w;
#else
const real charge1 = charges[atom1];
#endif
const real width1 = electrodeParams[sysElec[atom1] + 1].y;
const int ii1 = sysToElec[atom1];
unsigned int j = interactingAtoms[pos * TILE_SIZE + tgx];
atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) {
const real4 posq2 = posq[j];
localData[LOCAL_ID].x = posq2.x;
localData[LOCAL_ID].y = posq2.y;
localData[LOCAL_ID].z = posq2.z;
#ifdef USE_POSQ_CHARGES
localData[LOCAL_ID].q = posq2.w;
#else
localData[LOCAL_ID].q = charges[j];
#endif
localData[LOCAL_ID].width = electrodeParams[sysElec[j] + 1].y;
localData[LOCAL_ID].derivative = 0;
localData[LOCAL_ID].ii = sysToElec[j];
}
SYNC_WARPS;
if (singlePeriodicCopy) {
// The box is small enough; we can translate atoms and avoid having
// to apply periodic boundary conditions to every interaction later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
const int atom2 = atomIndices[tbx + tj];
if (atom1 < NUM_PARTICLES && atom2 < NUM_PARTICLES && (ii1 != -1 || localData[tbx + tj].ii != -1)) {
const real3 pos2 = make_real3(localData[tbx + tj].x, localData[tbx + tj].y, localData[tbx + tj].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
const real derivativeScale = evaluateDirectDerivative(r2, width1, localData[tbx + tj].width);
derivative += localData[tbx + tj].q * derivativeScale;
localData[tbx + tj].derivative += charge1 * derivativeScale;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
else {
// We must apply periodic boundary conditions to every interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
const int atom2 = atomIndices[tbx + tj];
if (atom1 < NUM_PARTICLES && atom2 < NUM_PARTICLES && (ii1 != -1 || localData[tbx + tj].ii != -1)) {
const real3 pos2 = make_real3(localData[tbx + tj].x, localData[tbx + tj].y, localData[tbx + tj].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
APPLY_PERIODIC_TO_DELTA(delta)
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
const real derivativeScale = evaluateDirectDerivative(r2, width1, localData[tbx + tj].width);
derivative += localData[tbx + tj].q * derivativeScale;
localData[tbx + tj].derivative += charge1 * derivativeScale;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
// Write results.
if (ii1 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii1], (mm_ulong) realToFixedPoint(derivative));
}
if (atomIndices[LOCAL_ID] < PADDED_NUM_ATOMS) {
const int ii2 = localData[LOCAL_ID].ii;
if (ii2 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].derivative));
}
}
pos++;
}
#else
// CPU-specific direct derivative calculation kernel.
LOCAL AtomData localData[TILE_SIZE];
// First loop: process tiles that contain exclusions. Exclusions cannot
// involve electrode particles, so only self-interactions need be excluded.
const unsigned int firstExclusionTile = GROUP_ID * NUM_EXCLUSION_TILES / NUM_GROUPS;
const unsigned int lastExclusionTile = (GROUP_ID + 1) * NUM_EXCLUSION_TILES / NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
// Load atom data for this tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
const unsigned int j = y * TILE_SIZE + tgx;
const real4 posq2 = posq[j];
localData[tgx].x = posq2.x;
localData[tgx].y = posq2.y;
localData[tgx].z = posq2.z;
#ifdef USE_POSQ_CHARGES
localData[tgx].q = posq2.w;
#else
localData[tgx].q = charges[j];
#endif
localData[tgx].width = electrodeParams[sysElec[j] + 1].y;
localData[tgx].ii = sysToElec[j];
}
if (x == y) {
// This tile is on the diagonal.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
const unsigned int atom1 = x * TILE_SIZE + tgx;
const int ii1 = sysToElec[atom1];
if (ii1 == -1) {
continue;
}
const real4 posq1 = posq[atom1];
#ifdef USE_POSQ_CHARGES
const real charge1 = posq1.w;
#else
const real charge1 = charges[atom1];
#endif
const real width1 = electrodeParams[sysElec[atom1] + 1].y;
real derivative = 0;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_PARTICLES && y * TILE_SIZE + j < NUM_PARTICLES && atom1 != y * TILE_SIZE + j) {
const real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
APPLY_PERIODIC_TO_DELTA(delta)
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
derivative += localData[j].q * evaluateDirectDerivative(r2, width1, localData[j].width);
}
}
}
// Write results.
ATOMIC_ADD(&chargeDerivativesFixed[ii1], (mm_ulong) realToFixedPoint(derivative));
}
}
else {
// This is an off-diagonal tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].derivative = 0;
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
const unsigned int atom1 = x * TILE_SIZE + tgx;
const real4 posq1 = posq[atom1];
#ifdef USE_POSQ_CHARGES
const real charge1 = posq1.w;
#else
const real charge1 = charges[atom1];
#endif
const real width1 = electrodeParams[sysElec[atom1] + 1].y;
const int ii1 = sysToElec[atom1];
real derivative = 0;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_PARTICLES && y * TILE_SIZE + j < NUM_PARTICLES && (ii1 != -1 || localData[j].ii != -1)) {
const real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
APPLY_PERIODIC_TO_DELTA(delta)
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
const real derivativeScale = evaluateDirectDerivative(r2, width1, localData[j].width);
derivative += localData[j].q * derivativeScale;
localData[j].derivative += charge1 * derivativeScale;
}
}
}
// Write results for "1" atoms.
if (ii1 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii1], (mm_ulong) realToFixedPoint(derivative));
}
}
// Write results for "2" atoms.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
const int ii2 = localData[tgx].ii;
if (ii2 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii2], (mm_ulong) realToFixedPoint(localData[tgx].derivative));
}
}
}
}
// Second loop: process tiles without exclusions.
const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) {
// There wasn't enough memory for the neighbor list.
return;
}
int pos = (int) (GROUP_ID * ((mm_long) numTiles) / NUM_GROUPS);
const int end = (int) ((GROUP_ID + 1) * ((mm_long) numTiles) / NUM_GROUPS);
LOCAL int atomIndices[TILE_SIZE];
while (pos < end) {
real derivative = 0;
// Extract the coordinates of this tile.
const int x = tiles[pos];
const real4 blockSizeX = blockSize[x];
const bool singlePeriodicCopy = (0.5f * periodicBoxSize.x - blockSizeX.x >= CUTOFF &&
0.5f * periodicBoxSize.y - blockSizeX.y >= CUTOFF &&
0.5f * periodicBoxSize.z - blockSizeX.z >= CUTOFF);
// Load atom data for this tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
const unsigned int j = interactingAtoms[pos * TILE_SIZE + tgx];
atomIndices[tgx] = j;
if (j < PADDED_NUM_ATOMS) {
const real4 posq2 = posq[j];
localData[tgx].x = posq2.x;
localData[tgx].y = posq2.y;
localData[tgx].z = posq2.z;
#ifdef USE_POSQ_CHARGES
localData[tgx].q = posq2.w;
#else
localData[tgx].q = charges[j];
#endif
localData[tgx].width = electrodeParams[sysElec[j] + 1].y;
localData[tgx].derivative = 0;
localData[tgx].ii = sysToElec[j];
}
}
if (singlePeriodicCopy) {
// The box is small enough; we can translate atoms and avoid having
// to apply periodic boundary conditions to every interaction later.
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
const unsigned int atom1 = x * TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
#ifdef USE_POSQ_CHARGES
const real charge1 = posq1.w;
#else
const real charge1 = charges[atom1];
#endif
const real width1 = electrodeParams[sysElec[atom1] + 1].y;
const int ii1 = sysToElec[atom1];
real derivative = 0;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
const int atom2 = atomIndices[j];
if (atom1 < NUM_PARTICLES && atom2 < NUM_PARTICLES && (ii1 != -1 || localData[j].ii != -1)) {
const real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
const real derivativeScale = evaluateDirectDerivative(r2, width1, localData[j].width);
derivative += localData[j].q * derivativeScale;
localData[j].derivative += charge1 * derivativeScale;
}
}
}
// Write results for "1" atoms.
if (ii1 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii1], (mm_ulong) realToFixedPoint(derivative));
}
}
}
else {
// We must apply periodic boundary conditions to every interaction.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
const unsigned int atom1 = x * TILE_SIZE + tgx;
const real4 posq1 = posq[atom1];
#ifdef USE_POSQ_CHARGES
const real charge1 = posq1.w;
#else
const real charge1 = charges[atom1];
#endif
const real width1 = electrodeParams[sysElec[atom1] + 1].y;
const int ii1 = sysToElec[atom1];
real derivative = 0;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
const int atom2 = atomIndices[j];
if (atom1 < NUM_PARTICLES && atom2 < NUM_PARTICLES && (ii1 != -1 || localData[j].ii != -1)) {
const real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real3 delta = make_real3(pos2.x - posq1.x, pos2.y - posq1.y, pos2.z - posq1.z);
APPLY_PERIODIC_TO_DELTA(delta)
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
if (r2 < CUTOFF_SQUARED) {
const real derivativeScale = evaluateDirectDerivative(r2, width1, localData[j].width);
derivative += localData[j].q * derivativeScale;
localData[j].derivative += charge1 * derivativeScale;
}
}
}
// Write results for "1" atoms.
if (ii1 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii1], (mm_ulong) realToFixedPoint(derivative));
}
}
}
// Write results for "2" atoms.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if (atomIndices[tgx] < PADDED_NUM_ATOMS) {
const int ii2 = localData[tgx].ii;
if (ii2 != -1) {
ATOMIC_ADD(&chargeDerivativesFixed[ii2], (mm_ulong) realToFixedPoint(localData[tgx].derivative));
}
}
}
pos++;
}
#endif
}
KERNEL void finishDerivatives(
GLOBAL real4* RESTRICT posq,
GLOBAL real* RESTRICT charges,
GLOBAL int* RESTRICT elecToSys,
GLOBAL int* RESTRICT elecElec,
GLOBAL real4* RESTRICT electrodeParams,
GLOBAL real* RESTRICT totalCharge,
GLOBAL int4* RESTRICT posCellOffsets,
real4 periodicBoxVecX,
real4 periodicBoxVecY,
real4 periodicBoxVecZ,
real4 externalField,
GLOBAL real* RESTRICT chargeDerivatives,
GLOBAL mm_long* RESTRICT chargeDerivativesFixed
) {
const real fixedScale = 1 / (real) 0x100000000;
const real plasmaScale = PLASMA_SCALE * totalCharge[0] / (periodicBoxVecX.x * periodicBoxVecY.y * periodicBoxVecZ.z * EWALD_ALPHA * EWALD_ALPHA);
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
int i = elecToSys[ii];
const real4 pos = posq[i];
const int4 offset = posCellOffsets[i];
#ifdef USE_POSQ_CHARGES
const real charge = pos.w;
#else
const real charge = charges[i];
#endif
const real4 params = electrodeParams[elecElec[ii] + 1];
const real4 posOffset = pos - offset.x * periodicBoxVecX - offset.y * periodicBoxVecY - offset.z * periodicBoxVecZ;
const real fieldTerm = posOffset.x * externalField.x + posOffset.y * externalField.y + posOffset.z * externalField.z;
chargeDerivatives[ii] += 2 * (charge * params.w - plasmaScale) - params.x - fieldTerm + fixedScale * chargeDerivativesFixed[ii];
}
}
#define WARP_SIZE 32
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down_sync(0xffffffff, local, offset)
#elif defined(USE_HIP)
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down(local, offset)
#endif
#ifdef WARP_SHUFFLE_DOWN
#define TEMP_SIZE WARP_SIZE
#else
#define TEMP_SIZE THREAD_BLOCK_SIZE
#endif
typedef struct {
real gradStepSq, qStepGradStep, qStepGrad, q, qStep;
} BlockSums1;
typedef struct {
real projGradSq, precGradStep, precGrad;
} BlockSums2;
// Sum value from each thread (using temp). Use real type variables (float on
// single and mixed precision modes, double on double precision mode).
DEVICE real reduceReal(real value, LOCAL_ARG volatile real* temp) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value = lane < warpCount ? temp[lane] : 0;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_THREADS;
}
#endif
return temp[0];
}
// Performs the equivalent of reduceReal() on 5 values simultaneously.
DEVICE BlockSums1 reduceBlockSums1(BlockSums1 value, LOCAL_ARG BlockSums1* temp) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value.gradStepSq += WARP_SHUFFLE_DOWN(value.gradStepSq, step);
value.qStepGradStep += WARP_SHUFFLE_DOWN(value.qStepGradStep, step);
value.qStepGrad += WARP_SHUFFLE_DOWN(value.qStepGrad, step);
value.q += WARP_SHUFFLE_DOWN(value.q, step);
value.qStep += WARP_SHUFFLE_DOWN(value.qStep, step);
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value.gradStepSq = value.qStepGradStep = value.qStepGrad = value.q = value.qStep = 0;
if (lane < warpCount) {
value = temp[lane];
}
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value.gradStepSq += WARP_SHUFFLE_DOWN(value.gradStepSq, step);
value.qStepGradStep += WARP_SHUFFLE_DOWN(value.qStepGradStep, step);
value.qStepGrad += WARP_SHUFFLE_DOWN(value.qStepGrad, step);
value.q += WARP_SHUFFLE_DOWN(value.q, step);
value.qStep += WARP_SHUFFLE_DOWN(value.qStep, step);
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread].gradStepSq += temp[thread + step].gradStepSq;
temp[thread].qStepGradStep += temp[thread + step].qStepGradStep;
temp[thread].qStepGrad += temp[thread + step].qStepGrad;
temp[thread].q += temp[thread + step].q;
temp[thread].qStep += temp[thread + step].qStep;
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread].gradStepSq += temp[thread + step].gradStepSq;
temp[thread].qStepGradStep += temp[thread + step].qStepGradStep;
temp[thread].qStepGrad += temp[thread + step].qStepGrad;
temp[thread].q += temp[thread + step].q;
temp[thread].qStep += temp[thread + step].qStep;
}
SYNC_THREADS;
}
#endif
return temp[0];
}
// Performs the equivalent of reduceReal() on 3 values simultaneously.
DEVICE BlockSums2 reduceBlockSums2(BlockSums2 value, LOCAL_ARG BlockSums2* temp) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value.projGradSq += WARP_SHUFFLE_DOWN(value.projGradSq, step);
value.precGradStep += WARP_SHUFFLE_DOWN(value.precGradStep, step);
value.precGrad += WARP_SHUFFLE_DOWN(value.precGrad, step);
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value.projGradSq = value.precGradStep = value.precGrad = 0;
if (lane < warpCount) {
value = temp[lane];
}
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value.projGradSq += WARP_SHUFFLE_DOWN(value.projGradSq, step);
value.precGradStep += WARP_SHUFFLE_DOWN(value.precGradStep, step);
value.precGrad += WARP_SHUFFLE_DOWN(value.precGrad, step);
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread].projGradSq += temp[thread + step].projGradSq;
temp[thread].precGradStep += temp[thread + step].precGradStep;
temp[thread].precGrad += temp[thread + step].precGrad;
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread].projGradSq += temp[thread + step].projGradSq;
temp[thread].precGradStep += temp[thread + step].precGradStep;
temp[thread].precGrad += temp[thread + step].precGrad;
}
SYNC_THREADS;
}
#endif
return temp[0];
}
// We need more than single precision for accumulation regardless of the mode
// selected, so use double if double precision is supported, and otherwise use
// double-float arithmetic. In the latter case, use float2 with .x storing the
// "high" part and .y storing the "low" part of each double-float number.
#ifdef SUPPORTS_DOUBLE_PRECISION
#define ACCUM double
#define ACCUM_ZERO 0.0
// Perform accum = accum + real.
#define ACCUM_ADD(x, y) ((x) + (ACCUM) (y))
// Perform real = real + accum.
#define ACCUM_APPLY(x, y) ((real) ((ACCUM) (x) + (y)))
// Perform real = accum * real.
#define ACCUM_MUL(x, y) ((real) ((x) * (ACCUM) (y)))
// Perform accum = (accum * real) + accum.
#define ACCUM_MUL_ADD(x, y, z) (((x) * (ACCUM) (y)) + (z))
// Perform real = (real + accum) * accum.
#define ACCUM_ADD_MUL(x, y, z) ((real) (((ACCUM) (x) + (y)) * (z)))
// Sum value from each thread (using temp) and return (sum + offset) * scale.
DEVICE ACCUM reduceAccum(ACCUM value, LOCAL_ARG volatile ACCUM* temp, real offset, real scale) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value = lane < warpCount ? temp[lane] : 0;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_THREADS;
}
#endif
return (temp[0] + offset) * scale;
}
#else
#define ACCUM float2
#define ACCUM_ZERO make_float2(0.0f, 0.0f)
#define ACCUM_ADD(x, y) compensatedAdd2(x, y)
#define ACCUM_APPLY(x, y) compensatedAdd1(y, x)
#define ACCUM_MUL(x, y) compensatedMultiply1(x, y)
#define ACCUM_MUL_ADD(x, y, z) compensatedAdd3(compensatedMultiply2(x, y), z)
#define ACCUM_ADD_MUL(x, y, z) compensatedMultiply3(compensatedAdd2(y, x), z)
// For details of the compensated addition and multiplication implemented, see
// Joldes et al., ACM Trans. Math. Softw. 2017, 44, 15res (DOI: 10.1145/3121432).
// float + float -> float2, only valid if the floating-point exponent of a is
// not less than that of b.
DEVICE inline float2 compensatedAddKernel1(float a, float b) {
float s = a + b;
return make_float2(s, b - (s - a));
}
// float + float -> float2, valid for any inputs.
DEVICE inline float2 compensatedAddKernel2(float a, float b) {
float s = a + b;
float c = s - b;
float d = s - c;
return make_float2(s, (a - c) + (b - d));
}
// float * float -> float2.
DEVICE inline float2 compensatedMultiplyKernel(float a, float b) {
float c = a * b;
return make_float2(c, FMA(a, b, -c));
}
// float2 + float -> float. Like compensatedAdd2, but only computes the high
// part of the result.
DEVICE inline float compensatedAdd1(float2 x, float y) {
float2 s = compensatedAddKernel2(x.x, y);
return s.x + (x.y + s.y);
}
// float2 + float -> float2, with a relative error of 2^-47.
DEVICE inline float2 compensatedAdd2(float2 x, float y) {
float2 s = compensatedAddKernel2(x.x, y);
float v = x.y + s.y;
return compensatedAddKernel1(s.x, v);
}
// float2 + float2 -> float2, with a relative error of 2^-46.
DEVICE inline float2 compensatedAdd3(float2 x, float2 y) {
float2 s = compensatedAddKernel2(x.x, y.x);
float2 t = compensatedAddKernel2(x.y, y.y);
float c = s.y + t.x;
float2 v = compensatedAddKernel1(s.x, c);
float w = t.y + v.y;
return compensatedAddKernel1(v.x, w);
}
// float2 * float -> float. Like compensatedMultiply2, but only computes the
// high part of the result.
DEVICE inline float compensatedMultiply1(float2 x, float y) {
float c = x.x * y;
return c + (FMA(x.x, y, -c) + x.y * y);
}
// float2 * float -> float2, with a relative error of 2^-47.
DEVICE inline float2 compensatedMultiply2(float2 x, float y) {
float c = x.x * y;
return compensatedAddKernel1(c, FMA(x.x, y, -c) + x.y * y);
}
// float2 * float2 -> float.
DEVICE inline float compensatedMultiply3(float2 x, float2 y) {
float2 c = compensatedMultiplyKernel(x.x, y.x);
return c.x + (c.y + FMA(x.y, y.x, FMA(x.x, y.y, x.y * y.y)));
}
// Sum value from each thread (using temp) and return (sum + offset) * scale.
DEVICE ACCUM reduceAccum(ACCUM value, LOCAL_ARG volatile ACCUM* temp, real offset, real scale) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value = compensatedAdd3(value, WARP_SHUFFLE_DOWN(value, step));
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value = lane < warpCount ? temp[lane] : ACCUM_ZERO;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value = compensatedAdd3(value, WARP_SHUFFLE_DOWN(value, step));
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] = compensatedAdd3(temp[thread], temp[thread + step]);
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] = compensatedAdd3(temp[thread], temp[thread + step]);
}
SYNC_THREADS;
}
#endif
return compensatedMultiply2(compensatedAdd2(temp[0], offset), scale);
}
#endif
KERNEL void solveInitializeStep1(GLOBAL real* RESTRICT electrodeCharges, GLOBAL real* RESTRICT qLast
#ifdef USE_CHARGE_CONSTRAINT
, real chargeTarget
#endif
) {
// This kernel expects to be executed in a single thread block.
#ifdef USE_CHARGE_CONSTRAINT
LOCAL volatile ACCUM tempAccum[TEMP_SIZE];
#endif
// Set initial guess charges as linear extrapolations from the current and
// previous charges fed through the solver, and save the current charges as
// the previous charges.
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
const real qGuess = electrodeCharges[ii];
electrodeCharges[ii] = 2 * qGuess - qLast[ii];
qLast[ii] = qGuess;
}
#ifdef USE_CHARGE_CONSTRAINT
// Ensure that initial guess charges satisfy the constraint.
ACCUM offsetAccum = ACCUM_ZERO;
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
offsetAccum = ACCUM_ADD(offsetAccum, -electrodeCharges[ii]);
}
const ACCUM offset = reduceAccum(offsetAccum, tempAccum, chargeTarget, 1 / (real) NUM_ELECTRODE_PARTICLES);
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
electrodeCharges[ii] = ACCUM_APPLY(electrodeCharges[ii], offset);
}
#endif
}
KERNEL void solveInitializeStep2(GLOBAL real* RESTRICT chargeDerivatives, GLOBAL real* RESTRICT grad, GLOBAL real* RESTRICT projGrad, GLOBAL int* RESTRICT convergedResult) {
// This kernel expects to be executed in a single thread block.
LOCAL volatile real temp[TEMP_SIZE];
#ifdef USE_CHARGE_CONSTRAINT
LOCAL volatile ACCUM tempAccum[TEMP_SIZE];
#endif
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
grad[ii] = chargeDerivatives[ii];
}
#ifdef USE_CHARGE_CONSTRAINT
// Project the initial gradient without preconditioning.
ACCUM offsetAccum = ACCUM_ZERO;
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
offsetAccum = ACCUM_ADD(offsetAccum, grad[ii]);
}
const ACCUM offset = reduceAccum(offsetAccum, tempAccum, 0, -1 / (real) NUM_ELECTRODE_PARTICLES);
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
projGrad[ii] = ACCUM_APPLY(grad[ii], offset);
}
#else
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
projGrad[ii] = grad[ii];
}
#endif
// Check for convergence at the initial guess charges.
real error = 0;
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
error += projGrad[ii] * projGrad[ii];
}
error = reduceReal(error, temp);
if (LOCAL_ID == 0) {
convergedResult[0] = (int) (error <= ERROR_TARGET);
}
}
KERNEL void solveInitializeStep3(GLOBAL real* RESTRICT electrodeCharges, GLOBAL real* RESTRICT chargeDerivatives, GLOBAL real* RESTRICT grad, GLOBAL real* RESTRICT projGrad,
GLOBAL real* RESTRICT precGrad, GLOBAL real* RESTRICT qStep, GLOBAL real* RESTRICT grad0
#ifdef PRECOND_REQUESTED
, GLOBAL ACCUM* RESTRICT precondVector, int precondActivated
#endif
) {
// This kernel expects to be executed in a single thread block.
#if defined(PRECOND_REQUESTED) && defined(USE_CHARGE_CONSTRAINT)
LOCAL volatile ACCUM tempAccum[TEMP_SIZE];
#endif
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
grad0[ii] = chargeDerivatives[ii];
}
#ifdef PRECOND_REQUESTED
// Project the initial gradient with preconditioning.
if (precondActivated) {
#ifdef USE_CHARGE_CONSTRAINT
ACCUM offsetAccum = ACCUM_ZERO;
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
offsetAccum = ACCUM_MUL_ADD(precondVector[ii], grad[ii], offsetAccum);
}
const ACCUM offset = reduceAccum(offsetAccum, tempAccum, 0, -1);
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
precGrad[ii] = ACCUM_ADD_MUL(grad[ii], offset, precondVector[ii]);
}
#else
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
precGrad[ii] = ACCUM_MUL(precondVector[ii], grad[ii]);
}
#endif
}
else {
#endif
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
precGrad[ii] = projGrad[ii];
}
#ifdef PRECOND_REQUESTED
}
#endif
// Initialize step vector for conjugate gradient iterations.
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
electrodeCharges[ii] = qStep[ii] = -precGrad[ii];
}
}
KERNEL void solveLoopStep1(
GLOBAL real* RESTRICT chargeDerivatives,
GLOBAL real* RESTRICT q,
GLOBAL real* RESTRICT grad,
GLOBAL real* RESTRICT qStep,
GLOBAL real* RESTRICT gradStep,
GLOBAL real* RESTRICT grad0,
GLOBAL BlockSums1* RESTRICT blockSums1Buffer
) {
// This kernel can be executed across multiple thread blocks.
LOCAL BlockSums1 temp[TEMP_SIZE];
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
gradStep[ii] = chargeDerivatives[ii] - grad0[ii];
}
// Reduce values within each block and store results.
BlockSums1 blockSums1 = {0, 0, 0, 0, 0};
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
blockSums1.gradStepSq += gradStep[ii] * gradStep[ii];
blockSums1.qStepGradStep += qStep[ii] * gradStep[ii];
blockSums1.qStepGrad += qStep[ii] * grad[ii];
blockSums1.q += q[ii];
blockSums1.qStep += qStep[ii];
}
blockSums1 = reduceBlockSums1(blockSums1, temp);
if (LOCAL_ID == 0) {
blockSums1Buffer[GROUP_ID] = blockSums1;
}
}
KERNEL void solveLoopStep2(
GLOBAL BlockSums1* RESTRICT blockSums1Buffer,
GLOBAL int* RESTRICT convergedResult
) {
// This kernel expects to be executed in a single thread block.
LOCAL BlockSums1 tempSums[TEMP_SIZE];
// Reduce values from all blocks.
BlockSums1 blockSums1 = {0, 0, 0, 0, 0};
for (int ii = LOCAL_ID; ii < THREAD_BLOCK_COUNT; ii += LOCAL_SIZE) {
blockSums1.gradStepSq += blockSums1Buffer[ii].gradStepSq;
blockSums1.qStepGradStep += blockSums1Buffer[ii].qStepGradStep;
blockSums1.qStepGrad += blockSums1Buffer[ii].qStepGrad;
blockSums1.q += blockSums1Buffer[ii].q;
blockSums1.qStep += blockSums1Buffer[ii].qStep;
}
blockSums1 = reduceBlockSums1(blockSums1, tempSums);
if (LOCAL_ID == 0) {
blockSums1Buffer[0] = blockSums1;
// If A qStep is small enough, stop to prevent, e.g., division by zero
// in the calculation of alpha, or too large step sizes.
convergedResult[0] = (int) (blockSums1.gradStepSq <= ERROR_TARGET);
}
}
KERNEL void solveLoopStep3(
GLOBAL real* RESTRICT q,
GLOBAL real* RESTRICT grad,
GLOBAL real* RESTRICT qStep,
GLOBAL real* RESTRICT gradStep,
GLOBAL BlockSums1* RESTRICT blockSums1Buffer,
GLOBAL int* RESTRICT convergedResult
#ifdef USE_CHARGE_CONSTRAINT
, real chargeTarget
#endif
) {
// This kernel can be executed across multiple thread blocks.
if (convergedResult[0] != 0) {
return;
}
const BlockSums1 blockSums1 = blockSums1Buffer[0];
const real alpha = -blockSums1.qStepGrad / blockSums1.qStepGradStep;
// Update the charge vector.
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
q[ii] += alpha * qStep[ii];
}
#ifdef USE_CHARGE_CONSTRAINT
// Remove any accumulated drift from the charge vector. This would be zero
// in exact arithmetic, but error can accumulate over time in finite
// precision.
const real offset = (chargeTarget - (blockSums1.q + alpha * blockSums1.qStep)) / NUM_ELECTRODE_PARTICLES;
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
q[ii] += offset;
}
#endif
// Update the gradient vector. If on this iteration, the gradient is to be
// recomputed, the contents of grad will be overwritten.
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
grad[ii] += alpha * gradStep[ii];
}
}
KERNEL void solveLoopStep4(
GLOBAL real* RESTRICT grad,
GLOBAL real* RESTRICT projGrad,
GLOBAL real* RESTRICT precGrad,
GLOBAL real* RESTRICT gradStep,
GLOBAL BlockSums2* RESTRICT blockSums2Buffer,
GLOBAL int* RESTRICT convergedResult
#ifdef PRECOND_REQUESTED
, GLOBAL ACCUM* RESTRICT precondVector,
int precondActivated
#endif
) {
// This kernel expects to be executed in a single thread block.
if (convergedResult[0] != 0) {
return;
}
LOCAL volatile real temp[TEMP_SIZE];
LOCAL BlockSums2 tempSums[TEMP_SIZE];
#ifdef USE_CHARGE_CONSTRAINT
LOCAL volatile ACCUM tempAccum[TEMP_SIZE];
#endif
// Project the current gradient without preconditioning.
#ifdef USE_CHARGE_CONSTRAINT
ACCUM offsetAccum = ACCUM_ZERO;
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
offsetAccum = ACCUM_ADD(offsetAccum, grad[ii]);
}
ACCUM offset = reduceAccum(offsetAccum, tempAccum, 0, -1 / (real) NUM_ELECTRODE_PARTICLES);
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
projGrad[ii] = ACCUM_APPLY(grad[ii], offset);
}
#else
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
projGrad[ii] = grad[ii];
}
#endif
// Project the current gradient with preconditioning.
#ifdef PRECOND_REQUESTED
if (precondActivated) {
#ifdef USE_CHARGE_CONSTRAINT
offsetAccum = ACCUM_ZERO;
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
offsetAccum = ACCUM_MUL_ADD(precondVector[ii], grad[ii], offsetAccum);
}
offset = reduceAccum(offsetAccum, tempAccum, 0, -1);
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
precGrad[ii] = ACCUM_ADD_MUL(grad[ii], offset, precondVector[ii]);
}
#else
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
precGrad[ii] = ACCUM_MUL(precondVector[ii], grad[ii]);
}
#endif
}
else {
#endif
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
precGrad[ii] = projGrad[ii];
}
#ifdef PRECOND_REQUESTED
}
#endif
// Reduce values to be used by all blocks in the final kernel.
BlockSums2 blockSums2 = {0, 0, 0};
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
blockSums2.projGradSq += projGrad[ii] * projGrad[ii];
blockSums2.precGradStep += precGrad[ii] * gradStep[ii];
blockSums2.precGrad += precGrad[ii];
}
blockSums2 = reduceBlockSums2(blockSums2, tempSums);
if (LOCAL_ID == 0) {
blockSums2Buffer[0] = blockSums2;
convergedResult[0] = (int) (blockSums2.projGradSq <= ERROR_TARGET);
}
}
KERNEL void solveLoopStep5(
GLOBAL real* RESTRICT electrodeCharges,
GLOBAL real* RESTRICT precGrad,
GLOBAL real* RESTRICT qStep,
GLOBAL BlockSums1* RESTRICT blockSums1Buffer,
GLOBAL BlockSums2* RESTRICT blockSums2Buffer,
GLOBAL int* RESTRICT convergedResult
) {
// This kernel can be executed across multiple thread blocks.
if (convergedResult[0] != 0) {
return;
}
const BlockSums1 blockSums1 = blockSums1Buffer[0];
const BlockSums2 blockSums2 = blockSums2Buffer[0];
// Evaluate the conjugate gradient parameter beta.
const real beta = blockSums2.precGradStep / blockSums1.qStepGradStep;
// Update the step vector.
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
qStep[ii] = beta * qStep[ii] - precGrad[ii];
}
#ifdef USE_CHARGE_CONSTRAINT
// Project out any deviation off of the constraint plane from the step
// vector. This would be zero in exact arithmetic, but error can accumulate
// over time in finite precision.
const real offset = (beta * blockSums1.qStep - blockSums2.precGrad) / NUM_ELECTRODE_PARTICLES;
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
qStep[ii] -= offset;
}
#endif
// Prepare for the next derivative calculation.
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
electrodeCharges[ii] = qStep[ii];
}
}
// The approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 1.5e-7.
if (!isExcluded && r2 < CUTOFF_SQUARED) {
const real prefactor = ONE_4PI_EPS0 * CHARGE1 * CHARGE2 * invR;
const real alphaR = EWALD_ALPHA * r;
const real expAlphaRSqr = EXP(-alphaR * alphaR);
#ifdef USE_DOUBLE_PRECISION
const real erfcAlphaR = erfc(alphaR);
#else
const real tAlpha = RECIP(1.0f+0.3275911f*alphaR);
const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*tAlpha)*tAlpha)*tAlpha)*tAlpha)*tAlpha*expAlphaRSqr;
#endif
real tempForceScale = erfcAlphaR + TWO_OVER_SQRT_PI * alphaR * expAlphaRSqr;
real tempEnergyScale = erfcAlphaR;
if (SYSELEC1 != -1 || SYSELEC2 != -1) {
const real4 params1 = PARAMS[SYSELEC1 + 1];
const real4 params2 = PARAMS[SYSELEC2 + 1];
const real etaR = r / SQRT(params1.y * params1.y + params2.y * params2.y);
const real expEtaRSqr = EXP(-etaR * etaR);
#ifdef USE_DOUBLE_PRECISION
const real erfcEtaR = erfc(etaR);
#else
const real tEta = RECIP(1.0f+0.3275911f*etaR);
const real erfcEtaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*tEta)*tEta)*tEta)*tEta)*tEta*expEtaRSqr;
#endif
tempForceScale -= erfcEtaR + TWO_OVER_SQRT_PI * etaR * expEtaRSqr;
tempEnergyScale -= erfcEtaR;
}
tempEnergy += prefactor * tempEnergyScale;
dEdR += prefactor * tempForceScale * invR * invR;
}
const real exceptionScale = PARAMS[index];
real3 delta = make_real3(pos2.x - pos1.x, pos2.y - pos1.y, pos2.z - pos1.z);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
const real invR = RSQRT(r2);
const real tempEnergy = exceptionScale * invR;
const real tempForce = tempEnergy * invR * invR;
energy += tempEnergy;
delta *= tempForce;
real3 force1 = -delta;
real3 force2 = delta;
const real exclusionScale = PARAMS[index];
real3 delta = make_real3(pos2.x - pos1.x, pos2.y - pos1.y, pos2.z - pos1.z);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
const real r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
const real r = SQRT(r2);
const real invR = RECIP(r);
const real alphaR = EWALD_ALPHA * r;
real tempForce = 0.0f;
if (alphaR > 1e-6f) {
const real erfAlphaR = ERF(alphaR);
const real prefactor = exclusionScale * invR;
tempForce = prefactor * (erfAlphaR - TWO_OVER_SQRT_PI * alphaR * EXP(-alphaR * alphaR)) * invR * invR;
energy -= prefactor * erfAlphaR;
}
else {
energy -= TWO_OVER_SQRT_PI * EWALD_ALPHA * exclusionScale;
}
delta *= tempForce;
real3 force1 = delta;
real3 force2 = -delta;
#define WARP_SIZE 32
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define WARP_SHUFFLE(local, index) __shfl_sync(0xffffffff, local, index)
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down_sync(0xffffffff, local, offset)
#elif defined(USE_HIP)
#define WARP_SHUFFLE(local, index) __shfl(local, index)
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down(local, offset)
#endif
#ifdef WARP_SHUFFLE_DOWN
#define TEMP_SIZE WARP_SIZE
#else
#define TEMP_SIZE THREAD_BLOCK_SIZE
#endif
DEVICE real reduceValue(real value, LOCAL_ARG volatile real* temp) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value = lane < warpCount ? temp[lane] : 0;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_THREADS;
}
#endif
return temp[0];
}
KERNEL void checkSavedElectrodePositions(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT electrodePosData, GLOBAL int* RESTRICT elecToSys, GLOBAL int* RESTRICT result) {
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
real4 posqPosition = posq[elecToSys[ii]];
real4 savedPosition = electrodePosData[ii];
if (posqPosition.x != savedPosition.x || posqPosition.y != savedPosition.y || posqPosition.z != savedPosition.z) {
*result = 1;
break;
}
}
}
KERNEL void saveElectrodePositions(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT electrodePosData, GLOBAL int* RESTRICT elecToSys) {
for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) {
electrodePosData[ii] = posq[elecToSys[ii]];
}
}
KERNEL void solve(GLOBAL real* RESTRICT electrodeCharges, GLOBAL real* RESTRICT chargeDerivatives, GLOBAL real* RESTRICT capacitance
#ifdef USE_CHARGE_CONSTRAINT
, GLOBAL real* RESTRICT constraintVector, real chargeTarget
#endif
) {
// This kernel expects to be executed in a single thread block.
#if CHUNK_SIZE > 1
LOCAL volatile real chunkCharges[CHUNK_SIZE];
#endif
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
electrodeCharges[ii] = -chargeDerivatives[ii];
}
SYNC_THREADS;
// Cholesky solve step 1 (outer loop over chunks of rows).
for (int jj = 0; jj < PADDED_PROBLEM_SIZE; jj += CHUNK_SIZE) {
if (LOCAL_ID < CHUNK_SIZE) {
#if CHUNK_SIZE > 1
#ifdef WARP_SHUFFLE
real threadCharge = electrodeCharges[jj + LOCAL_ID];
for (int k = 0; k < CHUNK_SIZE - 1; k++) {
const real chargeShuffled = WARP_SHUFFLE(threadCharge, k);
if (LOCAL_ID > k) {
threadCharge -= chargeShuffled * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)];
}
}
SYNC_WARPS;
electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID] = threadCharge;
#else
chunkCharges[LOCAL_ID] = electrodeCharges[jj + LOCAL_ID];
for (int k = 0; k < CHUNK_SIZE - 1; k++) {
SYNC_WARPS;
if (LOCAL_ID > k) {
chunkCharges[LOCAL_ID] -= chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)];
}
}
SYNC_WARPS;
electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID];
#endif
#endif
}
SYNC_THREADS;
for (int ii = jj + CHUNK_SIZE + LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
#if CHUNK_SIZE > 1
real chargeOffset = 0;
for (int k = 0; k < CHUNK_SIZE; k++) {
chargeOffset += chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + ii];
}
electrodeCharges[ii] -= chargeOffset;
#else
electrodeCharges[ii] -= electrodeCharges[jj] * capacitance[(mm_long) jj * PADDED_PROBLEM_SIZE + ii];
#endif
}
SYNC_THREADS;
}
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
electrodeCharges[ii] *= capacitance[(mm_long) ii * PADDED_PROBLEM_SIZE + ii];
}
SYNC_THREADS;
// Cholesky solve step 2 (outer loop over chunks of columns).
for (int jj = PADDED_PROBLEM_SIZE - CHUNK_SIZE; jj >= 0; jj -= CHUNK_SIZE) {
if (LOCAL_ID < CHUNK_SIZE) {
#if CHUNK_SIZE > 1
#ifdef WARP_SHUFFLE
real threadCharge = electrodeCharges[jj + LOCAL_ID];
for (int k = CHUNK_SIZE - 1; k >= 0; k--) {
const real chargeShuffled = WARP_SHUFFLE(threadCharge, k);
if (LOCAL_ID < k) {
threadCharge -= chargeShuffled * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)];
}
}
SYNC_WARPS;
electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID] = threadCharge;
#else
chunkCharges[LOCAL_ID] = electrodeCharges[jj + LOCAL_ID];
for (int k = CHUNK_SIZE - 1; k >= 0; k--) {
SYNC_WARPS;
if (LOCAL_ID < k) {
chunkCharges[LOCAL_ID] -= chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)];
}
}
SYNC_WARPS;
electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID];
#endif
#endif
}
SYNC_THREADS;
for (int ii = LOCAL_ID; ii < jj; ii += LOCAL_SIZE) {
#if CHUNK_SIZE > 1
real chargeOffset = 0;
for (int k = 0; k < CHUNK_SIZE; k++) {
chargeOffset += chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + ii];
}
electrodeCharges[ii] -= chargeOffset;
#else
electrodeCharges[ii] -= electrodeCharges[jj] * capacitance[(mm_long) jj * PADDED_PROBLEM_SIZE + ii];
#endif
}
SYNC_THREADS;
}
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
electrodeCharges[ii] *= capacitance[(mm_long) ii * PADDED_PROBLEM_SIZE + ii];
}
SYNC_THREADS;
#ifdef USE_CHARGE_CONSTRAINT
LOCAL volatile real temp[TEMP_SIZE];
real chargeOffset = 0;
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
chargeOffset -= electrodeCharges[ii];
}
chargeOffset = chargeTarget + reduceValue(chargeOffset, temp);
for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) {
electrodeCharges[ii] += chargeOffset * constraintVector[ii];
}
#endif
}
KERNEL void checkSavedPositions(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT savedPositions, GLOBAL int* RESTRICT result) {
for (int i = GLOBAL_ID; i < NUM_PARTICLES; i += GLOBAL_SIZE) {
real4 posqPosition = posq[i];
real4 savedPosition = savedPositions[i];
if (posqPosition.x != savedPosition.x || posqPosition.y != savedPosition.y || posqPosition.z != savedPosition.z) {
*result = 1;
break;
}
}
}
...@@ -179,6 +179,8 @@ KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const r ...@@ -179,6 +179,8 @@ KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const r
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if (kx != 0 || ky != 0 || kz != 0) { if (kx != 0 || ky != 0 || kz != 0) {
pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm); pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
} else {
pmeGrid[index] = make_real2(0);
} }
#endif #endif
} }
...@@ -351,6 +353,77 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul ...@@ -351,6 +353,77 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
} }
} }
KERNEL void gridInterpolateChargeDerivatives(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT derivatives, GLOBAL const real* RESTRICT pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int* RESTRICT atomIndices
) {
real3 data[PME_ORDER];
const real scale = RECIP((real) (PME_ORDER-1));
for (int i = GLOBAL_ID; i < NUM_INDICES; i += GLOBAL_SIZE) {
int atom = atomIndices[i];
real derivative = 0;
real4 pos = posq[atom];
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*GRID_SIZE_X;
t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP((real) (j-1));
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Compute the charge derivative on this atom.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
derivative += dx*dy*data[iz].z*pmeGrid[ybase + zindex];
}
}
}
derivative *= EPSILON_FACTOR;
#ifdef USE_PME_STREAM
ATOMIC_ADD(&derivatives[i], (mm_ulong) realToFixedPoint(derivative));
#else
derivatives[i] += (mm_ulong) realToFixedPoint(derivative);
#endif
}
}
KERNEL void addForces(GLOBAL const real4* RESTRICT forces, GLOBAL mm_long* RESTRICT forceBuffers) { KERNEL void addForces(GLOBAL const real4* RESTRICT forces, GLOBAL mm_long* RESTRICT forceBuffers) {
for (int atom = GLOBAL_ID; atom < NUM_ATOMS; atom += GLOBAL_SIZE) { for (int atom = GLOBAL_ID; atom < NUM_ATOMS; atom += GLOBAL_SIZE) {
real4 f = forces[atom]; real4 f = forces[atom];
......
#ifndef OPENMM_CPUCONSTANTPOTENTIALFORCE_H_
#define OPENMM_CPUCONSTANTPOTENTIALFORCE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <array>
#include "CpuNeighborList.h"
#include "openmm/Kernel.h"
#include "openmm/kernels.h"
#include "openmm/internal/vectorize.h"
#include "tnt_array2d.h"
#include "jama_cholesky.h"
namespace OpenMM {
class CpuConstantPotentialForce;
/**
* A generic charge solver for the constant potential method.
*/
class CpuConstantPotentialSolver {
public:
/**
* Creates a CpuConstantPotentialSolver.
*
* @param numParticles the number of particles
* @param numElectrodeParticles the number of electrode (fluctuating-charge) particles
*/
CpuConstantPotentialSolver(int numParticles, int numElectrodeParticles);
virtual ~CpuConstantPotentialSolver();
/**
* Mark precomputed data stored by the solver as invalid due to a change in
* electrode parameters.
*/
void invalidate();
/**
* Mark any existing solution stored by the solver as invalid due to a
* change in system parameters.
*/
void discardSavedSolution();
/**
* Solves for charges if needed.
*
* @param conp CPU constant potential force implementation
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void solve(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel);
/**
* Solves for charges.
*
* @param conp CPU constant potential force implementation
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
virtual void solveImpl(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel) = 0;
protected:
int numParticles, numElectrodeParticles;
bool valid, hasSavedSolution;
Vec3 savedBoxVectors[3];
std::vector<Vec3> savedPositions;
std::vector<float> savedCharges;
};
/**
* A constant potential solver using direct inversion of the Coulomb matrix.
* Suitable only when electrode particle positions are fixed.
*/
class CpuConstantPotentialMatrixSolver : public CpuConstantPotentialSolver {
private:
Vec3 boxVectors[3];
std::vector<Vec3> electrodePosData;
JAMA::Cholesky<float> capacitance;
std::vector<float> constraintVector;
TNT::Array1D<float> b;
public:
/**
* Creates a CpuConstantPotentialMatrixSolver.
*
* @param numParticles the number of particles
* @param numElectrodeParticles the number of electrode (fluctuating-charge) particles
*/
CpuConstantPotentialMatrixSolver(int numParticles, int numElectrodeParticles);
/**
* Solves for charges.
*
* @param conp CPU constant potential force implementation
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void solveImpl(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel);
private:
/**
* Ensures that precomputed data stored by the solver is valid.
*
* @param conp CPU constant potential force implementation
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void ensureValid(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel);
};
/**
* A constant potential solver using the conjugate gradient method. Suitable
* for both fixed and variable electrode particle positions.
*/
class CpuConstantPotentialCGSolver : public CpuConstantPotentialSolver {
private:
Vec3 boxVectors[3];
bool precondRequested, precondActivated;
std::vector<double> precondVector;
std::vector<float> q;
std::vector<float> grad;
std::vector<float> projGrad;
std::vector<float> precGrad;
std::vector<float> qStep;
std::vector<float> gradStep;
std::vector<float> grad0;
std::vector<float> qLast;
public:
/**
* Creates a CpuConstantPotentialCGSolver.
*
* @param numParticles the number of particles
* @param numElectrodeParticles the number of electrode (fluctuating-charge) particles
* @param precond whether or not to use a preconditioner
*/
CpuConstantPotentialCGSolver(int numParticles, int numElectrodeParticles, bool precond);
/**
* Solves for charges.
*
* @param conp CPU constant potential force implementation
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void solveImpl(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel);
private:
/**
* Ensures that precomputed data stored by the solver is valid.
*
* @param conp CPU constant potential force implementation
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void ensureValid(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel);
};
/**
* Performs energy, force, and charge derivative calculations for the CPU kernel
* for ConstantPotentialForce.
*/
class CpuConstantPotentialForce {
friend class CpuConstantPotentialSolver;
friend class CpuConstantPotentialMatrixSolver;
friend class CpuConstantPotentialCGSolver;
public:
CpuConstantPotentialForce();
virtual ~CpuConstantPotentialForce();
/**
* Initializes CpuConstantPotentialForce with data that cannot vary after
* context creation, or pointers to data that can vary.
*
* @param numParticles the total number of particles
* @param numElectrodeParticles the number of electrode (fluctuating-charge) particles
* @param posqIndex index to identify charges loaded into the posq array
* @param nonbondedCutoff direct space cutoff
* @param ewaldAlpha Ewald reciprocal Gaussian width parameter
* @param cgErrorTol constant potential conjugate gradient error tolerance
* @param gridSize Ewald mesh dimensions
* @param exceptionsArePeriodic whether or not exceptions use periodic boundary conditions
* @param useChargeConstraint whether or not to constrain total charge
* @param neighborList neighbor list for direct space calculation
* @param solver charge solver implementation
* @param exclusions particle exclusions
* @param sysToElec mapping from system particle indices to electrode particle indices
* @param elecToSys mapping from electrode particle indices to system particle indices
* @param sysElec mapping from system particle indices to electrode indices
* @param elecElec mapping from electrode particle indices to electrode indices
* @param electrodeParams electrode parameters
* @param chargeTarget target sum of charges on electrode particles only
* @param externalField electric field vector
*/
void initialize(int numParticles, int numElectrodeParticles, int posqIndex,
float nonbondedCutoff, float ewaldAlpha, float cgErrorTol,
const int* gridSize, bool exceptionsArePeriodic,
bool useChargeConstraint, const CpuNeighborList& neighborList,
CpuConstantPotentialSolver* solver,
const std::vector<std::set<int> >& exclusions,
const std::vector<int>& sysToElec, const std::vector<int>& elecToSys,
const std::vector<int>& sysElec, const std::vector<int>& elecElec,
const std::vector<std::array<double, 3> >& electrodeParams,
float chargeTarget, const float* externalField);
/**
* Updates CpuConstantPotentialForce after data has been modified.
*
* @param chargeTarget target sum of charges on electrode particles only
* @param externalField electric field vector
* @param firstElectrode index of the first electrode whose parameters may have been modified
* @param lastElectrode index of the last electrode whose parameters may have been modified
*/
void update(float chargeTarget, const float* externalField,
int firstElectrode, int lastElectrode);
/**
* Solves for charges and computes energies and forces.
*
* @param boxVectors periodic box vectors
* @param posData particle positions
* @param charges output particle charges
* @param posq wrapped single-precision position and charge array
* @param threadForce per-thread force accumulation arrays
* @param energy output system energy
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void execute(const Vec3* boxVectors, const std::vector<Vec3>& posData,
std::vector<float>& charges, float* posq,
std::vector<AlignedArray<float> >& threadForce, double* energy,
ThreadPool& threads, Kernel& pmeKernel);
/**
* Solves for charges without computing energies and forces.
*
* @param boxVectors periodic box vectors
* @param posData particle positions
* @param charges output particle charges
* @param posq wrapped single-precision position and charge array
* @param threadForce per-thread force accumulation arrays
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void getCharges(const Vec3* boxVectors, const std::vector<Vec3>& posData,
std::vector<float>& charges, float* posq,
std::vector<AlignedArray<float> >& threadForce, ThreadPool& threads,
Kernel& pmeKernel);
protected:
/**
* Precomputes values and updates pointers to data accessed by threads.
*
* @param boxVectors periodic box vectors
* @param posData particle positions
* @param posq wrapped single-precision position and charge array
* @param threadForce per-thread force accumulation arrays
*/
void setThreadData(const Vec3* boxVectors, const std::vector<Vec3>& posData, float* posq, std::vector<AlignedArray<float> >& threadForce);
/**
* Extracts electrode particle charges from posq and places them in the
* output charge array.
*
* @param charges output particle charges
*/
void saveCharges(std::vector<float>& charges);
/**
* Computes energies and forces for fixed (solved) charges.
*
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
* @param energy output system energy
*/
void getEnergyForces(ThreadPool& threads, Kernel& pmeKernel, double* energy);
/**
* Thread worker for getEnergyForcesThread().
*
* @param threads thread pool for parallel evaluation
* @param threadIndex index of the thread in the thread pool
* @param energy whether or not to compute system energy
*/
void getEnergyForcesThread(ThreadPool& threads, int threadIndex, bool includeEnergy);
/**
* Computes energy derivatives with respect to charges.
*
* @param threads thread pool for parallel evaluation
* @param pmeKernel CPU PME solver kernel
*/
void getDerivatives(ThreadPool& threads, Kernel& pmeKernel);
/**
* Thread worker for getDerivatives().
*
* @param threads thread pool for parallel evaluation
* @param threadIndex index of the thread in the thread pool
* @param plasmaTerm precomputed neutralizing plasma term for derivatives
*/
void getDerivativesThread(ThreadPool& threads, int threadIndex, float plasmaTerm);
/**
* Calculates a displacement between a pair of exception particles.
*
* @param posI the position of the first particle
* @param posJ the position of the second particle
* @param deltaR the displacement vector from particle I to particle J
* @param r2 the square magnitude of the displacement vector
*/
void getExceptionDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2) const;
/**
* Computes the direct space contribution to energies and forces for one
* block of particles.
*
* @param blockIndex the index of the block
* @param forces output forces
* @param energy output energy
*/
virtual void getEnergyForcesBlock(int blockIndex, float* forces, double* energy) = 0;
/**
* Computes the direct space contribution to charge derivatives for one
* block of particles.
*
* @param blockIndex the index of the block
* @param derivatives output derivatives
*/
virtual void getDerivativesBlock(int blockIndex, float* derivatives) = 0;
private:
/**
* Initializes energy and force lookup tables for a pair of electrodes.
*
* @param ie the index of the first electrode
* @param je the index of the second electrode
*/
void initializeLookupTables(int ie, int je);
protected:
static const int PotentialIndex;
static const int GaussianWidthIndex;
static const int ThomasFermiScaleIndex;
static const int NUM_TABLE_POINTS;
static const double TWO_OVER_SQRT_PI;
static const double SELF_ALPHA_SCALE;
static const double SELF_ETA_SCALE;
static const double SELF_TF_SCALE;
int numParticles, numElectrodeParticles, numElectrodes, posqIndex;
float nonbondedCutoff, ewaldAlpha, cgErrorTol;
int gridSize[3];
bool exceptionsArePeriodic, useChargeConstraint;
const CpuNeighborList* neighborList;
CpuConstantPotentialSolver* solver;
const std::set<int>* exclusions;
const int* sysToElec;
const int* elecToSys;
const int* sysElec;
const int* elecElec;
const std::array<double, 3>* electrodeParams;
std::vector<float> electrodePotentials;
std::vector<float> electrodeSelfScales;
float chargeTarget;
fvec4 externalField;
Vec3 boxVectors[3];
AlignedArray<fvec4> boxVectorsVec4;
fvec4 boxSize;
fvec4 recipBoxSize;
bool triclinic;
float plasmaScale;
const Vec3* posData;
float* posq;
std::vector<double> threadEnergy;
std::vector<AlignedArray<float> >* threadForce;
std::vector<float> chargeDerivatives;
std::vector<float> energyLookupTable;
std::vector<float> forceLookupTable;
float tableScale;
std::atomic<int> atomicBlockCounter, atomicParticleCounter;
};
class CpuConstantPotentialPmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
CpuConstantPotentialPmeIO(float* posq, float* force, float* chargeDerivatives, int numParticles, int numElectrodeParticles);
float* getPosq();
void setForce(float* f);
void setChargeDerivatives(float* derivatives);
private:
float* posq;
float* force;
float* chargeDerivatives;
int numParticles;
int numElectrodeParticles;
};
} // namespace OpenMM
#endif // OPENMM_CPUCONSTANTPOTENTIALFORCE_H_
#ifndef OPENMM_CPUCONSTANTPOTENTIALFORCEFVEC_H_
#define OPENMM_CPUCONSTANTPOTENTIALFORCEFVEC_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuConstantPotentialForce.h"
#include "SimTKOpenMMRealType.h"
namespace OpenMM {
enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
template<typename FVEC, typename IVEC>
class CpuConstantPotentialForceFvec : public CpuConstantPotentialForce {
public:
CpuConstantPotentialForceFvec();
protected:
/**
* Computes the direct space contribution to energies and forces for one
* block of particles.
*
* @param blockIndex the index of the block
* @param forces output forces
* @param energy output energy
*/
void getEnergyForcesBlock(int blockIndex, float* forces, double* energy);
/**
* Computes the direct space contribution to charge derivatives for one
* block of particles.
*
* @param blockIndex the index of the block
* @param derivatives output derivatives
*/
void getDerivativesBlock(int blockIndex, float* derivatives);
/**
* Compute an approximation of a function using a table lookup, where an
* offset into the table can be provided for each value
*
* @param table start of tabulated values
* @param x values at which to evaluate the tabulated function
* @param tableOffsets offsets into the tabulated value array for each value
*/
FVEC tableLookup(float const * table, FVEC x, IVEC tableOffsets);
private:
static constexpr int blockSize = sizeof(FVEC) / sizeof(float);
template<PeriodicType PERIODIC_TYPE>
void getEnergyForcesBlockImpl(int blockIndex, float* forces, double* energy, fvec4& blockCenter);
template<PeriodicType PERIODIC_TYPE>
void getDerivativesBlockImpl(int blockIndex, float* derivatives, fvec4& blockCenter);
void getBlockPeriodicType(int blockIndex, PeriodicType& periodicType, fvec4& blockCenter);
template<PeriodicType PERIODIC_TYPE>
void getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2) const;
};
template<typename FVEC, typename IVEC>
CpuConstantPotentialForceFvec<FVEC, IVEC>::CpuConstantPotentialForceFvec() : CpuConstantPotentialForce() {
}
template<typename FVEC, typename IVEC>
void CpuConstantPotentialForceFvec<FVEC, IVEC>::getEnergyForcesBlock(int blockIndex, float* forces, double* energy) {
PeriodicType periodicType;
fvec4 blockCenter;
getBlockPeriodicType(blockIndex, periodicType, blockCenter);
if (periodicType == NoPeriodic) {
getEnergyForcesBlockImpl<NoPeriodic>(blockIndex, forces, energy, blockCenter);
}
else if (periodicType == PeriodicPerAtom) {
getEnergyForcesBlockImpl<PeriodicPerAtom>(blockIndex, forces, energy, blockCenter);
}
else if (periodicType == PeriodicPerInteraction) {
getEnergyForcesBlockImpl<PeriodicPerInteraction>(blockIndex, forces, energy, blockCenter);
}
else if (periodicType == PeriodicTriclinic) {
getEnergyForcesBlockImpl<PeriodicTriclinic>(blockIndex, forces, energy, blockCenter);
}
}
template<typename FVEC, typename IVEC>
void CpuConstantPotentialForceFvec<FVEC, IVEC>::getDerivativesBlock(int blockIndex, float* derivatives) {
PeriodicType periodicType;
fvec4 blockCenter;
getBlockPeriodicType(blockIndex, periodicType, blockCenter);
if (periodicType == NoPeriodic) {
getDerivativesBlockImpl<NoPeriodic>(blockIndex, derivatives, blockCenter);
}
else if (periodicType == PeriodicPerAtom) {
getDerivativesBlockImpl<PeriodicPerAtom>(blockIndex, derivatives, blockCenter);
}
else if (periodicType == PeriodicPerInteraction) {
getDerivativesBlockImpl<PeriodicPerInteraction>(blockIndex, derivatives, blockCenter);
}
else if (periodicType == PeriodicTriclinic) {
getDerivativesBlockImpl<PeriodicTriclinic>(blockIndex, derivatives, blockCenter);
}
}
template<typename FVEC, typename IVEC>
FVEC CpuConstantPotentialForceFvec<FVEC, IVEC>::tableLookup(float const * table, FVEC x, IVEC tableOffsets) {
FVEC offset = x * FVEC(tableScale);
FVEC index = min(floor(offset), (float)NUM_TABLE_POINTS);
FVEC s1, s2;
gatherVecPair(table, index + tableOffsets, s1, s2);
FVEC coeff = offset - index;
return (s1 - coeff * s1) + coeff * s2;
}
template<typename FVEC, typename IVEC>
template<PeriodicType PERIODIC_TYPE>
void CpuConstantPotentialForceFvec<FVEC, IVEC>::getEnergyForcesBlockImpl(int blockIndex, float* forces, double* energy, fvec4& blockCenter) {
const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
fvec4 blockAtomPosq[blockSize];
FVEC blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
FVEC blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int k = 0; k < blockSize; k++) {
blockAtomPosq[k] = fvec4(posq + 4 * blockAtom[k]);
if (PERIODIC_TYPE == PeriodicPerAtom) {
blockAtomPosq[k] -= floor((blockAtomPosq[k] - blockCenter) * recipBoxSize + 0.5f) * boxSize;
}
}
transpose(blockAtomPosq, blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0;
FVEC cutoffSquared(nonbondedCutoff * nonbondedCutoff);
// Get offsets into tables for block atoms.
int tableOffsetsArray[blockSize];
for (int k = 0; k < blockSize; k++) {
tableOffsetsArray[k] = (sysElec[blockAtom[k]] + 1) * (numElectrodes + 1) * (NUM_TABLE_POINTS + 4);
}
IVEC tableOffsets(tableOffsetsArray);
CpuNeighborList::NeighborIterator neighbors = neighborList->getNeighborIterator(blockIndex);
FVEC neighborsEnergy(0.0f);
while (neighbors.next()) {
int atom = neighbors.getNeighbor();
FVEC dx, dy, dz, r2;
fvec4 atomPos(posq + 4 * atom);
if (PERIODIC_TYPE == PeriodicPerAtom) {
atomPos -= floor((atomPos - blockCenter) * recipBoxSize + 0.5f) * boxSize;
}
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2);
auto include = FVEC::expandBitsToMask(~neighbors.getExclusions());
include = blendZero(r2 < cutoffSquared, include);
if (!any(include)) {
continue;
}
// Get offset into tables for single atom.
int jOffset = (sysElec[atom] + 1) * (NUM_TABLE_POINTS + 4);
FVEC inverseR = rsqrt(r2);
FVEC r = r2 * inverseR;
FVEC qqFactor = blockAtomCharge * posq[4 * atom + 3];
FVEC qqInverseR = qqFactor * inverseR;
FVEC forceFactor = qqInverseR * inverseR * inverseR * tableLookup(&forceLookupTable[jOffset], r, tableOffsets);
// Accumulate forces for block atoms.
forceFactor = blendZero(forceFactor, include);
FVEC forceX = dx * forceFactor;
FVEC forceY = dy * forceFactor;
FVEC forceZ = dz * forceFactor;
blockAtomForceX += forceX;
blockAtomForceY += forceY;
blockAtomForceZ += forceZ;
// Apply forces to single atom.
float* atomForce = forces + 4 * atom;
fvec4 newAtomForce = fvec4(atomForce) - reduceToVec3(forceX, forceY, forceZ);
newAtomForce.store(atomForce);
if (energy) {
neighborsEnergy += blendZero(qqInverseR * tableLookup(&energyLookupTable[jOffset], r, tableOffsets), include);
}
}
if (energy) {
*energy += reduceAdd(neighborsEnergy);
}
// Apply forces to block atoms.
fvec4 f[blockSize];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f);
for (int k = 0; k < blockSize; k++) {
(fvec4(forces + 4 * blockAtom[k]) + f[k]).store(forces + 4 * blockAtom[k]);
}
}
template<typename FVEC, typename IVEC>
template<PeriodicType PERIODIC_TYPE>
void CpuConstantPotentialForceFvec<FVEC, IVEC>::getDerivativesBlockImpl(int blockIndex, float* derivatives, fvec4& blockCenter) {
const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
fvec4 blockAtomPosq[blockSize];
FVEC blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
FVEC blockAtomDerivatives(0.0f);
for (int k = 0; k < blockSize; k++) {
blockAtomPosq[k] = fvec4(posq + 4 * blockAtom[k]);
if (PERIODIC_TYPE == PeriodicPerAtom) {
blockAtomPosq[k] -= floor((blockAtomPosq[k] - blockCenter) * recipBoxSize + 0.5f) * boxSize;
}
}
transpose(blockAtomPosq, blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
FVEC cutoffSquared(nonbondedCutoff * nonbondedCutoff);
// Get offsets into tables for block atoms.
int iiArray[blockSize];
int tableOffsetsArray[blockSize];
for (int k = 0; k < blockSize; k++) {
iiArray[k] = sysToElec[blockAtom[k]];
tableOffsetsArray[k] = (sysElec[blockAtom[k]] + 1) * (numElectrodes + 1) * (NUM_TABLE_POINTS + 4);
}
IVEC ii(iiArray);
IVEC tableOffsets(tableOffsetsArray);
// Check if any block atoms are electrode atoms.
IVEC iiInclude = ii != IVEC(-1);
bool iiIncludeAny = any(iiInclude);
CpuNeighborList::NeighborIterator neighbors = neighborList->getNeighborIterator(blockIndex);
while (neighbors.next()) {
int atom = neighbors.getNeighbor();
FVEC dx, dy, dz, r2;
fvec4 atomPos(posq + 4 * atom);
if (PERIODIC_TYPE == PeriodicPerAtom) {
atomPos -= floor((atomPos - blockCenter) * recipBoxSize + 0.5f) * boxSize;
}
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2);
auto include = FVEC::expandBitsToMask(~neighbors.getExclusions());
include = blendZero(r2 < cutoffSquared, include);
if (!any(include)) {
continue;
}
// Derivatives must be evaluated if any block atoms are electrode atoms
// OR the single atom is an electrode atom.
int jj = sysToElec[atom];
bool jjInclude = jj != -1;
if (!(iiIncludeAny || jjInclude)) {
continue;
}
// Get offset into tables for single atom.
int jOffset = (sysElec[atom] + 1) * (NUM_TABLE_POINTS + 4);
FVEC inverseR = rsqrt(r2);
FVEC r = r2 * inverseR;
FVEC dq = blendZero(ONE_4PI_EPS0 * inverseR * tableLookup(&energyLookupTable[jOffset], r, tableOffsets), include);
// Accumulate derivatives for block atoms.
blockAtomDerivatives += blendZero(posq[4 * atom + 3] * dq, iiInclude);
// Apply derivatives to single atom.
if (jjInclude) {
derivatives[4 * jj] += reduceAdd(blockAtomCharge * dq);
}
}
// Apply derivatives to block atoms.
float blockAtomDerivativesArray[blockSize];
blockAtomDerivatives.store(blockAtomDerivativesArray);
for (int k = 0; k < blockSize; k++) {
derivatives[4 * iiArray[k]] += blockAtomDerivativesArray[k];
}
}
template<typename FVEC, typename IVEC>
void CpuConstantPotentialForceFvec<FVEC, IVEC>::getBlockPeriodicType(int blockIndex, PeriodicType& periodicType, fvec4& blockCenter) {
const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize * blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4 * blockAtom[0]];
miny = maxy = posq[4 * blockAtom[0] + 1];
minz = maxz = posq[4 * blockAtom[0] + 2];
for (int i = 1; i < blockSize; i++) {
minx = std::min(minx, posq[4 * blockAtom[i]]);
maxx = std::max(maxx, posq[4 * blockAtom[i]]);
miny = std::min(miny, posq[4 * blockAtom[i] + 1]);
maxy = std::max(maxy, posq[4 * blockAtom[i] + 1]);
minz = std::min(minz, posq[4 * blockAtom[i] + 2]);
maxz = std::max(maxz, posq[4 * blockAtom[i] + 2]);
}
blockCenter = fvec4(0.5f * (minx + maxx), 0.5f * (miny + maxy), 0.5f * (minz + maxz), 0.0f);
if (!(minx < nonbondedCutoff || miny < nonbondedCutoff || minz < nonbondedCutoff ||
maxx > boxSize[0] - nonbondedCutoff || maxy > boxSize[1] - nonbondedCutoff || maxz > boxSize[2] - nonbondedCutoff)) {
periodicType = NoPeriodic;
}
else if (triclinic) {
periodicType = PeriodicTriclinic;
}
else if (0.5f * (boxSize[0] - (maxx - minx)) >= nonbondedCutoff &&
0.5f * (boxSize[1] - (maxy - miny)) >= nonbondedCutoff &&
0.5f * (boxSize[2] - (maxz - minz)) >= nonbondedCutoff) {
periodicType = PeriodicPerAtom;
}
else {
periodicType = PeriodicPerInteraction;
}
}
template<typename FVEC, typename IVEC>
template<PeriodicType PERIODIC_TYPE>
void CpuConstantPotentialForceFvec<FVEC, IVEC>::getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2) const {
dx = x - posI[0];
dy = y - posI[1];
dz = z - posI[2];
if (PERIODIC_TYPE == PeriodicTriclinic) {
const auto scale3 = floor(dz * recipBoxSize[2] + 0.5f);
dx -= scale3 * boxVectorsVec4[2][0];
dy -= scale3 * boxVectorsVec4[2][1];
dz -= scale3 * boxVectorsVec4[2][2];
const auto scale2 = floor(dy * recipBoxSize[1] + 0.5f);
dx -= scale2 * boxVectorsVec4[1][0];
dy -= scale2 * boxVectorsVec4[1][1];
const auto scale1 = floor(dx * recipBoxSize[0] + 0.5f);
dx -= scale1 * boxVectorsVec4[0][0];
}
else if (PERIODIC_TYPE == PeriodicPerInteraction) {
dx -= round(dx * recipBoxSize[0]) * boxSize[0];
dy -= round(dy * recipBoxSize[1]) * boxSize[1];
dz -= round(dz * recipBoxSize[2]) * boxSize[2];
}
r2 = dx * dx + dy * dy + dz * dz;
}
} // namespace OpenMM
#endif // OPENMM_CPUCONSTANTPOTENTIALFORCEFVEC_H_
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013-2024 Stanford University and the Authors. * * Portions copyright (c) 2013-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "CpuBondForce.h" #include "CpuBondForce.h"
#include "CpuConstantPotentialForce.h"
#include "CpuCustomGBForce.h" #include "CpuCustomGBForce.h"
#include "CpuCustomManyParticleForce.h" #include "CpuCustomManyParticleForce.h"
#include "CpuCustomNonbondedForce.h" #include "CpuCustomNonbondedForce.h"
...@@ -322,6 +323,83 @@ private: ...@@ -322,6 +323,83 @@ private:
CpuBondForce bondForce; CpuBondForce bondForce;
}; };
/**
* This kernel is invoked by ConstantPotentialForce to calculate the forces acting on the system.
*/
class CpuCalcConstantPotentialForceKernel : public CalcConstantPotentialForceKernel {
public:
CpuCalcConstantPotentialForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data);
~CpuCalcConstantPotentialForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the ConstantPotentialForce this kernel will be used for
*/
void initialize(const System& system, const ConstantPotentialForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the ConstantPotentialForce to copy the parameters from
* @param firstParticle the index of the first particle whose parameters might have changed
* @param lastParticle the index of the last particle whose parameters might have changed
* @param firstException the index of the first exception whose parameters might have changed
* @param lastException the index of the last exception whose parameters might have changed
* @param firstElectrode the index of the first electrode whose parameters might have changed
* @param lastElectrode the index of the last electrode whose parameters might have changed
*/
void copyParametersToContext(ContextImpl& context, const ConstantPotentialForce& force, int firstParticle, int lastParticle, int firstException, int lastException, int firstElectrode, int lastElectrode);
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the charges on all particles.
*
* @param context the context to copy parameters to
* @param[out] charges a vector to populate with particle charges
*/
void getCharges(ContextImpl& context, std::vector<double>& charges);
private:
void checkBoxSize(const Vec3* boxVectors);
void ensurePmeInitialized(ContextImpl& context);
private:
CpuPlatform::PlatformData& data;
int numParticles, num14, numElectrodeParticles, chargePosqIndex;
std::vector<double> setCharges;
std::vector<float> charges;
std::vector<std::vector<double> > bonded14ParamArray;
std::vector<std::vector<int> > bonded14IndexArray;
std::map<int, int> nb14Index;
std::vector<std::set<int> > exclusions;
std::vector<int> sysToElec, elecToSys, sysElec, elecElec;
std::vector<std::array<double, 3> > electrodeParams;
double nonbondedCutoff, ewaldAlpha, cgErrorTol, chargeTarget;
int gridSize[3];
bool exceptionsArePeriodic, hasInitializedPme, useChargeConstraint;
Vec3 externalField;
CpuConstantPotentialForce* constantPotential;
CpuConstantPotentialSolver* solver;
CpuBondForce bondForce;
Kernel pmeKernel;
};
/** /**
* This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system. * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
*/ */
......
...@@ -8,10 +8,14 @@ ENDFOREACH(file) ...@@ -8,10 +8,14 @@ ENDFOREACH(file)
IF(MSVC) IF(MSVC)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__") SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX2 /D__AVX2__") SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX2 /D__AVX2__")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuConstantPotentialForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuConstantPotentialForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX2 /D__AVX2__")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__") SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
ELSEIF(X86) ELSEIF(X86)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx") SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx2 -mfma") SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx2 -mfma")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuConstantPotentialForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuConstantPotentialForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx2 -mfma")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx") SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
ENDIF() ENDIF()
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuConstantPotentialForce.h"
#include "SimTKOpenMMRealType.h"
#include <algorithm>
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
#include "openmm/internal/MSVC_erfc.h"
using namespace std;
using namespace OpenMM;
CpuConstantPotentialSolver::CpuConstantPotentialSolver(int numParticles, int numElectrodeParticles) :
numParticles(numParticles),
numElectrodeParticles(numElectrodeParticles),
valid(false),
hasSavedSolution(false),
savedPositions(numParticles),
savedCharges(numElectrodeParticles)
{
}
CpuConstantPotentialSolver::~CpuConstantPotentialSolver() {
}
void CpuConstantPotentialSolver::invalidate() {
valid = false;
hasSavedSolution = false;
}
void CpuConstantPotentialSolver::discardSavedSolution() {
hasSavedSolution = false;
}
void CpuConstantPotentialSolver::solve(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel) {
// There is nothing to do if all particles have fixed charges.
if(!numElectrodeParticles) {
return;
}
// If box vectors or positions have not changed, and there is a solution
// already computed, we can simply reload it instead of solving again.
if (hasSavedSolution) {
if (savedBoxVectors[0] != conp.boxVectors[0] || savedBoxVectors[1] != conp.boxVectors[1] || savedBoxVectors[2] != conp.boxVectors[2]) {
hasSavedSolution = false;
}
}
if (hasSavedSolution) {
for (int i = 0; i < numParticles; i++) {
if (savedPositions[i] != conp.posData[i]) {
hasSavedSolution = false;
break;
}
}
}
if (hasSavedSolution) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] = savedCharges[ii];
}
return;
}
solveImpl(conp, threads, pmeKernel);
hasSavedSolution = true;
savedBoxVectors[0] = conp.boxVectors[0];
savedBoxVectors[1] = conp.boxVectors[1];
savedBoxVectors[2] = conp.boxVectors[2];
savedPositions.assign(static_cast<const Vec3*>(conp.posData), static_cast<const Vec3*>(conp.posData + numParticles));
for (int ii = 0; ii < numElectrodeParticles; ii++) {
savedCharges[ii] = conp.posq[4 * conp.elecToSys[ii] + 3];
}
}
CpuConstantPotentialMatrixSolver::CpuConstantPotentialMatrixSolver(int numParticles, int numElectrodeParticles) : CpuConstantPotentialSolver(numParticles, numElectrodeParticles),
electrodePosData(numElectrodeParticles), constraintVector(numElectrodeParticles), b(numElectrodeParticles) {
}
void CpuConstantPotentialMatrixSolver::solveImpl(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel) {
ensureValid(conp, threads, pmeKernel);
// Zero electrode charges and get derivatives at zero charge.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] = 0.0f;
}
conp.getDerivatives(threads, pmeKernel);
for (int ii = 0; ii < numElectrodeParticles; ii++) {
b[ii] = -conp.chargeDerivatives[ii];
}
// Solve for electrode charges directly using capacitance matrix and
// calculated derivatives.
TNT::Array1D<float> q = capacitance.solve(b);
// Enforce total charge constraint if requested.
if (conp.useChargeConstraint) {
float chargeOffset = conp.chargeTarget;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
chargeOffset -= q[ii];
}
for (int ii = 0; ii < numElectrodeParticles; ii++) {
q[ii] += chargeOffset * constraintVector[ii];
}
}
// Copy solution out of solver memory into posq.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] = q[ii];
}
}
void CpuConstantPotentialMatrixSolver::ensureValid(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel) {
// Initializes or updates the precomputed capacitance matrix if this is its
// first use or electrode parameters have changed since its initialization.
// In this method, we modify the current electrode charges in posq. This
// is fine since it will only get called at the start of solve(), which
// overwrites the electrode charges in posq after solving for them.
// Check for changes to box vectors or electrode positions that might
// invalidate a matrix that is currently marked valid.
if (valid) {
if (boxVectors[0] != conp.boxVectors[0] || boxVectors[1] != conp.boxVectors[1] || boxVectors[2] != conp.boxVectors[2]) {
valid = false;
}
}
if (valid) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
if (electrodePosData[ii] != conp.posData[conp.elecToSys[ii]]) {
valid = false;
break;
}
}
}
if (valid) {
return;
}
// Store the current box vectors and electrode positions before updating the
// capacitance matrix.
valid = true;
boxVectors[0] = conp.boxVectors[0];
boxVectors[1] = conp.boxVectors[1];
boxVectors[2] = conp.boxVectors[2];
for (int ii = 0; ii < numElectrodeParticles; ii++) {
electrodePosData[ii] = conp.posData[conp.elecToSys[ii]];
}
TNT::Array2D<float> A(numElectrodeParticles, numElectrodeParticles);
vector<float> dUdQ0(numElectrodeParticles);
vector<float> dUdQ(numElectrodeParticles);
// Get derivatives when all electrode charges are zeroed.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] = 0.0f;
}
conp.getDerivatives(threads, pmeKernel);
dUdQ0 = conp.chargeDerivatives;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
int i = conp.elecToSys[ii];
// Get derivatives when one electrode charge is set.
conp.posq[4 * i + 3] = 1.0f;
conp.getDerivatives(threads, pmeKernel);
dUdQ = conp.chargeDerivatives;
conp.posq[4 * i + 3] = 0.0f;
// Set matrix elements, subtracting zero charge derivatives so that the
// matrix will end up being the (charge-independent) Hessian.
for (int jj = 0; jj < ii; jj++) {
A[ii][jj] = A[jj][ii] = dUdQ[jj] - dUdQ0[jj];
}
A[ii][ii] = dUdQ[ii] - dUdQ0[ii];
}
// Compute Cholesky decomposition representation of the inverse.
capacitance = JAMA::Cholesky<float>(A);
if (!capacitance.is_spd()) {
throw OpenMMException("Electrode matrix not positive definite");
}
// Precompute the appropriate scaling vector to enforce constant total
// charge if requested. The vector is parallel to one obtained by solving
// Aq = b for all q_i = 1 (ensuring the constrained charges will actually be
// the correct constrained minimum of the quadratic form for the energy),
// and is scaled so that adding it to a vector of charges increases the
// total charge by 1 (making it easy to calculate the necessary offset).
if (conp.useChargeConstraint) {
TNT::Array1D<float> solution = capacitance.solve(TNT::Array1D<float>(numElectrodeParticles, 1.0));
constraintVector.assign(static_cast<float*>(solution), static_cast<float*>(solution) + numElectrodeParticles);
float constraintScaleInv = 0.0;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
constraintScaleInv += constraintVector[ii];
}
float constraintScale = 1.0 / constraintScaleInv;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
constraintVector[ii] *= constraintScale;
}
}
}
CpuConstantPotentialCGSolver::CpuConstantPotentialCGSolver(int numParticles,int numElectrodeParticles, bool precond) : CpuConstantPotentialSolver(numParticles, numElectrodeParticles),
precondRequested(precond),
precondVector(numElectrodeParticles),
q(numElectrodeParticles),
grad(numElectrodeParticles),
projGrad(numElectrodeParticles),
precGrad(numElectrodeParticles),
qStep(numElectrodeParticles),
gradStep(numElectrodeParticles),
grad0(numElectrodeParticles),
qLast(numElectrodeParticles)
{
}
void CpuConstantPotentialCGSolver::solveImpl(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel) {
ensureValid(conp, threads, pmeKernel);
double offset;
float error, paramScale, alpha, beta;
const float errorTarget = conp.cgErrorTol * conp.cgErrorTol * numElectrodeParticles;
// Set initial guess charges as linear extrapolations from the current and
// previous charges fed through the solver, and save the current charges as
// the previous charges.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
int i = conp.elecToSys[ii];
float qGuess = conp.posq[4 * i + 3];
conp.posq[4 * i + 3] = 2.0f * qGuess - qLast[ii];
qLast[ii] = qGuess;
}
// Ensure that initial guess charges satisfy the constraint.
if (conp.useChargeConstraint) {
offset = conp.chargeTarget;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
offset -= conp.posq[4 * conp.elecToSys[ii] + 3];
}
offset /= numElectrodeParticles;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] += offset;
}
}
// Evaluate the initial gradient Aq - b.
conp.getDerivatives(threads, pmeKernel);
grad.assign(&conp.chargeDerivatives[0], &conp.chargeDerivatives[numElectrodeParticles]);
// Project the initial gradient without preconditioning.
offset = 0.0;
if (conp.useChargeConstraint) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
offset += grad[ii];
}
offset /= numElectrodeParticles;
}
for (int ii = 0; ii < numElectrodeParticles; ii++) {
projGrad[ii] = grad[ii] - offset;
}
// Check for convergence at the initial guess charges.
error = 0.0f;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
error += projGrad[ii] * projGrad[ii];
}
if (error <= errorTarget) {
return;
}
// Save the current charges, then evaluate the gradient with zero
// charges (-b) so that we can later compute Ap as (Ap - b) - (-b).
for (int ii = 0; ii < numElectrodeParticles; ii++) {
int i = conp.elecToSys[ii];
q[ii] = conp.posq[4 * i + 3];
conp.posq[4 * i + 3] = 0.0f;
}
conp.getDerivatives(threads, pmeKernel);
grad0.assign(&conp.chargeDerivatives[0], &conp.chargeDerivatives[numElectrodeParticles]);
// Project the initial gradient with preconditioning.
if (precondActivated) {
offset = 0.0;
if (conp.useChargeConstraint) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
offset += precondVector[ii] * grad[ii];
}
}
for (int ii = 0; ii < numElectrodeParticles; ii++) {
precGrad[ii] = precondVector[ii] * (grad[ii] - offset);
}
}
else {
precGrad.assign(projGrad.begin(), projGrad.end());
}
// Initialize step vector for conjugate gradient iterations.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
qStep[ii] = -precGrad[ii];
}
// Perform conjugate gradient iterations.
bool converged = false;
for (int iter = 0; iter <= numElectrodeParticles; iter++) {
// Evaluate the matrix-vector product A qStep.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] = qStep[ii];
}
conp.getDerivatives(threads, pmeKernel);
gradStep.assign(&conp.chargeDerivatives[0], &conp.chargeDerivatives[numElectrodeParticles]);
for (int ii = 0; ii < numElectrodeParticles; ii++) {
gradStep[ii] -= grad0[ii];
}
// If A qStep is small enough, stop to prevent, e.g., division by
// zero in the calculation of alpha, or too large step sizes.
error = 0.0f;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
error += gradStep[ii] * gradStep[ii];
}
if (error <= errorTarget) {
converged = true;
break;
}
// Evaluate the scalar 1 / (qStep^T A qStep).
paramScale = 0.0f;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
paramScale += qStep[ii] * gradStep[ii];
}
paramScale = 1.0f / paramScale;
// Evaluate the conjugate gradient parameter alpha.
alpha = 0.0f;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
alpha -= qStep[ii] * grad[ii];
}
alpha *= paramScale;
// Update the charge vector.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
q[ii] += alpha * qStep[ii];
}
if (conp.useChargeConstraint) {
// Remove any accumulated drift from the charge vector. This
// would be zero in exact arithmetic, but error can accumulate
// over time in finite precision.
offset = conp.chargeTarget;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
offset -= q[ii];
}
offset /= numElectrodeParticles;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
q[ii] += offset;
}
}
// Update the gradient vector (but periodically recompute it instead
// of updating to reduce the accumulation of roundoff error).
if (iter != 0 && iter % 32 == 0) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] = q[ii];
}
conp.getDerivatives(threads, pmeKernel);
grad.assign(&conp.chargeDerivatives[0], &conp.chargeDerivatives[numElectrodeParticles]);
}
else {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
grad[ii] += alpha * gradStep[ii];
}
}
// Project the current gradient without preconditioning.
offset = 0.0;
if (conp.useChargeConstraint) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
offset += grad[ii];
}
offset /= numElectrodeParticles;
}
for (int ii = 0; ii < numElectrodeParticles; ii++) {
projGrad[ii] = grad[ii] - offset;
}
// Check for convergence.
error = 0.0f;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
error += projGrad[ii] * projGrad[ii];
}
if (error <= errorTarget) {
converged = true;
break;
}
// Project the current gradient with preconditioning.
if (precondActivated) {
offset = 0.0;
if (conp.useChargeConstraint) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
offset += precondVector[ii] * grad[ii];
}
}
for (int ii = 0; ii < numElectrodeParticles; ii++) {
precGrad[ii] = precondVector[ii] * (grad[ii] - offset);
}
}
else {
precGrad.assign(projGrad.begin(), projGrad.end());
}
// Evaluate the conjugate gradient parameter beta.
beta = 0.0f;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
beta += precGrad[ii] * gradStep[ii];
}
beta *= paramScale;
// Update the step vector.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
qStep[ii] = beta * qStep[ii] - precGrad[ii];
}
if (conp.useChargeConstraint) {
// Project out any deviation off of the constraint plane from
// the step vector. This would be zero in exact arithmetic, but
// error can accumulate over time in finite precision.
offset = 0.0;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
offset += qStep[ii];
}
offset /= numElectrodeParticles;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
qStep[ii] -= offset;
}
}
}
if (!converged) {
throw OpenMMException("Constant potential conjugate gradient iterations not converged");
}
// Store the final charges in posq.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
conp.posq[4 * conp.elecToSys[ii] + 3] = q[ii];
}
}
void CpuConstantPotentialCGSolver::ensureValid(CpuConstantPotentialForce& conp, ThreadPool& threads, Kernel& pmeKernel) {
// Initializes or updates information for a preconditioner for the conjugate
// gradient method if this is its first use or electrode parameters have
// changed since its initialization.
// No action is required if the box vectors have not changed.
if (valid && boxVectors[0] == conp.boxVectors[0] && boxVectors[1] == conp.boxVectors[1] && boxVectors[2] == conp.boxVectors[2]) {
return;
}
valid = true;
boxVectors[0] = conp.boxVectors[0];
boxVectors[1] = conp.boxVectors[1];
boxVectors[2] = conp.boxVectors[2];
precondActivated = false;
if (precondRequested && numElectrodeParticles) {
// If electrode self-energy contributions differ between electrodes,
// a preconditioner may help convergence; otherwise, it provides no
// benefit and may slow convergence due to roundoff error.
for (int ie = 1; ie < conp.numElectrodes; ie++) {
// Note electrodeSelfScales[1] has the scale for electrode 0, etc.
if (conp.electrodeSelfScales[ie + 1] != conp.electrodeSelfScales[ie]) {
precondActivated = true;
break;
}
}
}
if (precondActivated) {
// Save the position of the first electrode particle.
int i0 = conp.elecToSys[0];
float x0 = conp.posq[4 * i0];
float y0 = conp.posq[4 * i0 + 1];
float z0 = conp.posq[4 * i0 + 2];
// Save the charges of all particles, and then zero them.
vector<float> qSave(numParticles);
for (int i = 0; i < numParticles; i++) {
qSave[i] = conp.posq[4 * i + 3];
conp.posq[4 * i + 3] = 0.0f;
}
// Place a unit charge at the origin.
conp.posq[4 * i0] = 0.0f;
conp.posq[4 * i0 + 1] = 0.0f;
conp.posq[4 * i0 + 2] = 0.0f;
conp.posq[4 * i0 + 3] = 1.0f;
// Perform a reference PME calculation with a single charge at the
// origin to find the constant offset on the preconditioner diagonal due
// to the PME calculation. This will actually vary slightly with
// position but only due to finite accuracy of the PME splines, so it is
// fine to assume it will be constant for the preconditioner.
vector<float> derivatives(numElectrodeParticles);
CpuConstantPotentialPmeIO io(conp.posq, NULL, &derivatives[0], numParticles, numElectrodeParticles);
pmeKernel.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, false, false, true);
pmeKernel.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
float pmeTerm = derivatives[0];
// Restore particle positions and charges.
conp.posq[4 * i0] = x0;
conp.posq[4 * i0 + 1] = y0;
conp.posq[4 * i0 + 2] = z0;
for (int i = 0; i < numParticles; i++) {
conp.posq[4 * i + 3] = qSave[i];
}
// The diagonal has a contribution from reciprocal space, Ewald
// self-interaction, Ewald neutralizing plasma, Gaussian self-interaction,
// and Thomas-Fermi contributions.
double precondScaleInv = 0.0;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
precondVector[ii] = 1.0 / (2.0 * (conp.electrodeSelfScales[conp.elecElec[ii] + 1] - conp.plasmaScale) + pmeTerm);
precondScaleInv += precondVector[ii];
}
double precondScale = 1.0 / precondScaleInv;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
precondVector[ii] *= precondScale;
}
}
}
const int CpuConstantPotentialForce::PotentialIndex = 0;
const int CpuConstantPotentialForce::GaussianWidthIndex = 1;
const int CpuConstantPotentialForce::ThomasFermiScaleIndex = 2;
const int CpuConstantPotentialForce::NUM_TABLE_POINTS = 2048;
const double CpuConstantPotentialForce::TWO_OVER_SQRT_PI = 2.0 / sqrt(PI_M);
const double CpuConstantPotentialForce::SELF_ALPHA_SCALE = ONE_4PI_EPS0 / sqrt(PI_M);
const double CpuConstantPotentialForce::SELF_ETA_SCALE = ONE_4PI_EPS0 / sqrt(2.0 * PI_M);
const double CpuConstantPotentialForce::SELF_TF_SCALE = 1.0 / (2.0 * EPSILON0);
CpuConstantPotentialForce::CpuConstantPotentialForce() {
}
CpuConstantPotentialForce::~CpuConstantPotentialForce() {
}
void CpuConstantPotentialForce::initialize(
int numParticles,
int numElectrodeParticles,
int posqIndex,
float nonbondedCutoff,
float ewaldAlpha,
float cgErrorTol,
const int* gridSize,
bool exceptionsArePeriodic,
bool useChargeConstraint,
const CpuNeighborList& neighborList,
CpuConstantPotentialSolver* solver,
const vector<set<int> >& exclusions,
const vector<int>& sysToElec,
const vector<int>& elecToSys,
const vector<int>& sysElec,
const vector<int>& elecElec,
const vector<array<double, 3> >& electrodeParams,
float chargeTarget,
const float* externalField
) {
this->numParticles = numParticles;
this->numElectrodeParticles = numElectrodeParticles;
this->posqIndex = posqIndex;
this->nonbondedCutoff = nonbondedCutoff;
this->ewaldAlpha = ewaldAlpha;
this->cgErrorTol = cgErrorTol;
this->gridSize[0] = gridSize[0];
this->gridSize[1] = gridSize[1];
this->gridSize[2] = gridSize[2];
this->exceptionsArePeriodic = exceptionsArePeriodic;
this->useChargeConstraint = useChargeConstraint;
this->neighborList = &neighborList;
this->solver = solver;
// We store pointers to exclusions, sysToElec, elecToSys, sysElec, and
// elecElec, which should not change, and to electrodeParams, whose values
// might be updated. If this happens, update() will be called with the
// electrode indices that have new parameters.
this->exclusions = &exclusions[0];
this->sysToElec = &sysToElec[0];
this->elecToSys = &elecToSys[0];
this->sysElec = &sysElec[0];
this->elecElec = &elecElec[0];
this->electrodeParams = &electrodeParams[0];
numElectrodes = electrodeParams.size();
electrodePotentials.resize(numElectrodes + 1);
electrodeSelfScales.resize(numElectrodes + 1);
for (int ie = -1; ie < numElectrodes; ie++) {
// Electrode indices of particles range from -1 to numElectrodes - 1,
// with -1 indicating a non-electrode particle. Precompute electrode
// parameters, storing values for [-1, numElectrodes - 1] as [0,
// numElectrodes] for convenient lookup.
double electrodePotential = 0.0;
double electrodeSelfScale = -SELF_ALPHA_SCALE * ewaldAlpha;
if (ie != -1) {
electrodePotential = electrodeParams[ie][PotentialIndex];
electrodeSelfScale += SELF_ETA_SCALE / electrodeParams[ie][GaussianWidthIndex] + SELF_TF_SCALE * electrodeParams[ie][ThomasFermiScaleIndex];
}
electrodePotentials[ie + 1] = (float) electrodePotential;
electrodeSelfScales[ie + 1] = (float) electrodeSelfScale;
}
this->chargeTarget = chargeTarget;
this->externalField = fvec4(externalField[0], externalField[1], externalField[2], 0.0f);
chargeDerivatives.resize(numElectrodeParticles);
energyLookupTable.resize((numElectrodes + 1) * (numElectrodes + 1) * (NUM_TABLE_POINTS + 4));
forceLookupTable.resize((numElectrodes + 1) * (numElectrodes + 1) * (NUM_TABLE_POINTS + 4));
tableScale = NUM_TABLE_POINTS / nonbondedCutoff;
for (int ie = -1; ie < numElectrodes; ie++) {
for (int je = -1; je <= ie; je++) {
initializeLookupTables(ie, je);
}
}
}
void CpuConstantPotentialForce::update(float chargeTarget, const float* externalField, int firstElectrode, int lastElectrode) {
solver->discardSavedSolution();
if (firstElectrode <= lastElectrode) {
solver->invalidate();
}
for (int ie = firstElectrode; ie <= lastElectrode; ie++) {
electrodePotentials[ie + 1] = (float) electrodeParams[ie][PotentialIndex];
electrodeSelfScales[ie + 1] = (float) (-SELF_ALPHA_SCALE * ewaldAlpha + SELF_ETA_SCALE / electrodeParams[ie][GaussianWidthIndex] + SELF_TF_SCALE * electrodeParams[ie][ThomasFermiScaleIndex]);
}
this->chargeTarget = chargeTarget;
this->externalField = fvec4(externalField[0], externalField[1], externalField[2], 0.0f);
for (int ie = firstElectrode; ie <= lastElectrode; ie++) {
for (int je = -1; je < numElectrodes; je++) {
if (je >= firstElectrode && je <= lastElectrode && je > ie) {
continue;
}
initializeLookupTables(ie, je);
}
}
}
void CpuConstantPotentialForce::execute(
const Vec3* boxVectors,
const vector<Vec3>& posData,
vector<float>& charges,
float* posq,
vector<AlignedArray<float> >& threadForce,
double* energy,
ThreadPool& threads,
Kernel& pmeKernel
) {
setThreadData(boxVectors, posData, posq, threadForce);
if (numElectrodeParticles) {
solver->solve(*this, threads, pmeKernel);
}
getEnergyForces(threads, pmeKernel, energy);
saveCharges(charges);
}
void CpuConstantPotentialForce::getCharges(
const Vec3* boxVectors,
const vector<Vec3>& posData,
vector<float>& charges,
float* posq,
vector<AlignedArray<float> >& threadForce,
ThreadPool& threads,
Kernel& pmeKernel
) {
setThreadData(boxVectors, posData, posq, threadForce);
if (numElectrodeParticles) {
solver->solve(*this, threads, pmeKernel);
}
saveCharges(charges);
}
void CpuConstantPotentialForce::setThreadData(const Vec3* boxVectors, const vector<Vec3>& posData, float* posq, vector<AlignedArray<float> >& threadForce) {
this->boxVectors[0] = boxVectors[0];
this->boxVectors[1] = boxVectors[1];
this->boxVectors[2] = boxVectors[2];
boxVectorsVec4.resize(3);
boxVectorsVec4[0] = fvec4((float) boxVectors[0][0], (float) boxVectors[0][1], (float) boxVectors[0][2], 0.0f);
boxVectorsVec4[1] = fvec4((float) boxVectors[1][0], (float) boxVectors[1][1], (float) boxVectors[1][2], 0.0f);
boxVectorsVec4[2] = fvec4((float) boxVectors[2][0], (float) boxVectors[2][1], (float) boxVectors[2][2], 0.0f);
boxSize = fvec4(boxVectorsVec4[0][0], boxVectorsVec4[1][1], boxVectorsVec4[2][2], 0.0f);
recipBoxSize = fvec4((float) (1.0 / boxVectors[0][0]), (float) (1.0 / boxVectors[1][1]), (float) (1.0 / boxVectors[2][2]), 0.0f);
triclinic = boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0;
plasmaScale = (float) (1.0 / (8.0 * EPSILON0 * boxVectors[0][0] * boxVectors[1][1] * boxVectors[2][2] * ewaldAlpha * ewaldAlpha));
this->posData = &posData[0];
this->posq = posq;
this->threadForce = &threadForce;
}
void CpuConstantPotentialForce::saveCharges(vector<float>& charges) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
int i = elecToSys[ii];
charges[i] = posq[4 * i + 3];
}
}
void CpuConstantPotentialForce::getEnergyForces(ThreadPool& threads, Kernel& pmeKernel, double* energy) {
// Everything except reciprocal space.
int numThreads = threads.getNumThreads();
threadEnergy.resize(numThreads);
atomicBlockCounter = 0;
atomicParticleCounter = 0;
threads.execute([&] (ThreadPool& threads, int threadIndex) { getEnergyForcesThread(threads, threadIndex, energy != NULL); });
threads.waitForThreads();
// Accumulate thread energies.
if (energy != NULL) {
double energyAccum = 0.0;
for (int i = 0; i < numThreads; i++) {
energyAccum += threadEnergy[i];
}
*energy += energyAccum;
}
// Reciprocal space.
CpuConstantPotentialPmeIO io(posq, &(*threadForce)[0][0], NULL, numParticles, numElectrodeParticles);
pmeKernel.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, energy != NULL, true, false);
double pmeEnergy = pmeKernel.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
if (energy != NULL) {
// Ewald neutralizing plasma.
float qTotal = 0.0;
for (int i = 0; i < numParticles; i++) {
qTotal += posq[4 * i + 3];
}
pmeEnergy -= plasmaScale * qTotal * qTotal;
// Accumulate energy.
*energy += pmeEnergy;
}
}
void CpuConstantPotentialForce::getEnergyForcesThread(ThreadPool& threads, int threadIndex, bool includeEnergy) {
int numThreads = threads.getNumThreads();
int groupSize = max(1, numParticles / (10 * numThreads));
float* forces = &(*threadForce)[threadIndex][0];
if (includeEnergy) {
threadEnergy[threadIndex] = 0.0;
}
// Direct space.
while (true) {
int nextBlock = atomicBlockCounter++;
if (nextBlock >= neighborList->getNumBlocks()) {
break;
}
getEnergyForcesBlock(nextBlock, forces, includeEnergy ? &threadEnergy[threadIndex] : NULL);
}
while (true) {
int start = atomicParticleCounter.fetch_add(groupSize);
if (start >= numParticles) {
break;
}
int end = min(start + groupSize, numParticles);
for (int i = start; i < end; i++) {
fvec4 posI((float) posData[i][0], (float) posData[i][1], (float) posData[i][2], 0.0f);
float chargeI = posq[4 * i + 3];
float chargeIScaled = ONE_4PI_EPS0 * chargeI;
// Exceptions.
for (int j : exclusions[i]) {
if (j <= i) {
continue;
}
fvec4 posJ((float) posData[j][0], (float) posData[j][1], (float) posData[j][2], 0.0f);
float chargeJ = posq[4 * j + 3];
fvec4 deltaR;
float r2;
getExceptionDeltaR(posI, posJ, deltaR, r2);
float r = sqrt(r2);
float inverseR = 1.0f / r;
float qqFactor = chargeIScaled * chargeJ;
float alphaR = ewaldAlpha * r;
if (alphaR > 1e-6) {
float erfAlphaR = erf(alphaR);
float qqFactorInverseR = qqFactor * inverseR;
float forceFactor = qqFactorInverseR * inverseR * inverseR * (erfAlphaR - (float) TWO_OVER_SQRT_PI * alphaR * exp(-alphaR * alphaR));
fvec4 force = forceFactor * deltaR;
(fvec4(forces + 4 * i) + force).store(forces + 4 * i);
(fvec4(forces + 4 * j) - force).store(forces + 4 * j);
if (includeEnergy) {
threadEnergy[threadIndex] -= qqFactorInverseR * erfAlphaR;
}
}
else if (includeEnergy) {
threadEnergy[threadIndex] -= qqFactor * (float) TWO_OVER_SQRT_PI * ewaldAlpha;
}
}
if (includeEnergy) {
// Ewald self-interaction and external field contributions (all
// particles), along with Gaussian self-interaction, potential,
// and Thomas-Fermi contributions (electrode particles only).
int iePlus1 = sysElec[i] + 1;
threadEnergy[threadIndex] += chargeI * (chargeI * electrodeSelfScales[iePlus1] - electrodePotentials[iePlus1] - dot3(posI, externalField));
}
// External field force.
(fvec4(forces + 4 * i) + chargeI * externalField).store(forces + 4 * i);
}
}
}
void CpuConstantPotentialForce::getDerivatives(ThreadPool& threads, Kernel& pmeKernel) {
// Ewald neutralizing plasma.
float qTotal = 0.0f;
for (int i = 0; i < numParticles; i++) {
qTotal += posq[4 * i + 3];
}
float plasmaTerm = -2.0f * plasmaScale * qTotal;
// Everything except reciprocal space (but including neutralizing plasma).
int numThreads = threads.getNumThreads();
atomicBlockCounter = 0;
threads.execute([&] (ThreadPool& threads, int threadIndex) { getDerivativesThread(threads, threadIndex, plasmaTerm); });
threads.waitForThreads();
// Accumulate thread derivatives.
for (int ii = 0; ii < numElectrodeParticles; ii++) {
chargeDerivatives[ii] = 0.0f;
for (int j = 0; j < numThreads; j++) {
chargeDerivatives[ii] += (*threadForce)[j][4 * ii + 3];
}
}
// Reciprocal space.
CpuConstantPotentialPmeIO io(posq, NULL, &chargeDerivatives[0], numParticles, numElectrodeParticles);
pmeKernel.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, false, false, true);
pmeKernel.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
void CpuConstantPotentialForce::getDerivativesThread(
ThreadPool& threads,
int threadIndex,
float plasmaTerm
) {
int numThreads = threads.getNumThreads();
// Zero derivatives for all electrode particles for this thread. Use the
// fourth component of the threadForce array for the derivatives.
float* derivatives = &(*threadForce)[threadIndex][3];
for (int ii = 0; ii < numElectrodeParticles; ii++) {
derivatives[4 * ii] = 0.0f;
}
// Direct space.
while (true) {
int nextBlock = atomicBlockCounter++;
if (nextBlock >= neighborList->getNumBlocks()) {
break;
}
getDerivativesBlock(nextBlock, derivatives);
}
int start = threadIndex * numElectrodeParticles / numThreads;
int end = (threadIndex + 1) * numElectrodeParticles / numThreads;
// Ewald self-interaction and external field contributions, along with
// Gaussian self-interaction, potential, and Thomas-Fermi contributions.
for (int ii = start; ii < end; ii++) {
int i = elecToSys[ii];
int iePlus1 = elecElec[ii] + 1;
fvec4 posI((float) posData[i][0], (float) posData[i][1], (float) posData[i][2], 0.0f);
derivatives[4 * ii] += 2.0f * posq[4 * i + 3] * electrodeSelfScales[iePlus1] - electrodePotentials[iePlus1] - dot3(posI, externalField) + plasmaTerm;
}
}
void CpuConstantPotentialForce::getExceptionDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2) const {
deltaR = posJ - posI;
if (exceptionsArePeriodic) {
if (triclinic) {
deltaR -= boxVectorsVec4[2] * floorf(deltaR[2] * recipBoxSize[2] + 0.5f);
deltaR -= boxVectorsVec4[1] * floorf(deltaR[1] * recipBoxSize[1] + 0.5f);
deltaR -= boxVectorsVec4[0] * floorf(deltaR[0] * recipBoxSize[0] + 0.5f);
}
else {
fvec4 base = round(deltaR * recipBoxSize) * boxSize;
deltaR = deltaR - base;
}
}
r2 = dot3(deltaR, deltaR);
}
void CpuConstantPotentialForce::initializeLookupTables(int ie, int je) {
float * energyTableIJ = &energyLookupTable[(((ie + 1) * (numElectrodes + 1)) + je + 1) * (NUM_TABLE_POINTS + 4)];
float * forceTableIJ = &forceLookupTable[(((ie + 1) * (numElectrodes + 1)) + je + 1) * (NUM_TABLE_POINTS + 4)];
bool ieIsElectrode = ie != -1;
bool jeIsElectrode = je != -1;
bool involvesElectrode = ieIsElectrode || jeIsElectrode;
double iWidth = ieIsElectrode ? electrodeParams[ie][GaussianWidthIndex] : 0.0;
double jWidth = jeIsElectrode ? electrodeParams[je][GaussianWidthIndex] : 0.0;
double eta = involvesElectrode ? 1.0 / sqrt(iWidth * iWidth + jWidth * jWidth) : 0.0;
for (int iTable = 0; iTable < NUM_TABLE_POINTS + 4; iTable++) {
double r = (nonbondedCutoff * iTable) / NUM_TABLE_POINTS;
double alphaR = ewaldAlpha * r;
double etaR = eta * r;
energyTableIJ[iTable] = erfc(alphaR) - (involvesElectrode ? erfc(etaR) : 0.0);
forceTableIJ[iTable] = energyTableIJ[iTable] + TWO_OVER_SQRT_PI * (alphaR * exp(-alphaR * alphaR) - (involvesElectrode ? etaR * exp(-etaR * etaR) : 0.0));
}
if (ie != je) {
float * energyTableJI = &energyLookupTable[(((je + 1) * (numElectrodes + 1)) + ie + 1) * (NUM_TABLE_POINTS + 4)];
float * forceTableJI = &forceLookupTable[(((je + 1) * (numElectrodes + 1)) + ie + 1) * (NUM_TABLE_POINTS + 4)];
for (int iTable = 0; iTable < NUM_TABLE_POINTS + 4; iTable++) {
energyTableJI[iTable] = energyTableIJ[iTable];
forceTableJI[iTable] = forceTableIJ[iTable];
}
}
}
CpuConstantPotentialPmeIO::CpuConstantPotentialPmeIO(float* posq, float* force, float* chargeDerivatives, int numParticles, int numElectrodeParticles) :
posq(posq), force(force), chargeDerivatives(chargeDerivatives), numParticles(numParticles), numElectrodeParticles(numElectrodeParticles) {
}
float* CpuConstantPotentialPmeIO::getPosq() {
return posq;
}
void CpuConstantPotentialPmeIO::setForce(float* f) {
for (int i = 0; i < numParticles; i++) {
force[4 * i] += f[4 * i];
force[4 * i + 1] += f[4 * i + 1];
force[4 * i + 2] += f[4 * i + 2];
}
}
void CpuConstantPotentialPmeIO::setChargeDerivatives(float* derivatives) {
for (int ii = 0; ii < numElectrodeParticles; ii++) {
chargeDerivatives[ii] += derivatives[ii];
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuConstantPotentialForceFvec.h"
#include "CpuNeighborList.h"
#include "openmm/OpenMMException.h"
#ifdef __AVX__
#include "openmm/internal/vectorizeAvx.h"
OpenMM::CpuConstantPotentialForce* createCpuConstantPotentialForceAvx() {
return new OpenMM::CpuConstantPotentialForceFvec<fvec8, ivec8>();
}
#else
OpenMM::CpuConstantPotentialForce* createCpuConstantPotentialForceAvx() {
throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX support");
}
#endif
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuConstantPotentialForceFvec.h"
#include "CpuNeighborList.h"
#include "openmm/OpenMMException.h"
#ifdef __AVX2__
#include "openmm/internal/vectorizeAvx2.h"
OpenMM::CpuConstantPotentialForce* createCpuConstantPotentialForceAvx2() {
return new OpenMM::CpuConstantPotentialForceFvec<fvecAvx2, ivec8>();
}
#else
OpenMM::CpuConstantPotentialForce* createCpuConstantPotentialForceAvx2() {
throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX2 support");
}
#endif
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuConstantPotentialForceFvec.h"
#include "CpuNeighborList.h"
#include "openmm/internal/hardware.h"
using namespace OpenMM;
CpuConstantPotentialForce* createCpuConstantPotentialForceVec4();
CpuConstantPotentialForce* createCpuConstantPotentialForceAvx();
CpuConstantPotentialForce* createCpuConstantPotentialForceAvx2();
CpuConstantPotentialForce* createCpuConstantPotentialForceVec() {
if (isAvx2Supported())
return createCpuConstantPotentialForceAvx2();
else if (isAvxSupported())
return createCpuConstantPotentialForceAvx();
else
return createCpuConstantPotentialForceVec4();
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2025 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuConstantPotentialForceFvec.h"
#include "CpuNeighborList.h"
#include "openmm/internal/vectorize.h"
OpenMM::CpuConstantPotentialForce* createCpuConstantPotentialForceVec4() {
return new OpenMM::CpuConstantPotentialForceFvec<fvec4, ivec4>();
}
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013-2024 Stanford University and the Authors. * * Portions copyright (c) 2013-2025 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -52,6 +52,8 @@ KernelImpl* CpuKernelFactory::createKernelImpl(std::string name, const Platform& ...@@ -52,6 +52,8 @@ KernelImpl* CpuKernelFactory::createKernelImpl(std::string name, const Platform&
return new CpuCalcRBTorsionForceKernel(name, platform, data); return new CpuCalcRBTorsionForceKernel(name, platform, data);
if (name == CalcNonbondedForceKernel::Name()) if (name == CalcNonbondedForceKernel::Name())
return new CpuCalcNonbondedForceKernel(name, platform, data); return new CpuCalcNonbondedForceKernel(name, platform, data);
if (name == CalcConstantPotentialForceKernel::Name())
return new CpuCalcConstantPotentialForceKernel(name, platform, data);
if (name == CalcCustomNonbondedForceKernel::Name()) if (name == CalcCustomNonbondedForceKernel::Name())
return new CpuCalcCustomNonbondedForceKernel(name, platform, data); return new CpuCalcCustomNonbondedForceKernel(name, platform, data);
if (name == CalcCustomManyParticleForceKernel::Name()) if (name == CalcCustomManyParticleForceKernel::Name())
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include "CpuKernels.h" #include "CpuKernels.h"
#include "ReferenceAngleBondIxn.h" #include "ReferenceAngleBondIxn.h"
#include "ReferenceBondForce.h" #include "ReferenceBondForce.h"
#include "ReferenceConstantPotential14.h"
#include "ReferenceConstraints.h" #include "ReferenceConstraints.h"
#include "ReferenceKernelFactory.h" #include "ReferenceKernelFactory.h"
#include "ReferenceKernels.h" #include "ReferenceKernels.h"
...@@ -44,6 +45,7 @@ ...@@ -44,6 +45,7 @@
#include "openmm/Vec3.h" #include "openmm/Vec3.h"
#include "openmm/internal/ContextImpl.h" #include "openmm/internal/ContextImpl.h"
#include "openmm/internal/NonbondedForceImpl.h" #include "openmm/internal/NonbondedForceImpl.h"
#include "openmm/internal/ConstantPotentialForceImpl.h"
#include "openmm/internal/vectorize.h" #include "openmm/internal/vectorize.h"
#include "lepton/CompiledExpression.h" #include "lepton/CompiledExpression.h"
#include "lepton/CustomFunction.h" #include "lepton/CustomFunction.h"
...@@ -482,6 +484,8 @@ public: ...@@ -482,6 +484,8 @@ public:
force[4*i+2] += f[4*i+2]; force[4*i+2] += f[4*i+2];
} }
} }
void setChargeDerivatives(float* derivatives) {
}
private: private:
float* posq; float* posq;
float* force; float* force;
...@@ -646,7 +650,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -646,7 +650,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
useOptimizedPme = getPlatform().supportsKernels(kernelNames); useOptimizedPme = getPlatform().supportsKernels(kernelNames);
if (useOptimizedPme) { if (useOptimizedPme) {
optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context); optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha, data.deterministicForces); optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, {}, ewaldAlpha, data.deterministicForces);
} }
} }
if (nonbondedMethod == LJPME) { if (nonbondedMethod == LJPME) {
...@@ -658,7 +662,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -658,7 +662,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
useOptimizedPme = getPlatform().supportsKernels(kernelNames); useOptimizedPme = getPlatform().supportsKernels(kernelNames);
if (useOptimizedPme) { if (useOptimizedPme) {
optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context); optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha, data.deterministicForces); optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, {}, ewaldAlpha, data.deterministicForces);
optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context); optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1], optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
dispersionGridSize[2], numParticles, ewaldDispersionAlpha, data.deterministicForces); dispersionGridSize[2], numParticles, ewaldDispersionAlpha, data.deterministicForces);
...@@ -702,7 +706,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -702,7 +706,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
if (useOptimizedPme) { if (useOptimizedPme) {
PmeIO io(&posq[0], &data.threadForce[0][0], numParticles); PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
Vec3 periodicBoxVectors[3] = {boxVectors[0], boxVectors[1], boxVectors[2]}; Vec3 periodicBoxVectors[3] = {boxVectors[0], boxVectors[1], boxVectors[2]};
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxVectors, includeEnergy); optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxVectors, includeEnergy, true, false);
nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io); nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
if (nonbondedMethod == LJPME) { if (nonbondedMethod == LJPME) {
copyChargesToPosq(context, C6params, ljPosqIndex); copyChargesToPosq(context, C6params, ljPosqIndex);
...@@ -898,6 +902,294 @@ void CpuCalcNonbondedForceKernel::computeParameters(ContextImpl& context, bool o ...@@ -898,6 +902,294 @@ void CpuCalcNonbondedForceKernel::computeParameters(ContextImpl& context, bool o
} }
} }
CpuConstantPotentialForce* createCpuConstantPotentialForceVec();
CpuCalcConstantPotentialForceKernel::CpuCalcConstantPotentialForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcConstantPotentialForceKernel(name, platform),
data(data), hasInitializedPme(false), constantPotential(NULL), solver(NULL) {
}
CpuCalcConstantPotentialForceKernel::~CpuCalcConstantPotentialForceKernel() {
if (constantPotential != NULL) {
delete constantPotential;
}
if (solver != NULL) {
delete solver;
}
}
void CpuCalcConstantPotentialForceKernel::initialize(const System& system, const ConstantPotentialForce& force) {
chargePosqIndex = data.requestPosqIndex();
// Get particle parameters.
numParticles = force.getNumParticles();
setCharges.resize(numParticles);
for (int i = 0; i < numParticles; i++) {
force.getParticleParameters(i, setCharges[i]);
}
// Get "1-4" exceptions (those that don't zero the charge product).
exclusions.resize(numParticles);
vector<int> nb14s;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd;
force.getExceptionParameters(i, particle1, particle2, chargeProd);
exclusions[particle1].insert(particle2);
exclusions[particle2].insert(particle1);
if (chargeProd != 0.0) {
nb14Index[i] = nb14s.size();
nb14s.push_back(i);
}
}
// Get exception parameters.
num14 = nb14s.size();
bonded14ParamArray.resize(num14, vector<double>(1));
bonded14IndexArray.resize(num14, vector<int>(2));
for (int i = 0; i < num14; ++i) {
int particle1, particle2;
force.getExceptionParameters(nb14s[i], particle1, particle2, bonded14ParamArray[i][0]);
bonded14IndexArray[i][0] = particle1;
bonded14IndexArray[i][1] = particle2;
}
bondForce.initialize(numParticles, num14, 2, bonded14IndexArray, data.threads);
// Get electrode parameters. sysToElec will be a map from system particle
// indices to electrode particle indices (or -1 if the particle is not an
// electrode particle), while elecToSys will be a map from electrode
// particle indices to system particle indices. sysElec will be a map from
// system particle indices to electrode indices (or -1 if the particle is
// not an electrode particle), while elecElec will be a map from electrode
// particle indices to electrode indices.
sysToElec.resize(numParticles, -1);
sysElec.resize(numParticles, -1);
electrodeParams.resize(force.getNumElectrodes());
for (int ie = 0; ie < force.getNumElectrodes(); ie++) {
std::set<int> electrodeParticles;
double potential;
double gaussianWidth;
double thomasFermiScale;
force.getElectrodeParameters(ie, electrodeParticles, potential, gaussianWidth, thomasFermiScale);
for (int i : electrodeParticles) {
sysToElec[i] = elecToSys.size();
sysElec[i] = ie;
elecToSys.push_back(i);
elecElec.push_back(ie);
}
electrodeParams[ie] = {potential, gaussianWidth, thomasFermiScale};
}
// Clear charges on electrode particles.
numElectrodeParticles = elecToSys.size();
for (int ii = 0; ii < numElectrodeParticles; ii++) {
setCharges[elecToSys[ii]] = 0.0;
}
// Initialize single-precision charge vector with initial guesses.
charges.resize(numParticles);
for (int i = 0; i < numParticles; i++) {
charges[i] = (float) setCharges[i];
}
// Set options from force.
nonbondedCutoff = force.getCutoffDistance();
data.requestNeighborList(nonbondedCutoff, 0.25 * nonbondedCutoff, true, exclusions);
ConstantPotentialForceImpl::calcPMEParameters(system, force, ewaldAlpha, gridSize[0], gridSize[1], gridSize[2]);
exceptionsArePeriodic = force.getExceptionsUsePeriodicBoundaryConditions();
cgErrorTol = force.getCGErrorTolerance();
useChargeConstraint = force.getUseChargeConstraint();
chargeTarget = force.getChargeConstraintTarget();
force.getExternalField(externalField);
data.isPeriodic = true;
// Set the charge target to be that on the electrode particles (so that the
// overall charge is constrained correctly if the non-electrolyte particles
// are non-neutral).
for (int i = 0; i < numParticles; i++) {
if (sysElec[i] == -1) {
chargeTarget -= setCharges[i];
}
}
// Set up constant potential.
ConstantPotentialForce::ConstantPotentialMethod method = force.getConstantPotentialMethod();
if (method == ConstantPotentialForce::Matrix) {
solver = new CpuConstantPotentialMatrixSolver(numParticles, numElectrodeParticles);
}
else if (method == ConstantPotentialForce::CG) {
solver = new CpuConstantPotentialCGSolver(numParticles, numElectrodeParticles, force.getUsePreconditioner());
}
else {
throw OpenMMException("internal error: invalid constant potential method");
}
constantPotential = createCpuConstantPotentialForceVec();
float externalFieldArray[3] = { (float) externalField[0], (float) externalField[1], (float) externalField[2] };
constantPotential->initialize(numParticles, numElectrodeParticles, chargePosqIndex, (float) nonbondedCutoff, (float) ewaldAlpha, (float) cgErrorTol,
gridSize, exceptionsArePeriodic, useChargeConstraint, *data.neighborList, solver, exclusions, sysToElec, elecToSys, sysElec, elecElec, electrodeParams, (float) chargeTarget, externalFieldArray);
}
double CpuCalcConstantPotentialForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
Vec3* boxVectors = extractBoxVectors(context);
vector<Vec3>& posData = extractPositions(context);
vector<Vec3>& forceData = extractForces(context);
double energy = 0.0;
checkBoxSize(boxVectors);
ensurePmeInitialized(context);
copyChargesToPosq(context, charges, chargePosqIndex);
constantPotential->execute(boxVectors, posData, charges, &data.posq[0], data.threadForce, includeEnergy ? &energy : NULL, data.threads, pmeKernel);
// Process non-zeroing exceptions. Since exceptions and electrodes should
// involve disjoint sets of atoms, this isn't required for the energy
// minimization with respect to the electrode atom charges.
ReferenceConstantPotential14 conp14;
if (exceptionsArePeriodic) {
conp14.setPeriodic(boxVectors);
}
bondForce.calculateForce(posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, conp14);
return energy;
}
void CpuCalcConstantPotentialForceKernel::copyParametersToContext(ContextImpl& context, const ConstantPotentialForce& force, int firstParticle, int lastParticle, int firstException, int lastException, int firstElectrode, int lastElectrode) {
// Get particle parameters.
if (force.getNumParticles() != numParticles) {
throw OpenMMException("updateParametersInContext: The number of particles has changed");
}
for (int i = firstParticle; i <= lastParticle; i++) {
// Only update charges on non-electrode particles; keep current guesses
// for electrode particles.
if (sysElec[i] == -1) {
force.getParticleParameters(i, setCharges[i]);
charges[i] = (float) setCharges[i];
}
}
if (firstParticle <= lastParticle) {
// Signal that charges in posq need to be updated.
chargePosqIndex = data.requestPosqIndex();
}
// Get "1-4" (non-zeroing) exceptions.
vector<int> nb14s;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd;
force.getExceptionParameters(i, particle1, particle2, chargeProd);
if (nb14Index.find(i) == nb14Index.end()) {
if (chargeProd != 0.0) {
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
}
}
else {
nb14s.push_back(i);
}
}
if (nb14s.size() != num14) {
throw OpenMMException("updateParametersInContext: The number of non-excluded exceptions has changed");
}
// Get exception parameters.
for (int i = 0; i < num14; i++) {
int particle1, particle2;
force.getExceptionParameters(nb14s[i], particle1, particle2, bonded14ParamArray[i][0]);
bonded14IndexArray[i][0] = particle1;
bonded14IndexArray[i][1] = particle2;
}
// Get electrode parameters.
std::set<int> allElectrodeParticles;
for (int ie = 0; ie < force.getNumElectrodes(); ie++) {
std::set<int> electrodeParticles;
double potential;
double gaussianWidth;
double thomasFermiScale;
force.getElectrodeParameters(ie, electrodeParticles, potential, gaussianWidth, thomasFermiScale);
for (int i : electrodeParticles) {
if (sysElec[i] != ie) {
throw OpenMMException("updateParametersInContext: The electrode assignment of a particle has changed");
}
allElectrodeParticles.insert(i);
}
electrodeParams[ie][0] = potential;
electrodeParams[ie][1] = gaussianWidth;
electrodeParams[ie][2] = thomasFermiScale;
}
if (allElectrodeParticles.size() != numElectrodeParticles) {
throw OpenMMException("updateParametersInContext: The electrode state of a particle has changed");
}
// Update external field.
force.getExternalField(externalField);
// Update charge target.
chargeTarget = force.getChargeConstraintTarget();
for (int i = 0; i < numParticles; i++) {
if (sysElec[i] == -1) {
chargeTarget -= setCharges[i];
}
}
float externalFieldArray[3] = { (float) externalField[0], (float) externalField[1], (float) externalField[2] };
constantPotential->update((float) chargeTarget, externalFieldArray, firstElectrode, lastElectrode);
}
void CpuCalcConstantPotentialForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (hasInitializedPme) {
pmeKernel.getAs<const CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
} else {
alpha = ewaldAlpha;
nx = gridSize[0];
ny = gridSize[1];
nz = gridSize[2];
}
}
void CpuCalcConstantPotentialForceKernel::getCharges(ContextImpl& context, std::vector<double>& chargesOut) {
// Make sure that positions in posq, and the current neighbor list, are up
// to date, but don't compute any energies or forces, and exclude all force
// groups so that no kernels will get executed.
context.calcForcesAndEnergy(false, false, 0);
Vec3* boxVectors = extractBoxVectors(context);
vector<Vec3>& posData = extractPositions(context);
checkBoxSize(boxVectors);
ensurePmeInitialized(context);
copyChargesToPosq(context, charges, chargePosqIndex);
constantPotential->getCharges(boxVectors, posData, charges, &data.posq[0], data.threadForce, data.threads, pmeKernel);
// Preserve fixed charges exactly (without single-precision rounding) and
// load solved values of fluctuating charges.
chargesOut = setCharges;
for (int ii = 0; ii < numElectrodeParticles; ii++) {
int i = elecToSys[ii];
chargesOut[i] = (double) charges[i];
}
}
void CpuCalcConstantPotentialForceKernel::checkBoxSize(const Vec3* boxVectors) {
double minAllowedSize = 1.999999*nonbondedCutoff;
if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize) {
throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
}
}
void CpuCalcConstantPotentialForceKernel::ensurePmeInitialized(ContextImpl& context) {
if (!hasInitializedPme) {
vector<string> kernelNames;
kernelNames.push_back("CalcPmeReciprocalForce");
if (!getPlatform().supportsKernels(kernelNames)) {
throw OpenMMException("ConstantPotentialForce unsupported on CPU platform without PME kernel");
}
pmeKernel = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
pmeKernel.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, elecToSys, ewaldAlpha, data.deterministicForces);
hasInitializedPme = true;
}
}
CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) :
CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) { CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) {
} }
......
...@@ -27,18 +27,13 @@ ...@@ -27,18 +27,13 @@
#include "openmm/OpenMMException.h" #include "openmm/OpenMMException.h"
#ifdef __AVX2__ #ifdef __AVX2__
#include "openmm/internal/vectorizeAvx2.h" #include "openmm/internal/vectorizeAvx2.h"
OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2(const OpenMM::CpuNeighborList& neighbors) { OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2(const OpenMM::CpuNeighborList& neighbors) {
return new OpenMM::CpuNonbondedForceFvec<fvecAvx2>(neighbors); return new OpenMM::CpuNonbondedForceFvec<fvecAvx2>(neighbors);
} }
#else #else
bool isAvx2Supported() {
return false;
}
OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2(const OpenMM::CpuNeighborList& neighbors) { OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2(const OpenMM::CpuNeighborList& neighbors) {
throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX2 support"); throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX2 support");
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment