Commit 8055a541 authored by Peter Eastman's avatar Peter Eastman
Browse files

Reduced memory use for exclusions. Also deleted some obsolete files.

parent b98859ec
......@@ -277,7 +277,6 @@ struct cudaGmxSimulation {
unsigned int stride2; // Atomic attributes stride x 2
unsigned int stride3; // Atomic attributes stride x 3
unsigned int stride4; // Atomic attributes stride x 4
unsigned int exclusionStride; // Exclusion list stride = stride / GRID
unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call
unsigned int totalNonbondOutputBuffers; // Total nonbond output buffers
unsigned int outputBuffers; // Number of output buffers
......@@ -357,6 +356,7 @@ struct cudaGmxSimulation {
int4* pSettleID; // Settle atoms
float2* pSettleParameter; // Settle parameters
unsigned int* pExclusion; // Nonbond exclusion data
unsigned int* pExclusionIndex; // Index of exclusion data for each work unit
unsigned int bond_offset; // Offset to end of bonds
unsigned int bond_angle_offset; // Offset to end of bond angles
unsigned int dihedral_offset; // Offset to end of dihedrals
......
......@@ -1172,7 +1172,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu->sim.stride2 = 2 * gpu->sim.stride;
gpu->sim.stride3 = 3 * gpu->sim.stride;
gpu->sim.stride4 = 4 * gpu->sim.stride;
gpu->sim.exclusionStride = gpu->sim.stride / GRID;
gpu->psPosqP4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pPosqP = gpu->psPosqP4->_pDevStream[0];
gpu->psOldPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
......@@ -1533,6 +1532,7 @@ void* gpuInit(int numAtoms)
gpu->psSettleID = NULL;
gpu->psSettleParameter = NULL;
gpu->psExclusion = NULL;
gpu->psExclusionIndex = NULL;
gpu->psWorkUnit = NULL;
gpu->psInteractingWorkUnit = NULL;
gpu->psInteractionFlag = NULL;
......@@ -1665,6 +1665,7 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psSettleID;
delete gpu->psSettleParameter;
delete gpu->psExclusion;
delete gpu->psExclusionIndex;
delete gpu->psWorkUnit;
delete gpu->psInteractingWorkUnit;
delete gpu->psInteractionFlag;
......@@ -1871,31 +1872,74 @@ void gpuBuildExclusionList(gpuContext gpu)
{
const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
const unsigned int grid = gpu->grid;
const unsigned int dim = (atoms+(grid-1))/grid;
CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>((atoms*atoms+grid-1) / grid, 1u);
gpu->psExclusion = psExclusion;
gpu->sim.pExclusion = psExclusion->_pDevStream[0];
unsigned int* pExList = psExclusion->_pSysStream[0];
const unsigned int dim = atoms/grid;
unsigned int* pWorkList = gpu->psWorkUnit->_pSysStream[0];
for (int i = 0; i < psExclusion->_length; ++i)
pExList[i] = 0xFFFFFFFF;
// Fill in the exclusions.
// Mark which work units have exclusions.
for (int atom1 = 0; atom1 < gpu->exclusions.size(); ++atom1)
{
int x = atom1/grid;
int offset = atom1-x*grid;
for (int j = 0; j < gpu->exclusions[atom1].size(); ++j)
{
int atom2 = gpu->exclusions[atom1][j];
int y = atom2/grid;
int index = x*atoms+y*grid+offset;
pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
pWorkList[cell] |= 1;
}
}
if (gpu->sim.paddedNumberOfAtoms > gpu->natoms)
{
int lastBlock = gpu->natoms/grid;
for (int i = 0; i < gpu->sim.workUnits; ++i)
{
int x = pWorkList[i]>>17;
int y = (pWorkList[i]>>2)&0x7FFF;
if (x == lastBlock || y == lastBlock)
pWorkList[i] |= 1;
}
}
// Build a list of indexes for the work units with exclusions.
CUDAStream<unsigned int>* psExclusionIndex = new CUDAStream<unsigned int>(gpu->sim.workUnits, 1u);
gpu->psExclusionIndex = psExclusionIndex;
unsigned int* pExclusionIndex = psExclusionIndex->_pSysData;
gpu->sim.pExclusionIndex = psExclusionIndex->_pDevData;
int numWithExclusions = 0;
for (int i = 0; i < psExclusionIndex->_length; ++i)
if ((pWorkList[i]&1) == 1)
pExclusionIndex[i] = (numWithExclusions++)*grid;
// Record the exclusion data.
CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(numWithExclusions*grid, 1u);
gpu->psExclusion = psExclusion;
unsigned int* pExclusion = psExclusion->_pSysData;
gpu->sim.pExclusion = psExclusion->_pDevData;
for (int i = 0; i < psExclusion->_length; ++i)
pExclusion[i] = 0xFFFFFFFF;
for (int atom1 = 0; atom1 < gpu->exclusions.size(); ++atom1)
{
int x = atom1/grid;
int offset1 = atom1-x*grid;
for (int j = 0; j < gpu->exclusions[atom1].size(); ++j)
{
int atom2 = gpu->exclusions[atom1][j];
int y = atom2/grid;
int offset2 = atom2-y*grid;
if (x > y)
{
int cell = x+y*dim-y*(y+1)/2;
pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
else
{
int cell = y+x*dim-x*(x+1)/2;
pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
// Mark all interactions that involve a padding atom as being excluded.
......@@ -1907,16 +1951,22 @@ void gpuBuildExclusionList(gpuContext gpu)
{
int y = atom2/grid;
int index = x*atoms+y*grid+offset1;
pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
int offset2 = atom2-y*grid;
index = y*atoms+x*grid+offset2;
pExList[index] &= 0xFFFFFFFF-(1<<(atom1-x*grid));
int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
pWorkList[cell] |= 1;
if (x >= y)
{
int cell = x+y*dim-y*(y+1)/2;
pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
if (y >= x)
{
int cell = y+x*dim-x*(x+1)/2;
pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
psExclusion->Upload();
psExclusionIndex->Upload();
gpu->psWorkUnit->Upload();
gpuSetConstants(gpu);
}
......
......@@ -116,6 +116,7 @@ struct _gpuContext {
CUDAStream<int4>* psSettleID;
CUDAStream<float2>* psSettleParameter;
CUDAStream<unsigned int>* psExclusion;
CUDAStream<unsigned int>* psExclusionIndex;
CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<unsigned int>* psInteractingWorkUnit;
CUDAStream<unsigned int>* psInteractionFlag;
......
......@@ -126,7 +126,9 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
}
else // bExclusion
{
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
unsigned int xi = x>>GRIDBITS;
int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
......@@ -253,8 +255,11 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct Atom {
float x;
float y;
float z;
float q;
float sig;
float eps;
float fx;
float fy;
float fz;
};
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJForces_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCDLJForces_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateCDLJForces_12_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
pos -= cSim.nonbond_workBlock;
}
}
void kCalculateCDLJForces_12(gpuContext gpu)
{
// printf("kCalculateCDLJForces_12\n");
kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJForces_12");
}
......@@ -134,7 +134,9 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
}
else // bExclusion
{
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
unsigned int xi = x>>GRIDBITS;
int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
......@@ -299,8 +301,11 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
}
else // bExclusion
{
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct Atom {
float x;
float y;
float z;
float q;
float sig;
float eps;
float br;
float fx;
float fy;
float fz;
float fb;
};
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateCDLJObcGbsaForces1_12_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
float q2 = cSim.preFactor * apos.w;
apos.w *= cSim.epsfac;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
float q2 = apos.w * cSim.preFactor;
apos.w *= cSim.epsfac;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
float q2 = cSim.preFactor * apos.w;
apos.w *= cSim.epsfac;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
int j = y + tgx;
float q2 = cSim.preFactor * apos.w;
apos.w *= cSim.epsfac;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
excl = (excl >> tgx) | (excl << (GRID - tgx));
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
}
}
pos -= cSim.nonbond_workBlock;
}
}
void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu)
{
// printf("kCalculateCDLJObcGbsaForces1_12\n");
kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJObcGbsaForces1_12");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float sr2;
float fx;
float fy;
float fz;
float fb;
// float sum;
};
__shared__ Atom sA[GT2XX_BORNFORCE2_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateObcGbsaForces2_12_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// float sum = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
// sum += term;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float sr2 = a.y * a.y;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Interleaved Atom I and J Born Forces and sum components
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rScaledRadiusI = r + a.y;
float rInverse = 1.0f / r;
float l_ijJ = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float l_ijI = 1.0f / max(psA[tj].r, fabs(r - a.y));
float u_ijJ = 1.0f / rScaledRadiusJ;
float u_ijI = 1.0f / rScaledRadiusI;
float l_ij2J = l_ijJ * l_ijJ;
float l_ij2I = l_ijI * l_ijI;
float u_ij2J = u_ijJ * u_ijJ;
float u_ij2I = u_ijI * u_ijI;
float t1J = log (u_ijJ / l_ijJ);
float t1I = log (u_ijI / l_ijI);
float t2J = (l_ij2J - u_ij2J);
float t2I = (l_ij2I - u_ij2I);
float t3J = t2J * rInverse;
float t3I = t2I * rInverse;
t1J *= rInverse;
t1I *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3J +
0.250f * t1J * r2Inverse;
float dE = fb * term;
// Atom I Born sum term
// term = l_ijJ - u_ijJ +
// -0.25f * r * t2J +
// 0.50f * t1J +
// (0.25f * psA[tj].sr2) * t3J;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * (oneOverR - l_ijJ);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// sum += term;
// Atom J Born sum term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3I +
0.250f * t1I * r2Inverse;
dE = psA[tj].fb * term;
// term = l_ijI - u_ijI +
// -0.25f * r * t2I +
// 0.50f * t1I +
// (0.25f * sr2) * t3I;
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * ((1.0f / psA[tj].r) - l_ijI);
// }
if (psA[tj].r >= rScaledRadiusI)
{
dE = /*term =*/ 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// psA[tj].sum += term;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
pos -= cSim.bornForce2_workBlock;
}
}
void kCalculateObcGbsaForces2_12(gpuContext gpu)
{
// printf("kCalculateObcGbsaForces2_12\n");
kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
LAUNCHERROR("kCalculateObcGbsaForces2_12");
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment