Commit 2e451b9d authored by Peter Eastman's avatar Peter Eastman
Browse files

Deleted the old CUDA platform

parent 352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudaKernels.h"
__global__ void kPrepareConstraints_kernel(int numAtoms, float4* oldPos, float4* posq, float4* posqP) {
for (int index = threadIdx.x+blockIdx.x*blockDim.x; index < numAtoms; index += blockDim.x*gridDim.x) {
float4 pos = posq[index];
oldPos[index] = pos;
posqP[index] = make_float4(0.0f, 0.0f, 0.0f, pos.w);
}
}
__global__ void kFinishConstraints_kernel(int numAtoms, float4* posq, float4* posqP) {
for (int index = threadIdx.x+blockIdx.x*blockDim.x; index < numAtoms; index += blockDim.x*gridDim.x) {
float4 pos = posq[index];
float4 delta = posqP[index];
posq[index] = make_float4(pos.x+delta.x, pos.y+delta.y, pos.z+delta.z, pos.w);
}
}
void kApplyConstraints(gpuContext gpu)
{
kPrepareConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pOldPosq, gpu->sim.pPosq, gpu->sim.pPosqP);
LAUNCHERROR("kPrepareConstraints");
kApplyShake(gpu);
kApplySettle(gpu);
kApplyCCMA(gpu);
kFinishConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pPosq, gpu->sim.pPosqP);
LAUNCHERROR("kFinishConstraints");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
static __constant__ cudaGmxSimulation cSim;
void SetBrownianUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetBrownianUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kBrownianUpdatePart1_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
__syncthreads();
while (pos < cSim.atoms)
{
float4 random4a = cSim.pRandom4[rpos + pos];
float4 apos = cSim.pPosq[pos];
float4 force = cSim.pForce4[pos];
float invMass = cSim.pVelm4[pos].w;
float forceScale = cSim.tauDeltaT*invMass;
float noiseScale = cSim.noiseAmplitude*sqrtf(invMass);
cSim.pOldPosq[pos] = apos;
apos.x = force.x*forceScale + noiseScale*random4a.x;
apos.y = force.y*forceScale + noiseScale*random4a.y;
apos.z = force.z*forceScale + noiseScale*random4a.z;
cSim.pPosqP[pos] = apos;
pos += blockDim.x * gridDim.x;
}
}
void kBrownianUpdatePart1(gpuContext gpu)
{
// printf("kBrownianUpdatePart1\n");
kBrownianUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kBrownianUpdatePart1");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kBrownianUpdatePart2_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
__syncthreads();
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 apos = cSim.pPosq[pos];
float4 xPrime = cSim.pPosqP[pos];
velocity.x = cSim.oneOverDeltaT*(xPrime.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z);
xPrime.x += apos.x;
xPrime.y += apos.y;
xPrime.z += apos.z;
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
// Update random position pointer
if (threadIdx.x == 0)
{
rpos += cSim.paddedNumberOfAtoms;
if (rpos > cSim.randoms)
rpos -= cSim.randoms;
cSim.pRandomPosition[blockIdx.x] = rpos;
}
}
extern void kGenerateRandoms(gpuContext gpu);
void kBrownianUpdatePart2(gpuContext gpu)
{
// printf("kBrownianUpdatePart2\n");
kBrownianUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kBrownianUpdatePart2");
// Update randoms if necessary
gpu->iterations++;
if (gpu->iterations == gpu->sim.randomIterations)
{
kGenerateRandoms(gpu);
gpu->iterations = 0;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <cuda.h>
#include <vector_functions.h>
#include <vector>
#include "gputypes.h"
using namespace std;
static __constant__ cudaGmxSimulation cSim;
void SetCCMASim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCCMASim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kComputeCCMAConstraintDirections()
{
// Calculate the direction of each constraint.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
{
int2 atoms = cSim.pCcmaAtoms[index];
float4 dir = cSim.pCcmaDistance[index];
float4 oldPos1 = cSim.pOldPosq[atoms.x];
float4 oldPos2 = cSim.pOldPosq[atoms.y];
dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z;
cSim.pCcmaDistance[index] = dir;
}
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
{
__shared__ int converged;
float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
if (threadIdx.x == 0)
converged = 1;
__syncthreads();
// Calculate the constraint force for each constraint.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
{
int2 atoms = cSim.pCcmaAtoms[index];
float4 delta1 = atomPositions[atoms.x];
float4 delta2 = atomPositions[atoms.y];
float4 dir = cSim.pCcmaDistance[index];
float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
if (addOldPosition)
{
rp_ij.x += dir.x;
rp_ij.y += dir.y;
rp_ij.z += dir.z;
}
float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
float dist2 = dir.w*dir.w;
float diff = dist2 - rp2;
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
float reducedMass = cSim.pCcmaReducedMass[index];
cSim.pCcmaDelta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
// See whether it has converged.
if (converged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
{
converged = 0;
*cSim.ccmaConvergedDeviceMarker = 0;
}
}
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kMultiplyByCCMAConstraintMatrix()
{
if (*cSim.ccmaConvergedDeviceMarker)
return; // The constraint iteration has already converged
// Multiply by the inverse constraint matrix.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
{
float sum = 0.0f;
for (unsigned int i = 0; ; i++)
{
unsigned int element = index+i*cSim.ccmaConstraints;
unsigned int column = cSim.pConstraintMatrixColumn[element];
if (column >= cSim.ccmaConstraints)
break;
sum += cSim.pCcmaDelta1[column]*cSim.pConstraintMatrixValue[element];
}
cSim.pCcmaDelta2[index] = sum;
}
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
{
if (*cSim.ccmaConvergedDeviceMarker)
return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f);
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
{
float4 atomPos = atomPositions[index];
float invMass = cSim.pVelm4[index].w;
int num = cSim.pCcmaNumAtomConstraints[index];
for (int i = 0; i < num; i++)
{
int constraint = cSim.pCcmaAtomConstraints[index+i*cSim.atoms];
bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*cSim.pCcmaDelta2[constraint];
constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = cSim.pCcmaDistance[constraint];
atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z;
}
atomPositions[index] = atomPos;
}
}
void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
{
kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
LAUNCHERROR("kComputeCCMAConstraintDirections");
const int checkInterval = 3;
for (int i = 0; i < 150; i++) {
if ((i+1)%checkInterval == 0)
*gpu->ccmaConvergedHostMarker = 1;
kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(posq, addOldPosition);
cudaEventRecord(gpu->ccmaEvent, 0);
kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>();
kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, 3*i+2);
cudaEventSynchronize(gpu->ccmaEvent);
if ((i+1)%checkInterval == 0 && *gpu->ccmaConvergedHostMarker)
break;
}
}
void kApplyCCMA(gpuContext gpu)
{
// printf("kApplyCCMA\n");
if (gpu->sim.ccmaConstraints > 0)
kApplyCCMA(gpu, gpu->sim.pPosqP, true);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
static __constant__ cudaGmxSimulation cSim;
void SetCalculateAndersenThermostatSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateAndersenThermostatSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kCalculateAndersenThermostat_kernel(int* atomGroups)
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
__syncthreads();
float collisionProbability = 1.0f-exp(-cSim.collisionFrequency*cSim.pStepSize[0].y);
float randomRange = erf(collisionProbability/sqrtf(2.0f));
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 selectRand = cSim.pRandom4[rpos + atomGroups[pos]];
float4 velRand = cSim.pRandom4[rpos + pos];
float scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0.0f : 1.0f);
float add = (1.0f-scale)*sqrtf(cSim.kT*velocity.w);
velocity.x = scale*velocity.x + add*velRand.x;
velocity.y = scale*velocity.y + add*velRand.y;
velocity.z = scale*velocity.z + add*velRand.z;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
// Update random position pointer
if (threadIdx.x == 0)
{
rpos += cSim.paddedNumberOfAtoms;
if (rpos > cSim.randoms)
rpos -= cSim.randoms;
cSim.pRandomPosition[blockIdx.x] = rpos;
}
}
extern void kGenerateRandoms(gpuContext gpu);
void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups)
{
// printf("kCalculateAndersenThermostat\n");
kCalculateAndersenThermostat_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(atomGroups._pDevData);
LAUNCHERROR("kCalculateAndersenThermostat");
// Update randoms if necessary
gpu->iterations++;
if (gpu->iterations == gpu->sim.randomIterations)
{
kGenerateRandoms(gpu);
gpu->iterations = 0;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Rossen P. Apostolov, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evaluating nonbonded forces using the
* Ewald summation method (Reciprocal space summation).
*/
/* Define multiply operations for floats */
__device__ float2 MultofFloat2(float2 a, float2 b)
{
float2 c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
__device__ float2 ConjMultofFloat2(float2 a, float2 b)
{
float2 c;
c.x = a.x*b.x + a.y*b.y;
c.y = a.y*b.x - a.x*b.y;
return c;
}
/**
* Precompute the cosine and sine sums which appear in each force term.
*/
__global__ void kCalculateEwaldFastCosSinSums_kernel()
{
const float epsilon = 1.0;
const float recipCoeff = cSim.epsfac*(4*LOCAL_HACK_PI/cSim.cellVolume/epsilon);
const unsigned int ksizex = 2*cSim.kmaxX-1;
const unsigned int ksizey = 2*cSim.kmaxY-1;
const unsigned int ksizez = 2*cSim.kmaxZ-1;
const unsigned int totalK = ksizex*ksizey*ksizez;
unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
float energy = 0.0f;
while (index < (cSim.kmaxY-1)*ksizez+cSim.kmaxZ)
index += blockDim.x * gridDim.x;
while (index < totalK)
{
// Find the wave vector (kx, ky, kz) this index corresponds to.
int rx = index/(ksizey*ksizez);
int remainder = index - rx*ksizey*ksizez;
int ry = remainder/ksizez;
int rz = remainder - ry*ksizez - cSim.kmaxZ + 1;
ry += -cSim.kmaxY + 1;
float kx = rx*cSim.recipBoxSizeX;
float ky = ry*cSim.recipBoxSizeY;
float kz = rz*cSim.recipBoxSizeZ;
// Compute the sum for this wave vector.
float2 sum = make_float2(0.0f, 0.0f);
for (int atom = 0; atom < cSim.atoms; atom++)
{
float4 apos = cSim.pPosq[atom];
float phase = apos.x*kx;
float2 structureFactor = make_float2(cosf(phase), sinf(phase));
phase = apos.y*ky;
structureFactor = MultofFloat2(structureFactor, make_float2(cosf(phase), sinf(phase)));
phase = apos.z*kz;
structureFactor = MultofFloat2(structureFactor, make_float2(cosf(phase), sinf(phase)));
sum.x += apos.w*structureFactor.x;
sum.y += apos.w*structureFactor.y;
}
cSim.pEwaldCosSinSum[index] = sum;
// Compute the contribution to the energy.
float k2 = kx*kx + ky*ky + kz*kz;
float ak = exp(k2*cSim.factorEwald) / k2;
energy += recipCoeff*ak*(sum.x*sum.x + sum.y*sum.y);
index += blockDim.x * gridDim.x;
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/**
* Compute the reciprocal space part of the Ewald force, using the precomputed sums from the
* previous routine.
*/
__global__ void kCalculateEwaldFastForces_kernel()
{
const float epsilon = 1.0;
float recipCoeff = cSim.epsfac*(4*LOCAL_HACK_PI/cSim.cellVolume/epsilon);
unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
while (atom < cSim.atoms)
{
float4 force = cSim.pForce4[atom];
float4 apos = cSim.pPosq[atom];
// Loop over all wave vectors.
int lowry = 0;
int lowrz = 1;
for (int rx = 0; rx < cSim.kmaxX; rx++) {
float kx = rx * cSim.recipBoxSizeX;
for (int ry = lowry; ry < cSim.kmaxY; ry++) {
float ky = ry * cSim.recipBoxSizeY;
float phase = apos.x*kx;
float2 tab_xy = make_float2(cosf(phase), sinf(phase));
phase = apos.y*ky;
tab_xy = MultofFloat2(tab_xy, make_float2(cosf(phase), sinf(phase)));
for (int rz = lowrz; rz < cSim.kmaxZ; rz++) {
float kz = rz * cSim.recipBoxSizeZ;
// Compute the force contribution of this wave vector.
int index = rx*(cSim.kmaxY*2-1)*(cSim.kmaxZ*2-1) + (ry+cSim.kmaxY-1)*(cSim.kmaxZ*2-1) + (rz+cSim.kmaxZ-1);
float k2 = kx*kx + ky*ky + kz*kz;
float ak = exp(k2*cSim.factorEwald) / k2;
phase = apos.z*kz;
float2 structureFactor = MultofFloat2(tab_xy, make_float2(cosf(phase), sinf(phase)));
float2 cosSinSum = cSim.pEwaldCosSinSum[index];
float dEdR = ak * apos.w * (cosSinSum.x*structureFactor.y - cosSinSum.y*structureFactor.x);
force.x += 2 * recipCoeff * dEdR * kx;
force.y += 2 * recipCoeff * dEdR * ky;
force.z += 2 * recipCoeff * dEdR * kz;
lowrz = 1 - cSim.kmaxZ;
}
lowry = 1 - cSim.kmaxY;
}
}
// Record the force on the atom.
cSim.pForce4[atom] = force;
atom += blockDim.x * gridDim.x;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Rossen P. Apostolov, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evaluating nonbonded forces using the
* Ewald summation method (Reciprocal space summation).
*/
__global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
{
const float eps0 = 1.0f/(4.0f*3.1415926535f*cSim.epsfac);
unsigned int atomID1 = threadIdx.x + blockIdx.x * blockDim.x;
while (atomID1 < cSim.atoms)
{
float4 apos1 = cSim.pPosq[atomID1];
float4 af = cSim.pForce4[atomID1];
unsigned int atomID2 = 0;
while (atomID2 < cSim.atoms)
{
float4 apos2 = cSim.pPosq[atomID2];
float scale = 2.0f*apos1.w*apos2.w/(cSim.cellVolume*eps0);
int lowry = 0;
int lowrz = 1;
for(int rx = 0; rx < cSim.kmaxX; rx++)
{
float kx = rx*cSim.recipBoxSizeX;
for(int ry = lowry; ry < cSim.kmaxY; ry++)
{
float ky = ry*cSim.recipBoxSizeY;
for (int rz = lowrz; rz < cSim.kmaxZ; rz++)
{
float kz = rz*cSim.recipBoxSizeZ;
float k2 = kx*kx + ky*ky + kz*kz;
float ek = exp(k2*cSim.factorEwald);
float arg1 = kx*apos1.x + ky*apos1.y + kz*apos1.z;
float arg2 = kx*apos2.x + ky*apos2.y + kz*apos2.z;
float sinI = sinf(arg1);
float sinJ = sinf(arg2);
float cosI = cosf(arg1);
float cosJ = cosf(arg2);
float f = scale * ek * (-sinI*cosJ + cosI*sinJ) / k2;
af.x -= kx*f;
af.y -= ky*f;
af.z -= kz*f;
lowrz = 1 - cSim.kmaxZ;
}
lowry = 1 - cSim.kmaxY;
}
}
atomID2++;
}
cSim.pForce4[atomID1] = af;
atomID1 += blockDim.x * gridDim.x;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct Atom {
float x;
float y;
float z;
float q;
float sig;
float eps;
float fx;
float fy;
float fz;
};
static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCDLJForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
static __device__ float fastErfc(float r)
{
float normalized = cSim.tabulatedErfcScale*r;
int index = (int) normalized;
float fract2 = normalized-index;
float fract1 = 1.0f-fract2;
return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels for Ewald
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define USE_EWALD
#define METHOD_NAME(a, b) a##Ewald##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##EwaldByWarp##b
#include "kCalculateCDLJForces.h"
// Reciprocal Space Ewald summation is in a separate kernel
#include "kCalculateCDLJEwaldFastReciprocal.h"
void kCalculatePME(gpuContext gpu);
void kCalculateCDLJForces(gpuContext gpu)
{
// printf("kCalculateCDLJCutoffForces\n");
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJN2Forces");
break;
case CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJCutoffForces");
break;
case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
break;
case EWALD:
case PARTICLE_MESH_EWALD:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldForces");
if (gpu->sim.nonbondedMethod == EWALD)
{
kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces");
}
else
kCalculatePME(gpu);
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for evalauating nonbonded forces. It is included
* several times in kCalculateCDLJForces.cu with different #defines to generate
* different versions of the kernels.
*/
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795f
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
float CDLJ_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
volatile float3* tempBuffer = (volatile float3*) &sA[cSim.nonbond_threads_per_block];
#endif
#ifdef USE_EWALD
const float TWO_OVER_SQRT_PI = 2.0f/sqrtf(LOCAL_HACK_PI);
#endif
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
volatile Atom* psA = &sA[tbx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
apos.w *= cSim.epsfac;
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrtf(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI );
/* E */
CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR;
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else
dEdR += apos.w * psA[j].q * invR;
/* E */
CDLJ_energy += apos.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
}
#endif
/* E */
energy += 0.5f*CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
}
else // bExclusion
{
unsigned int xi = x>>GRIDBITS;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrtf(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */
CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR;
bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
if (needCorrection)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
dEdR = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJ_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
}
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else
dEdR += apos.w * psA[j].q * invR;
/* E */
CDLJ_energy += apos.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
if (!needCorrection && (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr))
#else
if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
#endif
#else
if (!(excl & 0x1))
#endif
{
dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
}
/* E */
energy += 0.5f*CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
float4 of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
}
sA[threadIdx.x].fx = 0.0f;
sA[threadIdx.x].fy = 0.0f;
sA[threadIdx.x].fz = 0.0f;
apos.w *= cSim.epsfac;
if (!bExclusionFlag)
{
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrtf(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
#else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else
dEdR += apos.w * psA[tj].q * invR;
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
}
#endif
/* E */
energy += CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrtf(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR;
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else
dEdR += apos.w * psA[j].q * invR;
/* E */
CDLJ_energy += apos.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
}
#endif
/* E */
energy += CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tempBuffer[threadIdx.x].x = dx;
tempBuffer[threadIdx.x].y = dy;
tempBuffer[threadIdx.x].z = dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
}
#endif
}
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrtf(r2);
float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
if (needCorrection)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
dEdR = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJ_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
}
#else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif
#else
dEdR += apos.w * psA[tj].q * invR;
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
if (!needCorrection && (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr))
#else
if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
#endif
#else
if (!(excl & 0x1))
#endif
{
dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
}
/* E */
energy += CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
lasty = y;
}
pos++;
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#include "cudaKernels.h"
struct Atom {
float x;
float y;
float z;
float q;
float sig;
float eps;
float br;
float fx;
float fy;
float fz;
float fb;
};
static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernel for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernels for Ewald
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define USE_EWALD
#define METHOD_NAME(a, b) a##Ewald##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##EwaldByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
extern __global__ void kCalculateEwaldFastCosSinSums_kernel();
extern __global__ void kCalculateEwaldFastForces_kernel();
extern void kCalculatePME(gpuContext gpu);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
// printf("kCalculateCDLJObcGbsaForces1\n");
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bRecalculateBornRadii)
{
if( gpu->bIncludeGBVI ){
kCalculateGBVIBornSum(gpu);
kReduceGBVIBornSum(gpu);
} else {
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
}
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
break;
case CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bRecalculateBornRadii)
{
if( gpu->bIncludeGBVI ){
kCalculateGBVIBornSum(gpu);
kReduceGBVIBornSum(gpu);
} else {
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
}
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
break;
case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bRecalculateBornRadii)
{
if( gpu->bIncludeGBVI ){
kCalculateGBVIBornSum(gpu);
kReduceGBVIBornSum(gpu);
} else {
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
}
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
* It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
{
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
float CDLJObcGbsa_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif
unsigned int lasty = -0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
volatile Atom* psA = &sA[tbx];
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
float q2 = cSim.preFactor * apos.w;
apos.w *= cSim.epsfac;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else
float factorX = apos.w * psA[j].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#ifdef USE_CUTOFF
if ( i >= cSim.atoms || (x+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (x+j) >= cSim.atoms)
#endif
{
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[j].br;
}
} else {
unsigned int xi = x>>GRIDBITS;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else
float factorX = apos.w * psA[j].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#if defined USE_CUTOFF
if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || x+j >= cSim.atoms )
#endif
{
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[j].br;
excl >>= 1;
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
float4 of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
of.w += af.w;
cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = of.w;
} else {
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
}
sA[threadIdx.x].fx = 0.0f;
sA[threadIdx.x].fy = 0.0f;
sA[threadIdx.x].fz = 0.0f;
sA[threadIdx.x].fb = 0.0f;
float q2 = apos.w * cSim.preFactor;
apos.w *= cSim.epsfac;
if (!bExclusionFlag)
{
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else
float factorX = apos.w * psA[tj].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#ifdef USE_CUTOFF
if ( i >= cSim.atoms || (y+tj) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (y+tj) >= cSim.atoms)
#endif
{
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
energy += CDLJObcGbsa_energy;
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
psA[tj].fb += dGpol_dalpha2_ij * br;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else
float factorX = apos.w * psA[j].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#ifdef USE_CUTOFF
if ( i >= cSim.atoms || (y+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (y+j) >= cSim.atoms)
#endif
{
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
energy += CDLJObcGbsa_energy;
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[j].br;
tempBuffer[threadIdx.x] = dx;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
tempBuffer[threadIdx.x] = dy;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
tempBuffer[threadIdx.x] = dz;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
// Sum the Born forces.
tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
}
#endif
} else {
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else
float factorX = apos.w * psA[tj].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#ifdef USE_CUTOFF
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
energy += CDLJObcGbsa_energy;
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
psA[tj].fb += dGpol_dalpha2_ij * br;
excl >>= 1;
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
float4 of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
of.w += af.w;
cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = of.w;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
of.w += sA[threadIdx.x].fb;
cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = of.w;
lasty = y;
}
pos++;
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
dp = DOT3(v1, v2); \
float norm1 = DOT3(v1, v1); \
float norm2 = DOT3(v2, v2); \
dp /= sqrtf(norm1 * norm2); \
dp = min(dp, 1.0f); \
dp = max(dp, -1.0f); \
}
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
float dp; \
GETNORMEDDOTPRODUCT(v1, v2, dp); \
if (dp > 0.99f || dp < -0.99f) { \
float3 cross = CROSS_PRODUCT(v1, v2); \
float scale = DOT3(v1, v1)*DOT3(v2, v2); \
angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
if (dp < 0.0f) \
angle = LOCAL_HACK_PI-angle; \
} \
else { \
angle = acosf(dp); \
} \
}
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
cp0 = CROSS_PRODUCT(vector1, vector2); \
cp1 = CROSS_PRODUCT(vector2, vector3); \
GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
#endif
void kCalculateCMAPTorsionForces_kernel(int numAtoms, int numTorsions, float4* forceBuffers, float* energyBuffer,
float4* posq, float4* coeff, int2* mapPositions, int4* indices, int* maps)
{
const float PI = 3.14159265358979323846f;
float energy = 0.0f;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numTorsions; index += blockDim.x*gridDim.x) {
int4 atoms1 = indices[4*index];
int4 atoms2 = indices[4*index+1];
int4 atoms3 = indices[4*index+2];
int4 atoms4 = indices[4*index+3];
float4 a1 = posq[atoms1.x];
float4 a2 = posq[atoms1.y];
float4 a3 = posq[atoms1.z];
float4 a4 = posq[atoms1.w];
float4 b1 = posq[atoms2.x];
float4 b2 = posq[atoms2.y];
float4 b3 = posq[atoms2.z];
float4 b4 = posq[atoms2.w];
// Compute the first angle.
float3 v0a = make_float3(a1.x-a2.x, a1.y-a2.y, a1.z-a2.z);
float3 v1a = make_float3(a3.x-a2.x, a3.y-a2.y, a3.z-a2.z);
float3 v2a = make_float3(a3.x-a4.x, a3.y-a4.y, a3.z-a4.z);
float3 cp0a, cp1a;
float angleA;
GETDIHEDRALANGLEBETWEENTHREEVECTORS(v0a, v1a, v2a, v0a, cp0a, cp1a, angleA);
angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
// Compute the second angle.
float3 v0b = make_float3(b1.x-b2.x, b1.y-b2.y, b1.z-b2.z);
float3 v1b = make_float3(b3.x-b2.x, b3.y-b2.y, b3.z-b2.z);
float3 v2b = make_float3(b3.x-b4.x, b3.y-b4.y, b3.z-b4.z);
float3 cp0b, cp1b;
float angleB;
GETDIHEDRALANGLEBETWEENTHREEVECTORS(v0b, v1b, v2b, v0b, cp0b, cp1b, angleB);
angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
// Identify which patch this is in.
int2 pos = mapPositions[maps[index]];
int size = pos.y;
float delta = 2.0f*PI/size;
int s = (int) (angleA/delta);
int t = (int) (angleB/delta);
float4 c[4];
int coeffIndex = pos.x+4*(s+size*t);
c[0] = coeff[coeffIndex];
c[1] = coeff[coeffIndex+1];
c[2] = coeff[coeffIndex+2];
c[3] = coeff[coeffIndex+3];
float da = angleA/delta-s;
float db = angleB/delta-t;
// Evaluate the spline to determine the energy and gradients.
float torsionEnergy = 0.0f;
float dEdA = 0.0f;
float dEdB = 0.0f;
torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;
dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;
dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;
torsionEnergy = da*torsionEnergy + ((c[2].w*db + c[2].z)*db + c[2].y)*db + c[2].x;
dEdA = db*dEdA + (3.0f*c[3].z*da + 2.0f*c[2].z)*da + c[1].z;
dEdB = da*dEdB + (3.0f*c[2].w*db + 2.0f*c[2].z)*db + c[2].y;
torsionEnergy = da*torsionEnergy + ((c[1].w*db + c[1].z)*db + c[1].y)*db + c[1].x;
dEdA = db*dEdA + (3.0f*c[3].y*da + 2.0f*c[2].y)*da + c[1].y;
dEdB = da*dEdB + (3.0f*c[1].w*db + 2.0f*c[1].z)*db + c[1].y;
torsionEnergy = da*torsionEnergy + ((c[0].w*db + c[0].z)*db + c[0].y)*db + c[0].x;
dEdA = db*dEdA + (3.0f*c[3].x*da + 2.0f*c[2].x)*da + c[1].x;
dEdB = da*dEdB + (3.0f*c[0].w*db + 2.0f*c[0].z)*db + c[0].y;
dEdA /= delta;
dEdB /= delta;
energy += torsionEnergy;
// Apply the force to the first torsion.
float normCross1 = DOT3(cp0a, cp0a);
float normSqrBC = DOT3(v1a, v1a);
float normBC = sqrtf(normSqrBC);
float normCross2 = DOT3(cp1a, cp1a);
float dp = 1.0f/normSqrBC;
float4 ff = make_float4((-dEdA*normBC)/normCross1, DOT3(v0a, v1a)*dp, DOT3(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
float3 internalF0 = make_float3(ff.x*cp0a.x, ff.x*cp0a.y, ff.x*cp0a.z);
float3 internalF3 = make_float3(ff.w*cp1a.x, ff.w*cp1a.y, ff.w*cp1a.z);
float3 d = make_float3(ff.y*internalF0.x - ff.z*internalF3.x,
ff.y*internalF0.y - ff.z*internalF3.y,
ff.y*internalF0.z - ff.z*internalF3.z);
unsigned int offsetA = atoms1.x+atoms3.x*numAtoms;
unsigned int offsetB = atoms1.y+atoms3.y*numAtoms;
unsigned int offsetC = atoms1.z+atoms3.z*numAtoms;
unsigned int offsetD = atoms1.w+atoms3.w*numAtoms;
float4 forceA = forceBuffers[offsetA];
float4 forceB = forceBuffers[offsetB];
float4 forceC = forceBuffers[offsetC];
float4 forceD = forceBuffers[offsetD];
forceA.x += internalF0.x;
forceA.y += internalF0.y;
forceA.z += internalF0.z;
forceB.x += d.x-internalF0.x;
forceB.y += d.y-internalF0.y;
forceB.z += d.z-internalF0.z;
forceC.x += -d.x-internalF3.x;
forceC.y += -d.y-internalF3.y;
forceC.z += -d.z-internalF3.z;
forceD.x += internalF3.x;
forceD.y += internalF3.y;
forceD.z += internalF3.z;
forceBuffers[offsetA] = forceA;
forceBuffers[offsetB] = forceB;
forceBuffers[offsetC] = forceC;
forceBuffers[offsetD] = forceD;
// Apply the force to the second torsion.
normCross1 = DOT3(cp0b, cp0b);
normSqrBC = DOT3(v1b, v1b);
normBC = sqrtf(normSqrBC);
normCross2 = DOT3(cp1b, cp1b);
dp = 1.0f/normSqrBC;
ff = make_float4((-dEdB*normBC)/normCross1, DOT3(v0b, v1b)*dp, DOT3(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
internalF0 = make_float3(ff.x*cp0b.x, ff.x*cp0b.y, ff.x*cp0b.z);
internalF3 = make_float3(ff.w*cp1b.x, ff.w*cp1b.y, ff.w*cp1b.z);
d = make_float3(ff.y*internalF0.x - ff.z*internalF3.x,
ff.y*internalF0.y - ff.z*internalF3.y,
ff.y*internalF0.z - ff.z*internalF3.z);
offsetA = atoms2.x+atoms4.x*numAtoms;
offsetB = atoms2.y+atoms4.y*numAtoms;
offsetC = atoms2.z+atoms4.z*numAtoms;
offsetD = atoms2.w+atoms4.w*numAtoms;
forceA = forceBuffers[offsetA];
forceB = forceBuffers[offsetB];
forceC = forceBuffers[offsetC];
forceD = forceBuffers[offsetD];
forceA.x += internalF0.x;
forceA.y += internalF0.y;
forceA.z += internalF0.z;
forceB.x += d.x-internalF0.x;
forceB.y += d.y-internalF0.y;
forceB.z += d.z-internalF0.z;
forceC.x += -d.x-internalF3.x;
forceC.y += -d.y-internalF3.y;
forceC.z += -d.z-internalF3.z;
forceD.x += internalF3.x;
forceD.y += internalF3.y;
forceD.z += internalF3.z;
forceBuffers[offsetA] = forceA;
forceBuffers[offsetB] = forceB;
forceBuffers[offsetC] = forceC;
forceBuffers[offsetD] = forceD;
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
void kCalculateCMAPTorsionForces(gpuContext gpu, CUDAStream<float4>& coefficients, CUDAStream<int2>& mapPositions, CUDAStream<int4>& torsionIndices, CUDAStream<int>& torsionMaps)
{
kCalculateCMAPTorsionForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block>>>(gpu->sim.stride,
torsionMaps._length, gpu->sim.pForce4, gpu->sim.pEnergy, gpu->sim.pPosq, coefficients._pDevData,
mapPositions._pDevData, torsionIndices._pDevData, torsionMaps._pDevData);
LAUNCHERROR("kCalculateCMAPTorsionForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
static __constant__ cudaGmxSimulation cSim;
static __constant__ Expression<256> forceExp;
static __constant__ Expression<256> energyExp;
#include "kEvaluateExpression.h"
void SetCalculateCustomAngleForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCustomAngleForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
void SetCustomAngleForceExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
RTERROR(status, "SetCustomAngleForceExpression: cudaMemcpyToSymbol failed");
}
void SetCustomAngleEnergyExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
RTERROR(status, "SetCustomAngleEnergyExpression: cudaMemcpyToSymbol failed");
}
void SetCustomAngleGlobalParams(const vector<float>& paramValues)
{
cudaError_t status;
status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
RTERROR(status, "SetCustomAngleGlobalParams: cudaMemcpyToSymbol failed");
}
#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kCalculateCustomAngleForces_kernel()
{
extern __shared__ float stack[];
float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
float totalEnergy = 0.0f;
while (pos < cSim.customAngles)
{
int4 atom = cSim.pCustomAngleID1[pos];
int2 atom2 = cSim.pCustomAngleID2[pos];
float4 params = cSim.pCustomAngleParams[pos];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float4 a3 = cSim.pPosq[atom.z];
float3 v0 = make_float3(a2.x-a1.x, a2.y-a1.y, a2.z-a1.z);
float3 v1 = make_float3(a2.x-a3.x, a2.y-a3.y, a2.z-a3.z);
float3 cp = CROSS_PRODUCT(v0, v1);
float rp = DOT3(cp, cp);
rp = max(sqrtf(rp), 1.0e-06f);
float r21 = DOT3(v0, v0);
float r23 = DOT3(v1, v1);
float dot = DOT3(v0, v1);
float cosine = max(-1.0f, min(1.0f, dot/sqrtf(r21*r23)));
VARIABLE(0) = acosf(cosine);
VARIABLE(1) = params.x;
VARIABLE(2) = params.y;
VARIABLE(3) = params.z;
VARIABLE(4) = params.w;
float dEdR = kEvaluateExpression_kernel(&forceExp, stack, variables);
totalEnergy += kEvaluateExpression_kernel(&energyExp, stack, variables);
float termA = dEdR/(r21*rp);
float termC = -dEdR/(r23*rp);
float3 c21 = CROSS_PRODUCT(v0, cp);
float3 c23 = CROSS_PRODUCT(v1, cp);
c21.x *= termA;
c21.y *= termA;
c21.z *= termA;
c23.x *= termC;
c23.y *= termC;
c23.z *= termC;
unsigned int offsetA = atom.x + atom.w * cSim.stride;
unsigned int offsetB = atom.y + atom2.x * cSim.stride;
unsigned int offsetC = atom.z + atom2.y * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
float4 forceC = cSim.pForce4[offsetC];
forceA.x += c21.x;
forceA.y += c21.y;
forceA.z += c21.z;
forceB.x -= c21.x+c23.x;
forceB.y -= c21.y+c23.y;
forceB.z -= c21.z+c23.z;
forceC.x += c23.x;
forceC.y += c23.y;
forceC.z += c23.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
cSim.pForce4[offsetC] = forceC;
pos += blockDim.x * gridDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}
void kCalculateCustomAngleForces(gpuContext gpu)
{
// printf("kCalculateCustomAngleForces\n");
int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
kCalculateCustomAngleForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
LAUNCHERROR("kCalculateCustomAngleForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
static __constant__ cudaGmxSimulation cSim;
static __constant__ Expression<256> forceExp;
static __constant__ Expression<256> energyExp;
#include "kEvaluateExpression.h"
void SetCalculateCustomBondForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCustomBondForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
void SetCustomBondForceExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
RTERROR(status, "SetCustomBondForceExpression: cudaMemcpyToSymbol failed");
}
void SetCustomBondEnergyExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
RTERROR(status, "SetCustomBondEnergyExpression: cudaMemcpyToSymbol failed");
}
void SetCustomBondGlobalParams(const vector<float>& paramValues)
{
cudaError_t status;
status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
RTERROR(status, "SetCustomBondGlobalParams: cudaMemcpyToSymbol failed");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kCalculateCustomBondForces_kernel()
{
extern __shared__ float stack[];
float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
float totalEnergy = 0.0f;
while (pos < cSim.customBonds)
{
int4 atom = cSim.pCustomBondID[pos];
float4 params = cSim.pCustomBondParams[pos];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float dx = a1.x - a2.x;
float dy = a1.y - a2.y;
float dz = a1.z - a2.z;
float r = sqrtf(dx*dx + dy*dy + dz*dz);
float invR = 1.0f/r;
VARIABLE(0) = r;
VARIABLE(1) = params.x;
VARIABLE(2) = params.y;
VARIABLE(3) = params.z;
VARIABLE(4) = params.w;
float dEdR = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
float energy = kEvaluateExpression_kernel(&energyExp, stack, variables);
totalEnergy += energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
forceA.x += dx;
forceA.y += dy;
forceA.z += dz;
forceB.x -= dx;
forceB.y -= dy;
forceB.z -= dz;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
pos += blockDim.x * gridDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}
void kCalculateCustomBondForces(gpuContext gpu)
{
// printf("kCalculateCustomBondForces\n");
int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
kCalculateCustomBondForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
LAUNCHERROR("kCalculateCustomBondForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
static __constant__ cudaGmxSimulation cSim;
static __constant__ Expression<256> forceExpX;
static __constant__ Expression<256> forceExpY;
static __constant__ Expression<256> forceExpZ;
static __constant__ Expression<256> energyExp;
#include "kEvaluateExpression.h"
void SetCalculateCustomExternalForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCustomExternalForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
void SetCustomExternalForceExpressions(const Expression<256>& expressionX, const Expression<256>& expressionY, const Expression<256>& expressionZ)
{
cudaError_t status;
status = cudaMemcpyToSymbol(forceExpX, &expressionX, sizeof(forceExpX));
status = cudaMemcpyToSymbol(forceExpY, &expressionY, sizeof(forceExpY));
status = cudaMemcpyToSymbol(forceExpZ, &expressionZ, sizeof(forceExpZ));
RTERROR(status, "SetCustomExternalForceExpression: cudaMemcpyToSymbol failed");
}
void SetCustomExternalEnergyExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
RTERROR(status, "SetCustomExternalEnergyExpression: cudaMemcpyToSymbol failed");
}
void SetCustomExternalGlobalParams(const vector<float>& paramValues)
{
cudaError_t status;
status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
RTERROR(status, "SetCustomExternalGlobalParams: cudaMemcpyToSymbol failed");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kCalculateCustomExternalForces_kernel()
{
extern __shared__ float stack[];
float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
float totalEnergy = 0.0f;
while (index < cSim.customExternals)
{
int atom = cSim.pCustomExternalID[index];
float4 params = cSim.pCustomExternalParams[index];
float4 pos = cSim.pPosq[atom];
VARIABLE(0) = pos.x;
VARIABLE(1) = pos.y;
VARIABLE(2) = pos.z;
VARIABLE(3) = params.x;
VARIABLE(4) = params.y;
VARIABLE(5) = params.z;
VARIABLE(6) = params.w;
totalEnergy += kEvaluateExpression_kernel(&energyExp, stack, variables);;
float4 force = cSim.pForce4[atom];
force.x -= kEvaluateExpression_kernel(&forceExpX, stack, variables);
force.y -= kEvaluateExpression_kernel(&forceExpY, stack, variables);
force.z -= kEvaluateExpression_kernel(&forceExpZ, stack, variables);
cSim.pForce4[atom] = force;
index += blockDim.x * gridDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}
void kCalculateCustomExternalForces(gpuContext gpu)
{
// printf("kCalculateCustomExternalForces\n");
int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
kCalculateCustomExternalForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
LAUNCHERROR("kCalculateCustomExternalForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct Atom {
float x;
float y;
float z;
float4 params;
float fx;
float fy;
float fz;
};
static __constant__ cudaGmxSimulation cSim;
static __constant__ Expression<256> forceExp;
static __constant__ Expression<256> energyExp;
#include "kEvaluateExpression.h"
void SetCalculateCustomNonbondedForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCustomNonbondedForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
void SetCustomNonbondedForceExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
RTERROR(status, "SetCustomNonbondedForceExpression: cudaMemcpyToSymbol failed");
}
void SetCustomNonbondedEnergyExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
RTERROR(status, "SetCustomNonbondedEnergyExpression: cudaMemcpyToSymbol failed");
}
void SetCustomNonbondedGlobalParams(const vector<float>& paramValues)
{
cudaError_t status;
status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
RTERROR(status, "SetCustomNonbondedGlobalParams: cudaMemcpyToSymbol failed");
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCustomNonbondedForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCustomNonbondedForces.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCustomNonbondedForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCustomNonbondedForces.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCustomNonbondedForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCustomNonbondedForces.h"
__global__ void kFindBlockBoundsCutoff_kernel();
__global__ void kFindBlocksWithInteractionsCutoff_kernel();
__global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int* workUnit);
__global__ void kFindBlockBoundsPeriodic_kernel();
__global__ void kFindBlocksWithInteractionsPeriodic_kernel();
__global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int* workUnit);
void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid)
{
// printf("kCalculateCustomNonbondedCutoffForces\n");
if (gpu->tabulatedFunctionsChanged)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
if (gpu->tabulatedFunctions[0].coefficients != NULL)
cudaBindTexture(NULL, &texRef0, gpu->tabulatedFunctions[0].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[0].coefficients->_length*sizeof(float4));
if (gpu->tabulatedFunctions[1].coefficients != NULL)
cudaBindTexture(NULL, &texRef1, gpu->tabulatedFunctions[1].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[1].coefficients->_length*sizeof(float4));
if (gpu->tabulatedFunctions[2].coefficients != NULL)
cudaBindTexture(NULL, &texRef2, gpu->tabulatedFunctions[2].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[2].coefficients->_length*sizeof(float4));
if (gpu->tabulatedFunctions[3].coefficients != NULL)
cudaBindTexture(NULL, &texRef3, gpu->tabulatedFunctions[3].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[3].coefficients->_length*sizeof(float4));
gpu->tabulatedFunctionsChanged = false;
}
int sharedPerThread = sizeof(Atom)+gpu->sim.customExpressionStackSize*sizeof(float)+9*sizeof(float);
if (gpu->sim.customNonbondedMethod != NO_CUTOFF)
sharedPerThread += sizeof(float3);
int threads = gpu->sim.nonbond_threads_per_block;
int maxThreads = (gpu->sharedMemoryPerBlock-16)/sharedPerThread;
if (threads > maxThreads)
threads = (maxThreads/32)*32;
switch (gpu->sim.customNonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateCustomNonbondedN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pWorkUnit);
else
kCalculateCustomNonbondedN2Forces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pWorkUnit);
LAUNCHERROR("kCalculateCustomNonbondedN2Forces");
break;
case CUTOFF:
if (!neighborListValid)
{
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
}
if (gpu->bOutputBufferPerWarp)
kCalculateCustomNonbondedCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCustomNonbondedCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCustomNonbondedCutoffForces");
break;
case PERIODIC:
if (!neighborListValid)
{
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
}
if (gpu->bOutputBufferPerWarp)
kCalculateCustomNonbondedPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCustomNonbondedPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCustomNonbondedPeriodicForces");
break;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for evalauating custom nonbonded forces. It is included
* several times in kCalculateCustomNonbondedForces.cu with different #defines to generate
* different versions of the kernels.
*/
__global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned int* workUnit)
{
extern __shared__ float stack[];
volatile Atom* sA = (volatile Atom*) &stack[cSim.customExpressionStackSize*blockDim.x];
float* variables = (float*) &sA[blockDim.x];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
float totalEnergy = 0.0f;
#ifdef USE_CUTOFF
volatile float3* tempBuffer = (volatile float3*) &variables[9*blockDim.x];
#endif
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
volatile Atom* psA = &sA[tbx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float4 params = cSim.pCustomParams[i];
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].params.x = params.x;
sA[threadIdx.x].params.y = params.y;
sA[threadIdx.x].params.z = params.z;
sA[threadIdx.x].params.w = params.w;
unsigned int xi = x>>GRIDBITS;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
// Record the parameters.
VARIABLE(0) = params.x;
VARIABLE(1) = params.y;
VARIABLE(2) = params.z;
VARIABLE(3) = params.w;
VARIABLE(4) = psA[j].params.x;
VARIABLE(5) = psA[j].params.y;
VARIABLE(6) = psA[j].params.z;
VARIABLE(7) = psA[j].params.w;
// Compute the force.
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r = sqrtf(dx*dx + dy*dy + dz*dz);
float invR = 1.0f/r;
VARIABLE(8) = r;
float dEdR = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
float energy = kEvaluateExpression_kernel(&energyExp, stack, variables);
#ifdef USE_CUTOFF
if (!(excl & 0x1) || r > cSim.nonbondedCutoff)
#else
if (!(excl & 0x1))
#endif
{
dEdR = 0.0f;
energy = 0.0f;
}
totalEnergy += 0.5f*energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
float4 of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].params.x = cSim.pCustomParams[j].x;
sA[threadIdx.x].params.y = cSim.pCustomParams[j].y;
sA[threadIdx.x].params.z = cSim.pCustomParams[j].z;
sA[threadIdx.x].params.w = cSim.pCustomParams[j].w;
}
sA[threadIdx.x].fx = 0.0f;
sA[threadIdx.x].fy = 0.0f;
sA[threadIdx.x].fz = 0.0f;
if (!bExclusionFlag)
{
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
// Record the parameters.
VARIABLE(0) = params.x;
VARIABLE(1) = params.y;
VARIABLE(2) = params.z;
VARIABLE(3) = params.w;
VARIABLE(4) = psA[tj].params.x;
VARIABLE(5) = psA[tj].params.y;
VARIABLE(6) = psA[tj].params.z;
VARIABLE(7) = psA[tj].params.w;
// Compute the force.
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r = sqrtf(dx*dx + dy*dy + dz*dz);
float invR = 1.0f/r;
VARIABLE(8) = r;
float dEdR = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
float energy = kEvaluateExpression_kernel(&energyExp, stack, variables);
#ifdef USE_CUTOFF
if (r > cSim.nonbondedCutoff)
{
dEdR = 0.0f;
energy = 0.0f;
}
#endif
totalEnergy += energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
// Record the parameters.
VARIABLE(0) = params.x;
VARIABLE(1) = params.y;
VARIABLE(2) = params.z;
VARIABLE(3) = params.w;
VARIABLE(4) = psA[j].params.x;
VARIABLE(5) = psA[j].params.y;
VARIABLE(6) = psA[j].params.z;
VARIABLE(7) = psA[j].params.w;
// Compute the force.
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r = sqrtf(dx*dx + dy*dy + dz*dz);
float invR = 1.0f/r;
VARIABLE(8) = r;
float dEdR = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
float energy = kEvaluateExpression_kernel(&energyExp, stack, variables);
#ifdef USE_CUTOFF
if (r > cSim.nonbondedCutoff)
{
dEdR = 0.0f;
energy = 0.0f;
}
#endif
totalEnergy += energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tempBuffer[threadIdx.x].x = dx;
tempBuffer[threadIdx.x].y = dy;
tempBuffer[threadIdx.x].z = dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
}
#endif
}
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
{
// Record the parameters.
VARIABLE(0) = params.x;
VARIABLE(1) = params.y;
VARIABLE(2) = params.z;
VARIABLE(3) = params.w;
VARIABLE(4) = psA[tj].params.x;
VARIABLE(5) = psA[tj].params.y;
VARIABLE(6) = psA[tj].params.z;
VARIABLE(7) = psA[tj].params.w;
// Compute the force.
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r = sqrtf(dx*dx + dy*dy + dz*dz);
float invR = 1.0f/r;
VARIABLE(8) = r;
float dEdR = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
float energy = kEvaluateExpression_kernel(&energyExp, stack, variables);
#ifdef USE_CUTOFF
if (!(excl & 0x1) || r > cSim.nonbondedCutoff)
#else
if (!(excl & 0x1))
#endif
{
dEdR = 0.0f;
energy = 0.0f;
}
totalEnergy += energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
lasty = y;
}
pos++;
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += totalEnergy;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
static __constant__ cudaGmxSimulation cSim;
static __constant__ Expression<256> forceExp;
static __constant__ Expression<256> energyExp;
#include "kEvaluateExpression.h"
void SetCalculateCustomTorsionForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateCustomTorsionForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
void SetCustomTorsionForceExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
RTERROR(status, "SetCustomTorsionForceExpression: cudaMemcpyToSymbol failed");
}
void SetCustomTorsionEnergyExpression(const Expression<256>& expression)
{
cudaError_t status;
status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
RTERROR(status, "SetCustomTorsionEnergyExpression: cudaMemcpyToSymbol failed");
}
void SetCustomTorsionGlobalParams(const vector<float>& paramValues)
{
cudaError_t status;
status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
RTERROR(status, "SetCustomTorsionGlobalParams: cudaMemcpyToSymbol failed");
}
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
dp = DOT3(v1, v2); \
float norm1 = DOT3(v1, v1); \
float norm2 = DOT3(v2, v2); \
dp /= sqrtf(norm1 * norm2); \
dp = min(dp, 1.0f); \
dp = max(dp, -1.0f); \
}
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
float dp; \
GETNORMEDDOTPRODUCT(v1, v2, dp); \
if (dp > 0.99f || dp < -0.99f) { \
float3 cross = CROSS_PRODUCT(v1, v2); \
float scale = DOT3(v1, v1)*DOT3(v2, v2); \
angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
if (dp < 0.0f) \
angle = LOCAL_HACK_PI-angle; \
} \
else { \
angle = acosf(dp); \
} \
}
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
cp0 = CROSS_PRODUCT(vector1, vector2); \
cp1 = CROSS_PRODUCT(vector2, vector3); \
GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
#endif
void kCalculateCustomTorsionForces_kernel()
{
extern __shared__ float stack[];
float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
float totalEnergy = 0.0f;
while (pos < cSim.customTorsions)
{
int4 atom = cSim.pCustomTorsionID1[pos];
int4 atom2 = cSim.pCustomTorsionID2[pos];
float4 params = cSim.pCustomTorsionParams[pos];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float4 a3 = cSim.pPosq[atom.z];
float4 a4 = cSim.pPosq[atom.w];
float3 v0 = make_float3(a1.x-a2.x, a1.y-a2.y, a1.z-a2.z);
float3 v1 = make_float3(a3.x-a2.x, a3.y-a2.y, a3.z-a2.z);
float3 v2 = make_float3(a3.x-a4.x, a3.y-a4.y, a3.z-a4.z);
float3 cp0, cp1;
float dihedralAngle;
GETDIHEDRALANGLEBETWEENTHREEVECTORS(v0, v1, v2, v0, cp0, cp1, dihedralAngle);
VARIABLE(0) = dihedralAngle;
VARIABLE(1) = params.x;
VARIABLE(2) = params.y;
VARIABLE(3) = params.z;
VARIABLE(4) = params.w;
float dEdAngle = kEvaluateExpression_kernel(&forceExp, stack, variables);
totalEnergy += kEvaluateExpression_kernel(&energyExp, stack, variables);
float normBC = sqrtf(DOT3(v1, v1));
float dp = 1.0f / DOT3(v1, v1);
float4 ff = make_float4((-dEdAngle*normBC)/DOT3(cp0, cp0), DOT3(v0, v1)*dp, DOT3(v2, v1)*dp, (dEdAngle*normBC)/DOT3(cp1, cp1));
float3 internalF0 = make_float3(ff.x*cp0.x, ff.x*cp0.y, ff.x*cp0.z);
float3 internalF3 = make_float3(ff.w*cp1.x, ff.w*cp1.y, ff.w*cp1.z);
float3 s = make_float3(ff.y*internalF0.x - ff.z*internalF3.x,
ff.y*internalF0.y - ff.z*internalF3.y,
ff.y*internalF0.z - ff.z*internalF3.z);
unsigned int offsetA = atom.x + atom2.x * cSim.stride;
unsigned int offsetB = atom.y + atom2.y * cSim.stride;
unsigned int offsetC = atom.z + atom2.z * cSim.stride;
unsigned int offsetD = atom.w + atom2.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
float4 forceC = cSim.pForce4[offsetC];
float4 forceD = cSim.pForce4[offsetD];
forceA.x += internalF0.x;
forceA.y += internalF0.y;
forceA.z += internalF0.z;
forceB.x += -internalF0.x + s.x;
forceB.y += -internalF0.y + s.y;
forceB.z += -internalF0.z + s.z;
forceC.x += -internalF3.x - s.x;
forceC.y += -internalF3.y - s.y;
forceC.z += -internalF3.z - s.z;
forceD.x += internalF3.x;
forceD.y += internalF3.y;
forceD.z += internalF3.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
cSim.pForce4[offsetC] = forceC;
cSim.pForce4[offsetD] = forceD;
pos += blockDim.x * gridDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}
void kCalculateCustomTorsionForces(gpuContext gpu)
{
// printf("kCalculateCustomTorsionForces\n");
int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
kCalculateCustomTorsionForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
LAUNCHERROR("kCalculateCustomTorsionForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Mark Friedrichs *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#ifndef __GpuGBVIAUX_H__
#define __GpuGBVIAUX_H__
/**
* This file contains subroutines used in evaluating quantities associated w/ the GB/VI function
*/
static __device__ float getGBVI_L( float r, float x, float S )
{
float rInv = 1.0f/r;
float xInv = 1.0f/x;
float xInv2 = xInv*xInv;
float diff2 = (r + S)*(r - S);
return (1.5f*xInv2)*( (0.25f*rInv) - (xInv/3.0f) + (0.125f*diff2*xInv2*rInv) );
}
static __device__ float getGBVI_Volume( float r, float R, float S )
{
float addOn = 0.0f;
int mask = 1;
float lowerBound = (r - S);
float diff = (S - R);
if( fabsf( diff ) < r ){
lowerBound = R > lowerBound ? R : lowerBound;
} else if( r <= diff ){
addOn = (1.0f/(R*R*R));
} else {
mask = 0;
}
float s2 = getGBVI_L( r, lowerBound, S );
float s1 = getGBVI_L( r, (r + S), S );
s1 = mask ? (s1 - s2 + addOn) : 0.0f;
return s1;
}
static __device__ float getGBVI_dL_dr( float r, float x, float S )
{
float rInv = 1.0f/r;
float rInv2 = rInv*rInv;
float xInv = 1.0f/x;
float xInv2 = xInv*xInv;
float xInv3 = xInv2*xInv;
float diff2 = (r + S)*(r - S);
return ( (-1.5f*xInv2*rInv2)*( 0.25f + 0.125f*diff2*xInv2 ) + 0.375f*xInv3*xInv );
}
static __device__ float getGBVI_dL_drNew( float r, float x, float S )
{
float rInv = 1.0f/r;
float rInv2 = rInv*rInv;
float xInv = 1.0f/x;
float xInv2 = xInv*xInv;
float t1 = (S*rInv);
t1 = 1.0f + t1*t1;
return (-0.375f*xInv2)*( rInv2 - 0.5f*xInv2*t1 );
}
static __device__ float getGBVI_dL_dx( float r, float x, float S )
{
float rInv = 1.0f/r;
float xInv = 1.0f/x;
float xInv2 = xInv*xInv;
float xInv3 = xInv2*xInv;
float diff = (r + S)*(r - S);
return ( (-1.5f*xInv3)*( (0.5f*rInv) - xInv + (0.5f*diff*xInv2*rInv) ));
}
static __device__ float getGBVI_dE2Old( float r, float R, float S, float bornForce )
{
float diff = S - R;
float absDiff = fabsf( S - R );
float dE = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );
float mask;
float lowerBound;
if( (R > (r - S)) && (absDiff < r) ){
mask = 0.0f;
lowerBound = R;
} else {
mask = 1.0f;
lowerBound = (r - S);
}
float dE2 = getGBVI_dL_dr( r, lowerBound, S ) + mask*getGBVI_dL_dx( r, lowerBound, S );
dE -= (absDiff >= r) && r >= diff ? 0.0f : dE2;
dE = r < -diff ? 0.0f : dE;
dE *= ( (r > 1.0e-08f) ? (bornForce/r) : 0.0f);
return (-dE);
}
static __device__ float getGBVI_dE2( float r, float R, float S, float bornForce )
{
float diff = S - R;
float dE = 0.0f;
if( fabsf( diff ) < r ){
dE = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );
float lowerBound;
float mask;
if( R > (r - S) ){
lowerBound = R;
mask = 0.0f;
} else {
lowerBound = r - S;
mask = 1.0f;
}
dE -= getGBVI_dL_dr( r, lowerBound, S ) + mask*getGBVI_dL_dx( r, lowerBound, S );
} else if( r < (S - R) ){
dE = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );
dE -= ( getGBVI_dL_dr( r, r-S, S ) + getGBVI_dL_dx( r, r-S, S ) );
}
dE *= ( (r > 1.0e-08f) ? (bornForce/r) : 0.0f);
return (-dE);
}
static __device__ float getGBVIBornForce2( float bornRadius, float R, float bornForce, float gamma )
{
float ratio = (R/bornRadius);
float returnBornForce = bornForce + (3.0f*gamma*ratio*ratio*ratio)/bornRadius; // 'cavity' term
float br2 = bornRadius*bornRadius;
returnBornForce *= (1.0f/3.0f)*br2*br2;
return returnBornForce;
}
#endif // __GpuGBVIAUX_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float sum;
float gamma;
};
static __constant__ cudaGmxSimulation cSim;
void SetCalculateGBVIBornSumSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateGBVIBornSumSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateGBVIBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateGBVIBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateGBVIBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateGBVIBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateGBVIBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateGBVIBornSum.h"
/**---------------------------------------------------------------------------------------
Compute quintic spline value and associated derviative
@param x value to compute spline at
@param rl lower cutoff value
@param ru upper cutoff value
@param outValue value of spline at x
@param outDerivative value of derivative of spline at x
--------------------------------------------------------------------------------------- */
static __device__ void quinticSpline_kernel( float x, float rl, float ru,
float* outValue, float* outDerivative ){
// ---------------------------------------------------------------------------------------
const float one = 1.0f;
const float minusSix = -6.0f;
const float minusTen = -10.0f;
const float minusThirty = -30.0f;
const float fifteen = 15.0f;
const float sixty = 60.0f;
// ---------------------------------------------------------------------------------------
float numerator = x - rl;
float denominator = ru - rl;
float ratio = numerator/denominator;
float ratio2 = ratio*ratio;
float ratio3 = ratio2*ratio;
*outValue = one + ratio3*(minusTen + fifteen*ratio + minusSix*ratio2);
*outDerivative = ratio2*(minusThirty + sixty*ratio + minusThirty*ratio2)/denominator;
}
/**---------------------------------------------------------------------------------------
Compute Born radii based on Eq. 3 of Labute paper [JCC 29 p. 1693-1698 2008])
and quintic splice switching function
@param atomicRadius3 atomic radius cubed
@param bornSum Born sum (volume integral)
@param bornRadius output Born radius
@param switchDeriviative output switching function deriviative
--------------------------------------------------------------------------------------- */
__device__ void computeBornRadiiUsingQuinticSpline( float atomicRadius3, float bornSum,
float* bornRadius, float* switchDeriviative ){
// ---------------------------------------------------------------------------------------
const float zero = 0.0f;
const float one = 1.0f;
const float minusOneThird = (-1.0f/3.0f);
// ---------------------------------------------------------------------------------------
// R = [ S(V)*(A - V) ]**(-1/3)
// S(V) = 1 V < L
// S(V) = qSpline + U/(A-V) L < V < A
// S(V) = U/(A-V) U < V
// dR/dr = (-1/3)*[ S(V)*(A - V) ]**(-4/3)*[ d{ S(V)*(A-V) }/dr
// d{ S(V)*(A-V) }/dr = (dV/dr)*[ (A-V)*dS/dV - S(V) ]
// (A - V)*dS/dV - S(V) = 0 - 1 V < L
// (A - V)*dS/dV - S(V) = (A-V)*d(qSpline) + (A-V)*U/(A-V)**2 - qSpline - U/(A-V)
// = (A-V)*d(qSpline) - qSpline L < V < A**(-3)
// (A - V)*dS/dV - S(V) = (A-V)*U*/(A-V)**2 - U/(A-V) = 0 U < V
float splineL = cSim.gbviQuinticLowerLimitFactor*atomicRadius3;
float sum;
if( bornSum > splineL ){
if( bornSum < atomicRadius3 ){
float splineValue, splineDerivative;
quinticSpline_kernel( bornSum, splineL, atomicRadius3, &splineValue, &splineDerivative );
sum = (atomicRadius3 - bornSum)*splineValue + cSim.gbviQuinticUpperBornRadiusLimit;
*switchDeriviative = splineValue - (atomicRadius3 - bornSum)*splineDerivative;
} else {
sum = cSim.gbviQuinticUpperBornRadiusLimit;
*switchDeriviative = zero;
}
} else {
sum = atomicRadius3 - bornSum;
*switchDeriviative = one;
}
*bornRadius = powf( sum, minusOneThird );
}
__global__ void kReduceGBVIBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
while (pos < cSim.atoms)
{
float sum = 0.0f;
float* pSt = cSim.pBornSum + pos;
float4 atom = cSim.pGBVIData[pos];
// Get summed Born data
for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
{
sum += *pSt;
pSt += cSim.stride;
}
// Now calculate Born radius
float Rinv = 1.0f/atom.x;
Rinv = Rinv*Rinv*Rinv;
if( cSim.gbviBornRadiusScalingMethod == 0 ){
sum = Rinv - sum;
cSim.pBornRadii[pos] = powf( sum, (-1.0f/3.0f) );
cSim.pGBVISwitchDerivative[pos] = 1.0f;
} else {
float bornRadius;
float switchDeriviative;
computeBornRadiiUsingQuinticSpline( Rinv, sum, &bornRadius, &switchDeriviative );
cSim.pBornRadii[pos] = bornRadius;
cSim.pGBVISwitchDerivative[pos] = switchDeriviative;
}
pos += gridDim.x * blockDim.x;
}
}
void kReduceGBVIBornSum(gpuContext gpu)
{
//printf("kReduceGBVIBornSum\n");
kReduceGBVIBornSum_kernel<<<gpu->sim.blocks, 384>>>();
gpu->bRecalculateBornRadii = false;
LAUNCHERROR("kReduceGBVIBornSum");
}
static int isNanOrInfinity( float number ){
return (number != number || number == std::numeric_limits<float>::infinity() || number == -std::numeric_limits<float>::infinity()) ? 1 : 0;
}
void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log)
{
gpu->psGBVIData->Download();
gpu->psBornRadii->Download();
gpu->psGBVISwitchDerivative->Download();
gpu->psBornForce->Download();
gpu->psPosq4->Download();
gpu->psSigEps2->Download();
int printOnlyOnNan = 1;
int foundNan = 0;
if( printOnlyOnNan ){
for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms && foundNan == 0; ii++ ){
foundNan += isNanOrInfinity( gpu->psBornRadii->_pSysData[ii] );
foundNan += isNanOrInfinity( gpu->psBornForce->_pSysData[ii] );
foundNan += isNanOrInfinity( gpu->psGBVISwitchDerivative->_pSysData[ii] );
}
if( foundNan ){
log = stderr;
(void) fprintf( log, "kPrintGBVI found nan \n", gpu->sim.paddedNumberOfAtoms );
for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e\n", ii,
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z );
}
}
}
if( !printOnlyOnNan || foundNan ){
(void) fprintf( log, "kPrintGBVI Cuda comp bR bF prm sigeps2\n" );
(void) fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
(void) fprintf( stderr, "bR bF swd r scR ...\n" );
for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e\n", ii,
gpu->psBornRadii->_pSysData[ii],
gpu->psBornForce->_pSysData[ii],
gpu->psGBVISwitchDerivative->_pSysData[ii],
gpu->psGBVIData->_pSysData[ii].x,
gpu->psGBVIData->_pSysData[ii].y,
gpu->psGBVIData->_pSysData[ii].z,
gpu->psGBVIData->_pSysData[ii].w,
gpu->psSigEps2->_pSysData[ii].x,
gpu->psSigEps2->_pSysData[ii].y );
}
if( foundNan ){
exit(0);
}
}
}
void kCalculateGBVIBornSum(gpuContext gpu)
{
//printf("kCalculateGBVIBornSum\n");
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp){
kCalculateGBVIN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
} else {
kCalculateGBVIN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
}
break;
case CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVICutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateGBVICutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
break;
case PERIODIC:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVIPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVIPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
break;
}
LAUNCHERROR("kCalculateGBVIBornSum");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for calculating Born sums. It is included
* several times in kCalculateGBVIBornSum.cu with different #defines to generate
* different versions of the kernels.
*/
#include "kCalculateGBVIAux.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif
while ( pos < end )
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float dx;
float dy;
float dz;
float r2;
float r;
// forces tgx into interval [0,31]
// forces tbx 0
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
volatile Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float4 ar = cSim.pGBVIData[i]; // Local atom vr, sr
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#if defined USE_CUTOFF
if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr && j != tgx)
#else
if (i < cSim.atoms && x+j < cSim.atoms && j != tgx )
#endif
{
r = sqrtf(r2);
apos.w += getGBVI_Volume( r, ar.x, psA[j].sr );
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
#endif
} else {
// Read fixed atom data into registers and GRF
unsigned int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float4 temp1 = cSim.pGBVIData[j];
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float4 ar = cSim.pGBVIData[i]; // Local atom vr, sr
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sum = 0.0f;
apos.w = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+tj < cSim.atoms )
#endif
{
r = sqrtf(r2);
apos.w += getGBVI_Volume( r, ar.x, psA[tj].sr );
psA[tj].sum += getGBVI_Volume( r, psA[tj].r, ar.y );
}
tj = (tj - 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
tempBuffer[threadIdx.x] = 0.0f;
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+j < cSim.atoms)
#endif
{
r = sqrtf(r2);
apos.w += getGBVI_Volume( r, ar.x, psA[j].sr );
tempBuffer[threadIdx.x] = getGBVI_Volume( r, psA[j].r, ar.y );
}
// Sum the terms.
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
}
#endif
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
offset = y + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += sA[threadIdx.x].sum;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
#endif
}
pos++;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment