Commit 2e451b9d authored by Peter Eastman's avatar Peter Eastman
Browse files

Deleted the old CUDA platform

parent 352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
#include "gputypes.h"
#include "cudaKernels.h"
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float fx;
float fy;
float fz;
float fb;
};
static __constant__ cudaGmxSimulation cSim;
void SetCalculateGBVIForces2Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateGBVIForces2Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
#include "kCalculateGBVIAux.h"
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateGBVIForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateGBVIForces2.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateGBVIForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateGBVIForces2.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateGBVIForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateGBVIForces2.h"
void kCalculateGBVIForces2(gpuContext gpu)
{
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVIN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit );
else
kCalculateGBVIN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit );
break;
case CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVICutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVICutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break;
case PERIODIC:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVIPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVIPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
break;
}
LAUNCHERROR("kCalculateGBVIForces2");
//kPrintGBVI( gpu, "kCalculateGBVIForces2", 0, stderr);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "kCalculateGBVIAux.h"
/**
* This file contains the kernel for evalauating the second stage of GBSA. It is included
* several times in kCalculateGBVIForces2.cu with different #defines to generate
* different versions of the kernels.
*/
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit )
{
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
volatile float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
#endif
unsigned int lasty = -0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float4 ar = cSim.pGBVIData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
volatile Atom* psA = &sA[tbx];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
// Atom I Born forces and sum
float dE = getGBVI_dE2( r, ar.x, psA[j].sr, fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || x+j >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
} else {
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float4 temp1 = cSim.pGBVIData[j];
float fb = cSim.pBornForce[j];
sA[threadIdx.x].fb = fb;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
}
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
//else if (flags)
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
float dE = getGBVI_dE2( r, ar.x, psA[tj].sr, fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// Atom J Born sum term
dE = getGBVI_dE2( r, psA[tj].r, ar.y, psA[tj].fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
// Interleaved Atom I and J Born Forces and sum components
float dE = getGBVI_dE2( r, ar.x, psA[j].sr, fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+j >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
tempBuffer[threadIdx.x].x = d;
d = dy * dE;
af.y -= d;
tempBuffer[threadIdx.x].y = d;
d = dz * dE;
af.z -= d;
tempBuffer[threadIdx.x].z = d;
// Atom J Born sum term
dE = getGBVI_dE2( r, psA[j].r, ar.y, psA[j].fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+j >= cSim.atoms )
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
tempBuffer[threadIdx.x].x += dx;
tempBuffer[threadIdx.x].y += dy;
tempBuffer[threadIdx.x].z += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
}
#endif
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
}
lasty = y;
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
extern __shared__ Vectors sV[];
static __constant__ cudaGmxSimulation cSim;
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
dp = DOT3(v1, v2); \
float norm1 = DOT3(v1, v1); \
float norm2 = DOT3(v2, v2); \
dp /= sqrtf(norm1 * norm2); \
dp = min(dp, 1.0f); \
dp = max(dp, -1.0f); \
}
#define CROSS_PRODUCT(v1, v2, c) \
c.x = v1.y * v2.z - v1.z * v2.y; \
c.y = v1.z * v2.x - v1.x * v2.z; \
c.z = v1.x * v2.y - v1.y * v2.x;
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
float angle = acosf(cosine); \
float deltaIdeal = angle - (param.x * (LOCAL_HACK_PI / 180.0f)); \
dEdR = param.y * deltaIdeal; \
}
#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
float angle = acosf(cosine); \
float deltaIdeal = angle - (param.x * (LOCAL_HACK_PI / 180.0f)); \
dEdR = param.y * deltaIdeal * deltaIdeal; \
}
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
float dp; \
GETNORMEDDOTPRODUCT(v1, v2, dp); \
if (dp > 0.99f || dp < -0.99f) { \
float4 cross; \
CROSS_PRODUCT(v1, v2, cross); \
float scale = DOT3(v1, v1)*DOT3(v2, v2); \
angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
if (dp < 0.0f) \
angle = LOCAL_HACK_PI-angle; \
} \
else { \
angle = acosf(dp); \
} \
}
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
CROSS_PRODUCT(vector1, vector2, cp0); \
CROSS_PRODUCT(vector2, vector3, cp1); \
GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
}
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
{ \
CROSS_PRODUCT(vector1, vector2, cp0); \
CROSS_PRODUCT(vector2, vector3, cp1); \
GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
cosine = cosf(angle); \
}
void SetCalculateLocalForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateLocalForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
#endif
void kCalculateLocalForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
Vectors* A = &sV[threadIdx.x];
float energy = 0.0f;
while (pos < cSim.bond_offset)
{
if (pos < cSim.bonds)
{
int4 atom = cSim.pBondID[pos];
float4 atomA = cSim.pPosq[atom.x];
float4 atomB = cSim.pPosq[atom.y];
float2 bond = cSim.pBondParameter[pos];
float dx = atomB.x - atomA.x;
float dy = atomB.y - atomA.y;
float dz = atomB.z - atomA.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
float deltaIdeal = r - bond.x;
/* E */ energy += 0.5f * bond.y * deltaIdeal * deltaIdeal;
float dEdR = bond.y * deltaIdeal;
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
forceA.x += dx;
forceA.y += dy;
forceA.z += dz;
forceB.x -= dx;
forceB.y -= dy;
forceB.z -= dz;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
while (pos < cSim.bond_angle_offset)
{
unsigned int pos1 = pos - cSim.bond_offset;
if (pos1 < cSim.bond_angles)
{
int4 atom1 = cSim.pBondAngleID1[pos1];
float2 bond_angle = cSim.pBondAngleParameter[pos1];
float4 a1 = cSim.pPosq[atom1.x];
float4 a2 = cSim.pPosq[atom1.y];
float4 a3 = cSim.pPosq[atom1.z];
A->v0.x = a2.x - a1.x;
A->v0.y = a2.y - a1.y;
A->v0.z = a2.z - a1.z;
A->v1.x = a2.x - a3.x;
A->v1.y = a2.y - a3.y;
A->v1.z = a2.z - a3.z;
float3 cp;
CROSS_PRODUCT(A->v0, A->v1, cp);
float rp = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
rp = max(sqrtf(rp), 1.0e-06f);
float r21 = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
float r23 = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
float dot = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
float cosine = max(-1.0f, min(1.0f, dot / sqrtf(r21 * r23)));
float angle_energy;
/* E */ GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy);
energy += 0.5f*angle_energy;
float dEdR;
GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
//printf("%11.4f %11.4f\n", cosine, dEdR);
float termA = dEdR / (r21 * rp);
float termC = -dEdR / (r23 * rp);
float3 c21;
float3 c23;
CROSS_PRODUCT(A->v0, cp, c21);
CROSS_PRODUCT(A->v1, cp, c23);
c21.x *= termA;
c21.y *= termA;
c21.z *= termA;
c23.x *= termC;
c23.y *= termC;
c23.z *= termC;
int2 atom2 = cSim.pBondAngleID2[pos1];
unsigned int offset = atom1.x + atom1.w * cSim.stride;
float4 force = cSim.pForce4[offset];
force.x += c21.x;
force.y += c21.y;
force.z += c21.z;
cSim.pForce4[offset] = force;
offset = atom1.y + atom2.x * cSim.stride;
force = cSim.pForce4[offset];
force.x -= (c21.x + c23.x);
force.y -= (c21.y + c23.y);
force.z -= (c21.z + c23.z);
cSim.pForce4[offset] = force;
offset = atom1.z + atom2.y * cSim.stride;
force = cSim.pForce4[offset];
force.x += c23.x;
force.y += c23.y;
force.z += c23.z;
cSim.pForce4[offset] = force;
}
pos += blockDim.x * gridDim.x;
}
while (pos < cSim.dihedral_offset)
{
unsigned int pos1 = pos - cSim.bond_angle_offset;
if (pos1 < cSim.dihedrals)
{
int4 atom1 = cSim.pDihedralID1[pos1];
float4 atomA = cSim.pPosq[atom1.x];
float4 atomB = cSim.pPosq[atom1.y];
float4 atomC = cSim.pPosq[atom1.z];
float4 atomD = cSim.pPosq[atom1.w];
A->v0.x = atomA.x - atomB.x;
A->v0.y = atomA.y - atomB.y;
A->v0.z = atomA.z - atomB.z;
A->v1.x = atomC.x - atomB.x;
A->v1.y = atomC.y - atomB.y;
A->v1.z = atomC.z - atomB.z;
A->v2.x = atomC.x - atomD.x;
A->v2.y = atomC.y - atomD.y;
A->v2.z = atomC.z - atomD.z;
float3 cp0, cp1;
float dihedralAngle;
GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
float4 dihedral = cSim.pDihedralParameter[pos1];
float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * LOCAL_HACK_PI / 180.0f);
// ATTENTION: This section leads to a divergent deltaAngle values wrt
// forces and energies. We separate the case dihedral.z = n = 0, which
// is treated by the calculation of energies via a harmonic potential
/* E */ if (dihedral.z) energy += dihedral.x * (1.0f + cosf(deltaAngle));
/* E */ else
{
float deltaAngle = dihedralAngle - dihedral.y;
if (deltaAngle < -LOCAL_HACK_PI) deltaAngle += 2.0f * LOCAL_HACK_PI;
else if (deltaAngle > LOCAL_HACK_PI) deltaAngle -= 2.0f * LOCAL_HACK_PI;
energy += dihedral.x * deltaAngle * deltaAngle;
}
float sinDeltaAngle = sinf(deltaAngle);
float dEdAngle = -dihedral.x * dihedral.z * sinDeltaAngle;
float normCross1 = DOT3(cp0, cp0);
float normBC = sqrtf(DOT3(A->v1, A->v1));
float4 ff;
ff.x = (-dEdAngle * normBC) / normCross1;
float normCross2 = DOT3(cp1, cp1);
ff.w = (dEdAngle * normBC) / normCross2;
float dp = 1.0f / DOT3(A->v1, A->v1);
ff.y = DOT3(A->v0, A->v1) * dp;
ff.z = DOT3(A->v2, A->v1) * dp;
int4 atom2 = cSim.pDihedralID2[pos1];
float3 internalF0;
float3 internalF3;
float3 s;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned int offset = atom1.x + atom2.x * cSim.stride;
float4 force = cSim.pForce4[offset];
internalF0.x = ff.x * cp0.x;
force.x += internalF0.x;
internalF0.y = ff.x * cp0.y;
force.y += internalF0.y;
internalF0.z = ff.x * cp0.z;
force.z += internalF0.z;
cSim.pForce4[offset] = force;
//printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset = atom1.w + atom2.w * cSim.stride;
force = cSim.pForce4[offset];
internalF3.x = ff.w * cp1.x;
force.x += internalF3.x;
internalF3.y = ff.w * cp1.y;
force.y += internalF3.y;
internalF3.z = ff.w * cp1.z;
force.z += internalF3.z;
cSim.pForce4[offset] = force;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s.x = ff.y * internalF0.x - ff.z * internalF3.x;
s.y = ff.y * internalF0.y - ff.z * internalF3.y;
s.z = ff.y * internalF0.z - ff.z * internalF3.z;
offset = atom1.y + atom2.y * cSim.stride;
force = cSim.pForce4[offset];
force.x += -internalF0.x + s.x;
force.y += -internalF0.y + s.y;
force.z += -internalF0.z + s.z;
cSim.pForce4[offset] = force;
//printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset = atom1.z + atom2.z * cSim.stride;
force = cSim.pForce4[offset];
force.x += -internalF3.x - s.x;
force.y += -internalF3.y - s.y;
force.z += -internalF3.z - s.z;
cSim.pForce4[offset] = force;
//printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos += blockDim.x * gridDim.x;
}
// Ryckaert Bellemans dihedrals
while (pos < cSim.rb_dihedral_offset)
{
unsigned int pos1 = pos - cSim.dihedral_offset;
if (pos1 < cSim.rb_dihedrals)
{
int4 atom1 = cSim.pRbDihedralID1[pos1];
float4 atomA = cSim.pPosq[atom1.x];
float4 atomB = cSim.pPosq[atom1.y];
float4 atomC = cSim.pPosq[atom1.z];
float4 atomD = cSim.pPosq[atom1.w];
A->v0.x = atomA.x - atomB.x;
A->v0.y = atomA.y - atomB.y;
A->v0.z = atomA.z - atomB.z;
A->v1.x = atomC.x - atomB.x;
A->v1.y = atomC.y - atomB.y;
A->v1.z = atomC.z - atomB.z;
A->v2.x = atomC.x - atomD.x;
A->v2.y = atomC.y - atomD.y;
A->v2.z = atomC.z - atomD.z;
float3 cp0, cp1;
float dihedralAngle, cosPhi;
// printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
// printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
// printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
if (dihedralAngle < 0.0f )
{
dihedralAngle += LOCAL_HACK_PI;
}
else
{
dihedralAngle -= LOCAL_HACK_PI;
}
cosPhi = -cosPhi;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
float4 dihedral1 = cSim.pRbDihedralParameter1[pos1];
float2 dihedral2 = cSim.pRbDihedralParameter2[pos1];
float cosFactor = cosPhi;
float dEdAngle = -dihedral1.y;
/* E */ float rb_energy = dihedral1.x;
rb_energy += dihedral1.y * cosFactor;
// printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
dEdAngle -= 2.0f * dihedral1.z * cosFactor;
// printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 3.0f * dihedral1.w * cosFactor;
rb_energy += dihedral1.z * cosFactor;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 4.0f * dihedral2.x * cosFactor;
rb_energy += dihedral1.w * cosFactor;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 5.0f * dihedral2.y * cosFactor;
rb_energy += dihedral2.x * cosFactor;
rb_energy += dihedral2.y * cosFactor * cosPhi;
/* E */ energy += rb_energy;
// printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
dEdAngle *= sinf(dihedralAngle);
// printf("%4d - f: %9.4f\n", pos1, dEdAngle);
float normCross1 = DOT3(cp0, cp0);
float normBC = sqrtf(DOT3(A->v1, A->v1));
float4 ff;
ff.x = (-dEdAngle * normBC) / normCross1;
float normCross2 = DOT3(cp1, cp1);
ff.w = (dEdAngle * normBC) / normCross2;
float dp = 1.0f / DOT3(A->v1, A->v1);
ff.y = DOT3(A->v0, A->v1) * dp;
ff.z = DOT3(A->v2, A->v1) * dp;
int4 atom2 = cSim.pRbDihedralID2[pos1];
float3 internalF0;
float3 internalF3;
float3 s;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned int offset = atom1.x + atom2.x * cSim.stride;
float4 force = cSim.pForce4[offset];
internalF0.x = ff.x * cp0.x;
force.x += internalF0.x;
internalF0.y = ff.x * cp0.y;
force.y += internalF0.y;
internalF0.z = ff.x * cp0.z;
force.z += internalF0.z;
cSim.pForce4[offset] = force;
// printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset = atom1.w + atom2.w * cSim.stride;
force = cSim.pForce4[offset];
internalF3.x = ff.w * cp1.x;
force.x += internalF3.x;
internalF3.y = ff.w * cp1.y;
force.y += internalF3.y;
internalF3.z = ff.w * cp1.z;
force.z += internalF3.z;
cSim.pForce4[offset] = force;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s.x = ff.y * internalF0.x - ff.z * internalF3.x;
s.y = ff.y * internalF0.y - ff.z * internalF3.y;
s.z = ff.y * internalF0.z - ff.z * internalF3.z;
offset = atom1.y + atom2.y * cSim.stride;
force = cSim.pForce4[offset];
force.x += -internalF0.x + s.x;
force.y += -internalF0.y + s.y;
force.z += -internalF0.z + s.z;
cSim.pForce4[offset] = force;
// printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset = atom1.z + atom2.z * cSim.stride;
force = cSim.pForce4[offset];
force.x += -internalF3.x - s.x;
force.y += -internalF3.y - s.y;
force.z += -internalF3.z - s.z;
cSim.pForce4[offset] = force;
// printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos += blockDim.x * gridDim.x;
}
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrtf(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
/* E */
energy += LJ14.x * (sig6 - 1.0f) * sig6;
energy += LJ14.z * inverseR;
dEdR += LJ14.z * inverseR;
dEdR *= inverseR * inverseR;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
}
void kCalculateLocalForces(gpuContext gpu)
{
// printf("kCalculateLocalForces\n");
kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
LAUNCHERROR("kCalculateLocalForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float sum;
float padding;
};
static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
__global__
__launch_bounds__(384, 1)
void kReduceObcGbsaBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
while (pos < cSim.atoms)
{
float sum = 0.0f;
float* pSt = cSim.pBornSum + pos;
float2 atom = cSim.pObcData[pos];
// Get summed Born data
for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
{
sum += *pSt;
pSt += cSim.stride;
}
// Now calculate Born radius and OBC term.
sum *= 0.5f * atom.x;
float sum2 = sum * sum;
float sum3 = sum * sum2;
float tanhSum = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
float nonOffsetRadii = atom.x + cSim.dielectricOffset;
float bornRadius = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii);
float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;
cSim.pBornRadii[pos] = bornRadius;
cSim.pObcChain[pos] = obcChain;
pos += gridDim.x * blockDim.x;
}
}
void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu)
{
kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
gpu->bRecalculateBornRadii = false;
LAUNCHERROR("kReduceObcGbsaBornSum");
}
void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log)
{
gpu->psObcData->Download();
gpu->psBornRadii->Download();
gpu->psObcChain->Download();
gpu->psBornForce->Download();
gpu->psPosq4->Download();
gpu->psSigEps2->Download();
(void) fprintf( log, "kPrintObc Cuda bCh bR bF prm[2] sigeps[2]\n" );
(void) fprintf( stderr, "bOutputWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
static_cast<unsigned int>(sizeof(Atom)*gpu->sim.nonbond_threads_per_block) );
for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n", ii,
gpu->psObcChain->_pSysData[ii],
gpu->psBornRadii->_pSysData[ii],
gpu->psBornForce->_pSysData[ii],
gpu->psObcData->_pSysData[ii].x,
gpu->psObcData->_pSysData[ii].y,
gpu->psSigEps2->_pSysData[ii].x,
gpu->psSigEps2->_pSysData[ii].y );
}
}
void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu)
{
// printf("kCalculateObcgbsaBornSum\n");
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
break;
case CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break;
case PERIODIC:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break;
}
LAUNCHERROR("kCalculateBornSum");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for calculating Born sums. It is included
* several times in kCalculateObcGbsaBornSum.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float dx;
float dy;
float dz;
float r2;
float r;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
volatile Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#if defined USE_CUTOFF
if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && x+j < cSim.atoms )
#endif
{
r = sqrtf(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[j].sr;
if ((j != tgx) && (ar.x < rScaledRadiusJ))
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
apos.w += l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[j].sr;
if (ar.x < (rj - r))
{
apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
}
}
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
#endif
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sum = apos.w = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+tj < cSim.atoms)
#endif
{
r = sqrtf(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[tj].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
(l_ij2 - u_ij2);
float srj = psA[tj].sr;
if (ar.x < (srj - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[tj].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[tj].r;
if (rj < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
}
psA[tj].sum += term;
}
}
tj = (tj - 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
tempBuffer[threadIdx.x] = 0.0f;
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+j < cSim.atoms)
#endif
{
r = sqrtf(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[j].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
float srj = psA[j].sr;
if (ar.x < (srj - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[j].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[j].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[j].r;
if (rj < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[j].r) - l_ij);
}
tempBuffer[threadIdx.x] = term;
}
}
// Sum the terms.
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
}
#endif
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
offset = y + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += sA[threadIdx.x].sum;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
#endif
}
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudaKernels.h"
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float fx;
float fy;
float fz;
float fb;
};
static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaForces2.h"
void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu)
{
//printf("kCalculateObcGbsaForces2\n");
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
break;
case CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break;
case PERIODIC:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break;
}
LAUNCHERROR("kCalculateObcGbsaForces2");
//kPrintObc( gpu, "Post kCalculateObcGbsaForces2", 0, stderr );
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evalauating the second stage of GBSA. It is included
* several times in kCalculateObcGbsaForces2.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
{
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
volatile float3* tempBuffer = (volatile float3*) &sA[cSim.bornForce2_threads_per_block];
#endif
unsigned int lasty = -0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
volatile Atom* psA = &sA[tbx];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = logf(u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
#if defined USE_PERIODIC
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
#else
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
}
else
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
}
float sr2 = a.y * a.y;
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
// Interleaved Atom I and J Born Forces and sum components
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rScaledRadiusI = r + a.y;
float rInverse = 1.0f / r;
float l_ijJ = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float l_ijI = 1.0f / max(psA[tj].r, fabs(r - a.y));
float u_ijJ = 1.0f / rScaledRadiusJ;
float u_ijI = 1.0f / rScaledRadiusI;
float l_ij2J = l_ijJ * l_ijJ;
float l_ij2I = l_ijI * l_ijI;
float u_ij2J = u_ijJ * u_ijJ;
float u_ij2I = u_ijI * u_ijI;
float t1J = logf(u_ijJ / l_ijJ);
float t1I = logf(u_ijI / l_ijI);
float t2J = (l_ij2J - u_ij2J);
float t2I = (l_ij2I - u_ij2I);
float t3J = t2J * rInverse;
float t3I = t2I * rInverse;
t1J *= rInverse;
t1I *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr * psA[tj].sr * r2Inverse) * t3J +
0.250f * t1J * r2Inverse;
float dE = fb * term;
#if defined USE_PERIODIC
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
#else
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// Atom J Born sum term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3I +
0.250f * t1I * r2Inverse;
dE = psA[tj].fb * term;
float rj = psA[tj].r;
#ifdef USE_PERIODIC
if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
#else
if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
// Interleaved Atom I and J Born Forces and sum components
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[j].sr;
float rScaledRadiusI = r + a.y;
float rInverse = 1.0f / r;
float l_ijJ = 1.0f / max(a.x, fabs(r - psA[j].sr));
float l_ijI = 1.0f / max(psA[j].r, fabs(r - a.y));
float u_ijJ = 1.0f / rScaledRadiusJ;
float u_ijI = 1.0f / rScaledRadiusI;
float l_ij2J = l_ijJ * l_ijJ;
float l_ij2I = l_ijI * l_ijI;
float u_ij2J = u_ijJ * u_ijJ;
float u_ij2I = u_ijI * u_ijI;
float t1J = logf(u_ijJ / l_ijJ);
float t1I = logf(u_ijI / l_ijI);
float t2J = (l_ij2J - u_ij2J);
float t2I = (l_ij2I - u_ij2I);
float t3J = t2J * rInverse;
float t3I = t2I * rInverse;
t1J *= rInverse;
t1I *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3J +
0.250f * t1J * r2Inverse;
float dE = fb * term;
#if defined USE_PERIODIC
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
#else
if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+j >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
tempBuffer[threadIdx.x].x = d;
d = dy * dE;
af.y -= d;
tempBuffer[threadIdx.x].y = d;
d = dz * dE;
af.z -= d;
tempBuffer[threadIdx.x].z = d;
// Atom J Born sum term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3I +
0.250f * t1I * r2Inverse;
dE = psA[j].fb * term;
float rj = psA[j].r;
#ifdef USE_PERIODIC
if (rj >= rScaledRadiusI || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#elif defined USE_CUTOFF
if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
#else
if (rj >= rScaledRadiusI)
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
tempBuffer[threadIdx.x].x += dx;
tempBuffer[threadIdx.x].y += dy;
tempBuffer[threadIdx.x].z += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
}
#endif
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
}
lasty = y;
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "bbsort.h"
#include <cuda.h>
using namespace std;
static __constant__ cudaGmxSimulation cSim;
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
void SetCalculatePMESim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCalculatePMESim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
texture<float4, 1, cudaReadModeElementType> bsplineThetaRef;
inline __host__ __device__ int fast_mod(int a, int b)
{
return (b & (b - 1)) ? a % b : a & (b - 1);
}
inline __host__ __device__ float4 make_float4(float s)
{
return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 operator-(float4 &a)
{
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float b)
{
return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
}
inline __host__ __device__ float4 operator+(float a, float4 b)
{
return make_float4(a + b.x, a + b.y, a + b.z, a + b.w);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float4 a, float4 b)
{
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w + b.w);
}
inline __host__ __device__ float4 make_float4(int3 a)
{
return make_float4((float) a.x, (float) a.y, (float) a.z, 0);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kUpdateBsplines_kernel()
{
unsigned int tnb = blockDim.x * gridDim.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ float4 bsplines_cache[]; // size = 2 * block_size * pme_order
const float4 div_o = make_float4(1.0f/(PME_ORDER - 1));
for (int i = tid; i < cSim.atoms; i += tnb)
{
float4* data = &bsplines_cache[threadIdx.x*PME_ORDER];
float4* ddata = &bsplines_cache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
for (int j = 0; j < PME_ORDER; j++)
{
data[j] = make_float4(0.0f);
ddata[j] = make_float4(0.0f);
}
float4 posq = cSim.pPosq[i];
posq.x -= floorf(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
posq.y -= floorf(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
posq.z -= floorf(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
(posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
float4 dr = make_float4(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
((int) t.y) % cSim.pmeGridSize.y,
((int) t.z) % cSim.pmeGridSize.z);
cSim.pPmeAtomGridIndex[i] = make_int2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
data[PME_ORDER - 1] = make_float4(0.0f);
data[1] = dr;
data[0] = make_float4(1.0f) - dr;
for (int j = 3; j < PME_ORDER; j++)
{
float div = 1.0f / ((float)j - 1.0f);
data[j - 1] = div * dr * data[j - 2];
for (int k = 1; k < (j - 1); k++)
{
data[j - k - 1] =
div * (
(dr + float(k)) * data[j - k - 2] +
(-dr + ((float)(j - k))) * data[j - k - 1]);
}
data[0] = div * (- dr + 1) * data[0];
}
ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++)
ddata[j] = data[j - 1] - data[j];
data[PME_ORDER - 1] = div_o * dr * data[PME_ORDER - 2];
for (int j = 1; j < (PME_ORDER - 1); j++)
{
data[PME_ORDER - j - 1] =
div_o * (
(dr + (float)j) * data[PME_ORDER - j - 2] +
(-dr + ((float)(PME_ORDER - j))) * data[PME_ORDER - j - 1]
);
}
data[0] = div_o * (-dr + 1.0f) * data[0];
for (int j = 0; j < PME_ORDER; j++)
{
data[j].w = posq.w; // Storing the charge here improves cache coherency in the charge spreading kernel
cSim.pPmeBsplineTheta[i + j*cSim.atoms] = data[j];
cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j];
}
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kFindAtomRangeForGrid_kernel()
{
int thread = blockIdx.x*blockDim.x+threadIdx.x;
int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i)
{
int2 atomData = cSim.pPmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last)
{
for (int j = last+1; j <= gridIndex; ++j)
cSim.pPmeAtomRange[j] = i;
last = gridIndex;
}
}
// Fill in values beyond the last atom.
if (thread == blockDim.x*gridDim.x-1)
{
int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
for (int j = last+1; j <= gridSize; ++j)
cSim.pPmeAtomRange[j] = cSim.atoms;
}
}
/**
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kRecordZIndex_kernel()
{
int thread = blockIdx.x*blockDim.x+threadIdx.x;
int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
for (int i = start; i < end; ++i)
{
int2 atomData = cSim.pPmeAtomGridIndex[i];
float posz = cSim.pPosq[atomData.x].z;
posz -= floorf(posz*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
int z = ((int) ((posz*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z)) % cSim.pmeGridSize.z;
cSim.pPmeAtomGridIndex[i].y = z;
}
}
__global__
void kGridSpreadCharge_kernel()
{
unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
unsigned int numThreads = gridDim.x*blockDim.x;
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
{
int3 gridPoint;
gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
gridPoint.y = remainder/cSim.pmeGridSize.z;
gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
float result = 0.0f;
for (int ix = 0; ix < PME_ORDER; ++ix)
{
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : cSim.pmeGridSize.x);
for (int iy = 0; iy < PME_ORDER; ++iy)
{
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : cSim.pmeGridSize.y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : cSim.pmeGridSize.z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : cSim.pmeGridSize.z-1);
int gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z1;
int gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z2;
int firstAtom = cSim.pPmeAtomRange[gridIndex1];
int lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
}
if (z1 > gridPoint.z)
{
gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z;
gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+gridPoint.z;
firstAtom = cSim.pPmeAtomRange[gridIndex1];
lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
}
}
}
}
cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrtf(cSim.epsfac), 0.0f);
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kReciprocalConvolution_kernel()
{
const unsigned int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
float expFactor = LOCAL_HACK_PI*LOCAL_HACK_PI/(cSim.alphaEwald*cSim.alphaEwald);
float scaleFactor = 1.0/(LOCAL_HACK_PI*cSim.periodicBoxSizeX*cSim.periodicBoxSizeY*cSim.periodicBoxSizeZ);
float energy = 0.0f;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
{
int kx = index/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
int remainder = index-kx*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
int ky = remainder/cSim.pmeGridSize.z;
int kz = remainder-ky*cSim.pmeGridSize.z;
if (kx == 0 && ky == 0 && kz == 0)
continue;
int mx = (kx < (cSim.pmeGridSize.x+1)/2) ? kx : (kx-cSim.pmeGridSize.x);
int my = (ky < (cSim.pmeGridSize.y+1)/2) ? ky : (ky-cSim.pmeGridSize.y);
int mz = (kz < (cSim.pmeGridSize.z+1)/2) ? kz : (kz-cSim.pmeGridSize.z);
float mhx = mx*cSim.invPeriodicBoxSizeX;
float mhy = my*cSim.invPeriodicBoxSizeY;
float mhz = mz*cSim.invPeriodicBoxSizeZ;
float bx = cSim.pPmeBsplineModuli[0][kx];
float by = cSim.pPmeBsplineModuli[1][ky];
float bz = cSim.pPmeBsplineModuli[2][kz];
cuComplex grid = cSim.pPmeGrid[index];
float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
float denom = m2*bx*by*bz;
float eterm = scaleFactor*exp(-expFactor*m2)/denom;
cSim.pPmeGrid[index] = make_cuComplex(grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kGridInterpolateForce_kernel()
{
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < cSim.atoms; atom += blockDim.x*gridDim.x)
{
float3 force = make_float3(0.0f, 0.0f, 0.0f);
float4 posq = cSim.pPosq[atom];
posq.x -= floorf(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
posq.y -= floorf(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
posq.z -= floorf(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
(posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
((int) t.y) % cSim.pmeGridSize.y,
((int) t.z) % cSim.pmeGridSize.z);
for (int ix = 0; ix < PME_ORDER; ix++)
{
int xindex = gridIndex.x+ix;
xindex -= (xindex >= cSim.pmeGridSize.x ? cSim.pmeGridSize.x : 0);
float tx = cSim.pPmeBsplineTheta[atom+ix*cSim.atoms].x;
float dtx = cSim.pPmeBsplineDtheta[atom+ix*cSim.atoms].x;
for (int iy = 0; iy < PME_ORDER; iy++)
{
int yindex = gridIndex.y+iy;
yindex -= (yindex >= cSim.pmeGridSize.y ? cSim.pmeGridSize.y : 0);
float ty = cSim.pPmeBsplineTheta[atom+iy*cSim.atoms].y;
float dty = cSim.pPmeBsplineDtheta[atom+iy*cSim.atoms].y;
for (int iz = 0; iz < PME_ORDER; iz++)
{
int zindex = gridIndex.z+iz;
zindex -= (zindex >= cSim.pmeGridSize.z ? cSim.pmeGridSize.z : 0);
float tz = cSim.pPmeBsplineTheta[atom+iz*cSim.atoms].z;
float dtz = cSim.pPmeBsplineDtheta[atom+iz*cSim.atoms].z;
int index = xindex*cSim.pmeGridSize.y*cSim.pmeGridSize.z + yindex*cSim.pmeGridSize.z + zindex;
float gridvalue = cSim.pPmeGrid[index].x;
force.x += dtx*ty*tz*gridvalue;
force.y += tx*dty*tz*gridvalue;
force.z += tx*ty*dtz*gridvalue;
}
}
}
float4 totalForce = cSim.pForce4[atom];
float q = posq.w*sqrtf(cSim.epsfac);
totalForce.x -= q*force.x*cSim.pmeGridSize.x*cSim.invPeriodicBoxSizeX;
totalForce.y -= q*force.y*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
totalForce.z -= q*force.z*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
cSim.pForce4[atom] = totalForce;
}
}
void kCalculatePME(gpuContext gpu)
{
// printf("kCalculatePME\n");
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
cudaBindTexture(NULL, &bsplineThetaRef, gpu->psPmeBsplineTheta->_pDevData, &channelDesc, gpu->psPmeBsplineTheta->_length*sizeof(float4));
unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
LAUNCHERROR("kUpdateBsplines");
bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kFindAtomRangeForGrid");
kRecordZIndex_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kRecordZIndex");
kGridSpreadCharge_kernel<<<16*gpu->sim.blocks, 64>>>();
LAUNCHERROR("kGridSpreadCharge");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kReciprocalConvolution");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
kGridInterpolateForce_kernel<<<2*gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kGridInterpolateForce");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the routine for evaluating a custom expression.
*/
static __constant__ float globalParams[8];
texture<float4, 1, cudaReadModeElementType> texRef0;
texture<float4, 1, cudaReadModeElementType> texRef1;
texture<float4, 1, cudaReadModeElementType> texRef2;
texture<float4, 1, cudaReadModeElementType> texRef3;
#define STACK(y) stack[(y)*blockDim.x+threadIdx.x]
#define VARIABLE(y) variables[(y)*blockDim.x+threadIdx.x]
template<int SIZE>
__device__ float kEvaluateExpression_kernel(Expression<SIZE>* expression, float* stack, float* variables)
{
int stackPointer = -1;
for (int i = 0; i < expression->length; i++)
{
int op = expression->op[i];
if (op < MULTIPLY) {
STACK(++stackPointer) = VARIABLE(op-VARIABLE0);
}
else if (op < NEGATE) {
if (op < MULTIPLY_CONSTANT) {
if (op == MULTIPLY) {
float temp = STACK(stackPointer);
STACK(--stackPointer) *= temp;
}
else if (op == DIVIDE) {
float temp = STACK(stackPointer);
STACK(stackPointer) = temp/STACK(--stackPointer);
}
else if (op == ADD) {
float temp = STACK(stackPointer);
STACK(--stackPointer) += temp;
}
else if (op == SUBTRACT) {
float temp = STACK(stackPointer);
STACK(stackPointer) = temp-STACK(--stackPointer);
}
else /*if (op == POWER)*/ {
float temp = STACK(stackPointer);
STACK(stackPointer) = pow(temp, STACK(--stackPointer));
}
}
else if (op < GLOBAL) {
if (op == MULTIPLY_CONSTANT) {
STACK(stackPointer) *= expression->arg[i];
}
else if (op == POWER_CONSTANT) {
STACK(stackPointer) = pow(STACK(stackPointer), expression->arg[i]);
}
else /*if (op == ADD_CONSTANT)*/ {
STACK(stackPointer) += expression->arg[i];
}
}
else {
if (op == GLOBAL) {
STACK(++stackPointer) = globalParams[(int) expression->arg[i]];
}
else if (op == CONSTANT) {
STACK(++stackPointer) = expression->arg[i];
}
else /*if (op == CUSTOM || op == CUSTOM_DERIV)*/ {
int function = (int) expression->arg[i];
float x = STACK(stackPointer);
float4 params = cSim.pTabulatedFunctionParams[function];
if (x < params.x || x > params.y)
STACK(stackPointer) = 0.0f;
else
{
x = (x-params.x)*params.z;
int index = floor(x);
index = min(index, (int) params.w);
float4 coeff;
if (function == 0)
coeff = tex1Dfetch(texRef0, index);
else if (function == 1)
coeff = tex1Dfetch(texRef1, index);
else if (function == 2)
coeff = tex1Dfetch(texRef2, index);
else
coeff = tex1Dfetch(texRef3, index);
float b = x-index;
float a = 1.0f-b;
if (op == CUSTOM)
STACK(stackPointer) = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);
else
STACK(stackPointer) = (coeff.y-coeff.x)*params.z+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/params.z;
}
}
}
}
else {
if (op < SIN) {
if (op == NEGATE) {
STACK(stackPointer) *= -1.0f;
}
else if (op == RECIPROCAL) {
STACK(stackPointer) = 1.0f/STACK(stackPointer);
}
else if (op == SQRT) {
STACK(stackPointer) = sqrt(STACK(stackPointer));
}
else if (op == EXP) {
STACK(stackPointer) = exp(STACK(stackPointer));
}
else if (op == LOG) {
STACK(stackPointer) = log(STACK(stackPointer));
}
else if (op == SQUARE) {
float temp = STACK(stackPointer);
STACK(stackPointer) *= temp;
}
else if (op == CUBE) {
float temp = STACK(stackPointer);
STACK(stackPointer) *= temp*temp;
}
else /*if (op == STEP)*/ {
STACK(stackPointer) = (STACK(stackPointer) >= 0.0f ? 1.0f : 0.0f);
}
}
else {
if (op == SIN) {
STACK(stackPointer) = sin(STACK(stackPointer));
}
else if (op == COS) {
STACK(stackPointer) = cos(STACK(stackPointer));
}
else if (op == SEC) {
STACK(stackPointer) = 1.0f/cos(STACK(stackPointer));
}
else if (op == CSC) {
STACK(stackPointer) = 1.0f/sin(STACK(stackPointer));
}
else if (op == TAN) {
STACK(stackPointer) = tan(STACK(stackPointer));
}
else if (op == COT) {
STACK(stackPointer) = 1.0f/tan(STACK(stackPointer));
}
else if (op == ASIN) {
STACK(stackPointer) = asin(STACK(stackPointer));
}
else if (op == ACOS) {
STACK(stackPointer) = acos(STACK(stackPointer));
}
else if (op == ATAN) {
STACK(stackPointer) = atan(STACK(stackPointer));
}
else if (op == SINH) {
STACK(stackPointer) = sinh(STACK(stackPointer));
}
else if (op == COSH) {
STACK(stackPointer) = cosh(STACK(stackPointer));
}
else if (op == TANH) {
STACK(stackPointer) = tanh(STACK(stackPointer));
}
else if (op == ERF) {
STACK(stackPointer) = erf(STACK(stackPointer));
}
else if (op == ERFC) {
STACK(stackPointer) = erfc(STACK(stackPointer));
}
else if (op == MIN) {
float temp = STACK(stackPointer);
STACK(stackPointer) = min(temp, STACK(--stackPointer));
}
else if (op == MAX) {
float temp = STACK(stackPointer);
STACK(stackPointer) = max(temp, STACK(--stackPointer));
}
else /*if (op == ABS)*/ {
STACK(stackPointer) = fabs(STACK(stackPointer));
}
}
}
}
return STACK(stackPointer);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for identifying interacting blocks. It is included
* several times in kCalculateCDLJForces.cu with different #defines to generate
* different versions of the kernels.
*/
/**
* Find a bounding box for the atoms in each block.
*/
__global__ void METHOD_NAME(kFindBlockBounds, _kernel)()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int base = pos << GRIDBITS;
if (base < cSim.atoms)
{
float4 apos = cSim.pPosq[base];
#ifdef USE_PERIODIC
apos.x -= floorf(apos.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
apos.y -= floorf(apos.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
apos.z -= floorf(apos.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
float4 firstPoint = apos;
#endif
float minx = apos.x;
float maxx = apos.x;
float miny = apos.y;
float maxy = apos.y;
float minz = apos.z;
float maxz = apos.z;
for (unsigned int i = 1; i < GRID; i++)
{
apos = cSim.pPosq[base+i];
#ifdef USE_PERIODIC
apos.x -= floorf((apos.x-firstPoint.x)*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
apos.y -= floorf((apos.y-firstPoint.y)*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
apos.z -= floorf((apos.z-firstPoint.z)*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
minx = min(minx, apos.x);
maxx = max(maxx, apos.x);
miny = min(miny, apos.y);
maxy = max(maxy, apos.y);
minz = min(minz, apos.z);
maxz = max(maxz, apos.z);
}
cSim.pGridBoundingBox[pos] = make_float4(0.5f*(maxx-minx), 0.5f*(maxy-miny), 0.5f*(maxz-minz), 0);
cSim.pGridCenter[pos] = make_float4(0.5f*(maxx+minx), 0.5f*(maxy+miny), 0.5f*(maxz+minz), 0);
}
}
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.workUnits)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = cSim.pWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff);
x = (x >> 17);
// Find the distance between the bounding boxes of the two cells.
float4 centera = cSim.pGridCenter[x];
float4 centerb = cSim.pGridCenter[y];
float dx = centera.x-centerb.x;
float dy = centera.y-centerb.y;
float dz = centera.z-centerb.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float4 boxSizea = cSim.pGridBoundingBox[x];
float4 boxSizeb = cSim.pGridBoundingBox[y];
dx = max(0.0f, abs(dx)-boxSizea.x-boxSizeb.x);
dy = max(0.0f, abs(dy)-boxSizea.y-boxSizeb.y);
dz = max(0.0f, abs(dz)-boxSizea.z-boxSizeb.z);
cSim.pInteractionFlag[pos] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1);
pos += gridDim.x*blockDim.x;
}
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit)
{
extern __shared__ volatile unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int index = threadIdx.x & (GRID - 1);
unsigned int lasty = 0xFFFFFFFF;
float4 apos;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff);
bool bExclusionFlag = (x & 0x1);
x = (x >> 17);
if (x == y || bExclusionFlag)
{
// Assume this block will be dense.
if (index == 0)
cSim.pInteractionFlag[pos] = 0xFFFFFFFF;
}
else
{
// Load the bounding box for x and the atom positions for y.
float4 center = cSim.pGridCenter[x];
float4 boxSize = cSim.pGridBoundingBox[x];
if (y != lasty)
{
apos = cSim.pPosq[(y<<GRIDBITS)+index];
}
// Find the distance of the atom from the bounding box.
float dx = apos.x-center.x;
float dy = apos.y-center.y;
float dz = apos.z-center.z;
#ifdef USE_PERIODIC
dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
dx = max(0.0f, abs(dx)-boxSize.x);
dy = max(0.0f, abs(dy)-boxSize.y);
dz = max(0.0f, abs(dz)-boxSize.z);
flags[threadIdx.x] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1 << index);
// Sum the flags.
if (index % 2 == 0)
flags[threadIdx.x] += flags[threadIdx.x+1];
if (index % 4 == 0)
flags[threadIdx.x] += flags[threadIdx.x+2];
if (index % 8 == 0)
flags[threadIdx.x] += flags[threadIdx.x+4];
if (index % 16 == 0)
flags[threadIdx.x] += flags[threadIdx.x+8];
if (index == 0)
{
unsigned int allFlags = flags[threadIdx.x] + flags[threadIdx.x+16];
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
cSim.pInteractionFlag[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
}
lasty = y;
}
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#define FABS(a) ((a) > 0.0f ? (a) : -(a))
static __constant__ cudaGmxSimulation cSim;
void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetForcesSim copy to cSim failed");
}
void GetForcesSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed");
}
__global__
__launch_bounds__(384, 1)
void kClearForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.outputBuffers)
{
cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x;
}
}
void OPENMMCUDA_EXPORT kClearForces(gpuContext gpu)
{
// printf("kClearForces\n");
kClearForces_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearForces");
}
__global__
__launch_bounds__(384, 1)
void kClearBornSumAndForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
cSim.pBornSum[pos] = 0.0f;
cSim.pBornForce[pos] = 0.0f;
cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x;
}
while (pos < cSim.stride * cSim.outputBuffers)
{
cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x;
}
}
void kClearBornSumAndForces(gpuContext gpu)
{
// printf("kClearBornSumAndForces\n");
kClearBornSumAndForces_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearBornSumAndForces");
}
__global__
__launch_bounds__(384, 1)
void kClearEnergy_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.energyOutputBuffers)
{
((float*)cSim.pEnergy)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearEnergy(gpuContext gpu)
{
// printf("kClearEnergy\n");
kClearEnergy_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearEnergy");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceBornSumAndForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
// Reduce forces
while (pos < cSim.stride4)
{
float totalForce = 0.0f;
float* pFt = (float*)cSim.pForce4 + pos;
int i = cSim.outputBuffers;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride4;
float f2 = *pFt;
pFt += cSim.stride4;
float f3 = *pFt;
pFt += cSim.stride4;
float f4 = *pFt;
pFt += cSim.stride4;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride4;
float f2 = *pFt;
pFt += cSim.stride4;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
pFt = (float*)cSim.pForce4 + pos;
*pFt = totalForce;
pos += gridDim.x * blockDim.x;
}
// Reduce Born Sum
while (pos - cSim.stride4 < cSim.atoms)
{
float sum = 0.0f;
float* pSt = cSim.pBornSum + pos - cSim.stride4;
float2 atom = cSim.pObcData[pos - cSim.stride4];
// Get summed Born data
int i = cSim.nonbondOutputBuffers;
while (i >= 4)
{
float f1 = *pSt;
pSt += cSim.stride;
float f2 = *pSt;
pSt += cSim.stride;
float f3 = *pSt;
pSt += cSim.stride;
float f4 = *pSt;
pSt += cSim.stride;
sum += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pSt;
pSt += cSim.stride;
float f2 = *pSt;
pSt += cSim.stride;
sum += f1 + f2;
i -= 2;
}
if (i > 0)
{
sum += *pSt;
}
// Now calculate Born radius and OBC term.
cSim.pBornSum[pos - cSim.stride4] = sum;
sum *= 0.5f * atom.x;
float sum2 = sum * sum;
float sum3 = sum * sum2;
float tanhSum = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
float nonOffsetRadii = atom.x + cSim.dielectricOffset;
float bornRadius = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii);
float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;
cSim.pBornRadii[pos - cSim.stride4] = bornRadius;
cSim.pObcChain[pos - cSim.stride4] = obcChain;
pos += gridDim.x * blockDim.x;
}
}
void kReduceBornSumAndForces(gpuContext gpu)
{
fprintf( stderr, "kReduceBornSumAndForces\n");
kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceBornSumAndForces");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
// Reduce forces
while (pos < cSim.stride4)
{
float totalForce = 0.0f;
float* pFt = (float*)cSim.pForce4 + pos;
int i = cSim.outputBuffers;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride4;
float f2 = *pFt;
pFt += cSim.stride4;
float f3 = *pFt;
pFt += cSim.stride4;
float f4 = *pFt;
pFt += cSim.stride4;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride4;
float f2 = *pFt;
pFt += cSim.stride4;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
pFt = (float*)cSim.pForce4 + pos;
*pFt = totalForce;
pos += gridDim.x * blockDim.x;
}
}
void OPENMMCUDA_EXPORT kReduceForces(gpuContext gpu)
{
// printf("kReduceForces\n");
kReduceForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceForces");
}
double kReduceEnergy(gpuContext gpu)
{
//printf("kReduceEnergy\n");
gpu->psEnergy->Download();
double sum = 0.0;
for (int i = 0; i < static_cast<int>(gpu->sim.energyOutputBuffers); i++){
sum += (*gpu->psEnergy)[i];
}
return sum;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceObcGbsaBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
while (pos < cSim.atoms)
{
float bornRadius = cSim.pBornRadii[pos];
float obcChain = cSim.pObcChain[pos];
float2 obcData = cSim.pObcData[pos];
float totalForce = 0.0f;
float* pFt = cSim.pBornForce + pos;
int i = cSim.nonbondOutputBuffers;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
float f3 = *pFt;
pFt += cSim.stride;
float f4 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
float r = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
float ratio6 = powf((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;
totalForce += saTerm / bornRadius;
totalForce *= bornRadius * bornRadius * obcChain;
energy += saTerm;
pFt = cSim.pBornForce + pos;
*pFt = totalForce;
pos += gridDim.x * blockDim.x;
}
// correct for surface area factor of -6
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVIBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
while (pos < cSim.atoms)
{
float bornRadius = cSim.pBornRadii[pos];
float4 gbviData = cSim.pGBVIData[pos];
float switchDeriv = cSim.pGBVISwitchDerivative[pos];
float totalForce = 0.0f;
float* pFt = cSim.pBornForce + pos;
int i = cSim.nonbondOutputBuffers;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
float f3 = *pFt;
pFt += cSim.stride;
float f4 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
float ratio = (gbviData.x/bornRadius);
float ratio3 = ratio*ratio*ratio;
energy -= gbviData.z*ratio3;
totalForce += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
float br2 = bornRadius*bornRadius;
totalForce *= (1.0f/3.0f)*br2*br2*switchDeriv;
pFt = cSim.pBornForce + pos;
*pFt = totalForce;
pos += gridDim.x * blockDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
}
void kReduceObcGbsaBornForces(gpuContext gpu)
{
if( gpu->bIncludeGBSA ){
kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceObcGbsaBornForces");
} else if( gpu->bIncludeGBVI ){
kReduceGBVIBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceGBVIBornForces");
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
using namespace std;
#include "gputypes.h"
enum {VelScale, ForceScale, NoiseScale, MaxParams};
static __constant__ cudaGmxSimulation cSim;
void SetLangevinUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetLangevinUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernels with and with center of mass motion removal.
#include "kLangevinUpdate.h"
#define REMOVE_CM
#include "kLangevinUpdate.h"
void kLangevinUpdatePart1(gpuContext gpu)
{
// printf("kLangevinUpdatePart1\n");
if (gpu->bRemoveCM)
{
kLangevinUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
LAUNCHERROR("kLangevinUpdatePart1CM");
gpu->bRemoveCM = false;
}
else
{
kLangevinUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kLangevinUpdatePart1");
}
}
extern void kGenerateRandoms(gpuContext gpu);
void kLangevinUpdatePart2(gpuContext gpu)
{
// printf("kLangevinUpdatePart2\n");
if (gpu->bCalculateCM)
{
kLangevinUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
LAUNCHERROR("kLangevinUpdatePart2CM");
gpu->bCalculateCM = false;
gpu->bRemoveCM = true;
}
else
{
kLangevinUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kLangevinUpdatePart2");
}
// Update randoms if necessary
gpu->iterations++;
if (gpu->iterations == gpu->sim.randomIterations)
{
kGenerateRandoms(gpu);
gpu->iterations = 0;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kSelectLangevinStepSize_kernel(float maxStepSize)
{
// Calculate the error.
extern __shared__ float error[];
__shared__ float params[MaxParams];
error[threadIdx.x] = 0.0f;
unsigned int pos = threadIdx.x;
while (pos < cSim.atoms)
{
float4 force = cSim.pForce4[pos];
float invMass = cSim.pVelm4[pos].w;
error[threadIdx.x] += (force.x*force.x + force.y*force.y + force.z*force.z)*invMass;
pos += blockDim.x * gridDim.x;
}
__syncthreads();
// Sum the errors from all threads.
for (int offset = 1; offset < blockDim.x; offset *= 2)
{
if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
error[threadIdx.x] += error[threadIdx.x+offset];
__syncthreads();
}
if (threadIdx.x == 0)
{
// Select the new step size.
float totalError = sqrt(error[0]/(cSim.atoms*3));
float newStepSize = sqrt(cSim.errorTol/totalError);
float oldStepSize = cSim.pStepSize[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
if (newStepSize > maxStepSize)
newStepSize = maxStepSize;
cSim.pStepSize[0].y = newStepSize;
// Recalculate the integration parameters.
float vscale = exp(-newStepSize/cSim.tau);
float fscale = (1-vscale)*cSim.tau;
float noisescale = sqrt(2*cSim.kT/cSim.tau)*sqrt(0.5f*(1-vscale*vscale)*cSim.tau);
params[VelScale] = vscale;
params[ForceScale] = fscale;
params[NoiseScale] = noisescale;
}
__syncthreads();
if (threadIdx.x < MaxParams)
cSim.pLangevinParameters[threadIdx.x] = params[threadIdx.x];
}
void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep)
{
// printf("kSelectLangevinStepSize\n");
kSelectLangevinStepSize_kernel<<<1, gpu->sim.update_threads_per_block, sizeof(float)*gpu->sim.update_threads_per_block>>>(maxTimeStep);
LAUNCHERROR("kSelectLangevinStepSize");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kSetVelocitiesFromPositions_kernel()
{
float2 stepSize = cSim.pStepSize[0];
double oneOverDt = 2.0/(stepSize.x+stepSize.y);
unsigned int pos = threadIdx.x;
while (pos < cSim.atoms)
{
float4 posq = cSim.pPosq[pos];
float4 posqP = cSim.pPosqP[pos];
float4 velm = cSim.pVelm4[pos];
velm.x = (float) (oneOverDt*posqP.x);
velm.y = (float) (oneOverDt*posqP.y);
velm.z = (float) (oneOverDt*posqP.z);
cSim.pVelm4[pos] = velm;
posq.x += posqP.x;
posq.y += posqP.y;
posq.z += posqP.z;
cSim.pPosq[pos] = posq;
pos += blockDim.x * gridDim.x;
}
}
void kSetVelocitiesFromPositions(gpuContext gpu)
{
// printf("kSetVelocitiesFromPositions\n");
kSetVelocitiesFromPositions_kernel<<<1, gpu->sim.update_threads_per_block, sizeof(float)*gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kSetVelocitiesFromPositions");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
/**
* This file contains the kernels for Langevin integration. It is included
* several times in kLangevinUpdate.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
void kLangevinUpdatePart1CM_kernel()
#else
void kLangevinUpdatePart1_kernel()
#endif
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
float vscale = cSim.pLangevinParameters[VelScale];
float fscale = cSim.pLangevinParameters[ForceScale];
float noisescale = cSim.pLangevinParameters[NoiseScale];
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = { 0.0f, 0.0f, 0.0f};
float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f };
// Read CM outputs from previous step
unsigned int cpos = threadIdx.x;
while (cpos < gridDim.x)
{
CM1 = cSim.pLinearMomentum[cpos];
CM.x += CM1.x;
CM.y += CM1.y;
CM.z += CM1.z;
cpos += blockDim.x;
}
sCM[threadIdx.x].x = CM.x;
sCM[threadIdx.x].y = CM.y;
sCM[threadIdx.x].z = CM.z;
__syncthreads();
// Reduce CM
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
#endif
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 random4a = cSim.pRandom4[rpos + pos];
float4 force = cSim.pForce4[pos];
float sqrtInvMass = sqrt(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force.x + noisescale*sqrtInvMass*random4a.x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force.y + noisescale*sqrtInvMass*random4a.y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force.z + noisescale*sqrtInvMass*random4a.z;
#ifdef REMOVE_CM
velocity.x -= sCM[0].x;
velocity.y -= sCM[0].y;
velocity.z -= sCM[0].z;
#endif
cSim.pOldPosq[pos] = cSim.pPosq[pos];
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
void kLangevinUpdatePart2CM_kernel()
#else
void kLangevinUpdatePart2_kernel()
#endif
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
__shared__ float dt;
if (threadIdx.x == 0)
{
dt = cSim.pStepSize[0].y;
if (pos == 0)
cSim.pStepSize[0].x = dt;
}
__syncthreads();
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = {0.0f, 0.0f, 0.0f};
__syncthreads();
#endif
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
#ifdef REMOVE_CM
float mass = 1.0f / velocity.w;
CM.x += mass * velocity.x;
CM.y += mass * velocity.y;
CM.z += mass * velocity.z;
#endif
float4 xPrime = make_float4(dt*velocity.x, dt*velocity.y, dt*velocity.z, cSim.pPosq[pos].w);
cSim.pPosqP[pos] = xPrime;
pos += blockDim.x * gridDim.x;
}
// Update random position pointer
if (threadIdx.x == 0)
{
rpos += cSim.paddedNumberOfAtoms;
if (rpos > cSim.randoms)
rpos -= cSim.randoms;
cSim.pRandomPosition[blockIdx.x] = rpos;
}
#ifdef REMOVE_CM
// Scale CM
CM.x *= cSim.inverseTotalMass;
CM.y *= cSim.inverseTotalMass;
CM.z *= cSim.inverseTotalMass;
sCM[threadIdx.x] = CM;
__syncthreads();
// Reduce CM for CTA
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
if (threadIdx.x == 0)
{
float4 CM;
CM.x = sCM[0].x;
CM.y = sCM[0].y;
CM.z = sCM[0].z;
CM.w = 0.0f;
cSim.pLinearMomentum[blockIdx.x] = CM;
}
#endif
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
__global__ void kScaleAtomCoordinates_kernel(float scale, int numMolecules, float3 periodicBoxSize, float4* posq, int* moleculeAtoms, int* moleculeStartIndex) {
float3 invPeriodicBoxSize = make_float3(1.0f/periodicBoxSize.x, 1.0f/periodicBoxSize.y, 1.0f/periodicBoxSize.z);
for (int index = threadIdx.x+blockIdx.x*blockDim.x; index < numMolecules; index += blockDim.x*gridDim.x) {
int first = moleculeStartIndex[index];
int last = moleculeStartIndex[index+1];
int numAtoms = last-first;
// Find the center of each molecule.
float3 center = make_float3(0, 0, 0);
for (int atom = first; atom < last; atom++) {
float4 pos = posq[moleculeAtoms[atom]];
center.x += pos.x;
center.y += pos.y;
center.z += pos.z;
}
center.x /= (float) numAtoms;
center.y /= (float) numAtoms;
center.z /= (float) numAtoms;
// Move it into the first periodic box.
int xcell = (int) floorf(center.x*invPeriodicBoxSize.x);
int ycell = (int) floorf(center.y*invPeriodicBoxSize.y);
int zcell = (int) floorf(center.z*invPeriodicBoxSize.z);
float3 delta = make_float3(xcell*periodicBoxSize.x, ycell*periodicBoxSize.y, zcell*periodicBoxSize.z);
center.x -= delta.x;
center.y -= delta.y;
center.z -= delta.z;
// Now scale the position of the molecule center.
delta.x = center.x*(scale-1)-delta.x;
delta.y = center.y*(scale-1)-delta.y;
delta.z = center.z*(scale-1)-delta.z;
for (int atom = first; atom < last; atom++) {
float4 pos = posq[moleculeAtoms[atom]];
pos.x += delta.x;
pos.y += delta.y;
pos.z += delta.z;
posq[moleculeAtoms[atom]] = pos;
}
}
}
void kScaleAtomCoordinates(gpuContext gpu, float scale, CUDAStream<int>& moleculeAtoms, CUDAStream<int>& moleculeStartIndex)
{
// printf("kScaleAtomCoordinates\n");
kScaleAtomCoordinates_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(scale, moleculeStartIndex._length-1,
make_float3(gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ), gpu->sim.pPosq,
moleculeAtoms._pDevData, moleculeStartIndex._pDevData);
LAUNCHERROR("kScaleAtomCoordinates");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
static __constant__ cudaGmxSimulation cSim;
void SetRandomSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetRandomSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
extern __shared__ float3 sRand[];
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_RANDOM_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_RANDOM_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_RANDOM_THREADS_PER_BLOCK, 1)
#endif
void kGenerateRandoms_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int increment = blockDim.x * gridDim.x;
// Read generator state
uint4 state = cSim.pRandomSeed[pos];
unsigned int carry = 0;
float4 random4;
float2 random2;
while (pos < cSim.totalRandoms)
{
// Generate 6 randoms in GRF
unsigned int pos1 = threadIdx.x;
for (int i = 0; i < 2; i++)
{
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
unsigned int m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x1 = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x1 = sqrtf(-2.0f * logf(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
sRand[pos1].x = x1 * cosf(2.0f * 3.14159265f * x2);
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x3 = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x3 = sqrtf(-2.0f * logf(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
sRand[pos1].y = x3 * cosf(2.0f * 3.14159265f * x4);
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x5 = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x5 = sqrtf(-2.0f * logf(x5));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
sRand[pos1].z = x5 * cosf(2.0f * 3.14159265f * x6);
pos1 += blockDim.x;
}
// Output final randoms
random4.x = sRand[threadIdx.x].x;
random4.y = sRand[threadIdx.x].y;
random4.z = sRand[threadIdx.x].z;
random4.w = sRand[threadIdx.x + blockDim.x].x;
cSim.pRandom4[pos] = random4;
random2.x = sRand[threadIdx.x + blockDim.x].y;
random2.y = sRand[threadIdx.x + blockDim.x].z;
cSim.pRandom2[pos] = random2;
pos += increment;
}
// Write generator state
pos = blockIdx.x * blockDim.x + threadIdx.x;
cSim.pRandomSeed[pos] = state;
}
void kGenerateRandoms(gpuContext gpu)
{
kGenerateRandoms_kernel<<<gpu->sim.blocks, gpu->sim.random_threads_per_block, gpu->sim.random_threads_per_block * 2 * sizeof(float3)>>>();
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
static __constant__ cudaGmxSimulation cSim;
void SetSettleSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetSettleSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
/**
* This is based on the setlep FORTRAN routine by Shuichi Miyamoto. See
* S. Miyamoto and P. Kollman, J. Comp. Chem., vol 13, no. 8, pp. 952-962 (1992).
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplySettle_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.settleConstraints)
{
// Load data.
int4 atomID = cSim.pSettleID[pos];
float2 params = cSim.pSettleParameter[pos];
float4 apos0 = cSim.pOldPosq[atomID.x];
float4 xp0 = cSim.pPosqP[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xp1 = cSim.pPosqP[atomID.y];
float4 apos2 = cSim.pOldPosq[atomID.z];
float4 xp2 = cSim.pPosqP[atomID.z];
float m0 = 1.0f/cSim.pVelm4[atomID.x].w;
float m1 = 1.0f/cSim.pVelm4[atomID.y].w;
float m2 = 1.0f/cSim.pVelm4[atomID.z].w;
// Translate the molecule to the origin to improve numerical precision.
float3 center = make_float3(apos0.x, apos0.y, apos0.z);
apos0.x -= center.x;
apos0.y -= center.y;
apos0.z -= center.z;
apos1.x -= center.x;
apos1.y -= center.y;
apos1.z -= center.z;
apos2.x -= center.x;
apos2.y -= center.y;
apos2.z -= center.z;
// Apply the SETTLE algorithm.
float xb0 = apos1.x-apos0.x;
float yb0 = apos1.y-apos0.y;
float zb0 = apos1.z-apos0.z;
float xc0 = apos2.x-apos0.x;
float yc0 = apos2.y-apos0.y;
float zc0 = apos2.z-apos0.z;
float totalMass = m0+m1+m2;
float xcom = ((apos0.x+xp0.x)*m0 + (apos1.x+xp1.x)*m1 + (apos2.x+xp2.x)*m2) / totalMass;
float ycom = ((apos0.y+xp0.y)*m0 + (apos1.y+xp1.y)*m1 + (apos2.y+xp2.y)*m2) / totalMass;
float zcom = ((apos0.z+xp0.z)*m0 + (apos1.z+xp1.z)*m1 + (apos2.z+xp2.z)*m2) / totalMass;
float xa1 = apos0.x + xp0.x - xcom;
float ya1 = apos0.y + xp0.y - ycom;
float za1 = apos0.z + xp0.z - zcom;
float xb1 = apos1.x + xp1.x - xcom;
float yb1 = apos1.y + xp1.y - ycom;
float zb1 = apos1.z + xp1.z - zcom;
float xc1 = apos2.x + xp2.x - xcom;
float yc1 = apos2.y + xp2.y - ycom;
float zc1 = apos2.z + xp2.z - zcom;
float xaksZd = yb0*zc0 - zb0*yc0;
float yaksZd = zb0*xc0 - xb0*zc0;
float zaksZd = xb0*yc0 - yb0*xc0;
float xaksXd = ya1*zaksZd - za1*yaksZd;
float yaksXd = za1*xaksZd - xa1*zaksZd;
float zaksXd = xa1*yaksZd - ya1*xaksZd;
float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
float axlng = sqrtf(xaksXd * xaksXd + yaksXd * yaksXd + zaksXd * zaksXd);
float aylng = sqrtf(xaksYd * xaksYd + yaksYd * yaksYd + zaksYd * zaksYd);
float azlng = sqrtf(xaksZd * xaksZd + yaksZd * yaksZd + zaksZd * zaksZd);
float trns11 = xaksXd / axlng;
float trns21 = yaksXd / axlng;
float trns31 = zaksXd / axlng;
float trns12 = xaksYd / aylng;
float trns22 = yaksYd / aylng;
float trns32 = zaksYd / aylng;
float trns13 = xaksZd / azlng;
float trns23 = yaksZd / azlng;
float trns33 = zaksZd / azlng;
float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
// --- Step2 A2' ---
float rc = 0.5f*params.y;
float rb = sqrtf(params.x*params.x-rc*rc);
float ra = rb*(m1+m2)/totalMass;
rb -= ra;
float sinphi = za1d / ra;
float cosphi = sqrtf(1.0f - sinphi*sinphi);
float sinpsi = ( zb1d - zc1d ) / (2*rc*cosphi);
float cospsi = sqrtf(1.0f - sinpsi*sinpsi);
float ya2d = ra * cosphi;
float xb2d = - rc * cospsi;
float yb2d = - rb * cosphi - rc *sinpsi * sinphi;
float yc2d = - rb * cosphi + rc *sinpsi * sinphi;
float xb2d2 = xb2d * xb2d;
float hh2 = 4.0f * xb2d2 + (yb2d-yc2d) * (yb2d-yc2d) + (zb1d-zc1d) * (zb1d-zc1d);
float deltx = 2.0f * xb2d + sqrtf( 4.0f * xb2d2 - hh2 + params.y*params.y );
xb2d -= deltx * 0.5f;
// --- Step3 al,be,ga ---
float alpa = ( xb2d * (xb0d-xc0d) + yb0d * yb2d + yc0d * yc2d );
float beta = ( xb2d * (yc0d-yb0d) + xb0d * yb2d + xc0d * yc2d );
float gama = xb0d * yb1d - xb1d * yb0d + xc0d * yc1d - xc1d * yc0d;
float al2be2 = alpa * alpa + beta * beta;
float sinthe = ( alpa*gama - beta * sqrtf( al2be2 - gama * gama ) ) / al2be2;
// --- Step4 A3' ---
float costhe = sqrtf(1.0f - sinthe * sinthe );
float xa3d = - ya2d * sinthe;
float ya3d = ya2d * costhe;
float za3d = za1d;
float xb3d = xb2d * costhe - yb2d * sinthe;
float yb3d = xb2d * sinthe + yb2d * costhe;
float zb3d = zb1d;
float xc3d = - xb2d * costhe - yc2d * sinthe;
float yc3d = - xb2d * sinthe + yc2d * costhe;
float zc3d = zc1d;
// --- Step5 A3 ---
float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
xp0.x = xcom + xa3 - apos0.x;
xp0.y = ycom + ya3 - apos0.y;
xp0.z = zcom + za3 - apos0.z;
xp1.x = xcom + xb3 - apos1.x;
xp1.y = ycom + yb3 - apos1.y;
xp1.z = zcom + zb3 - apos1.z;
xp2.x = xcom + xc3 - apos2.x;
xp2.y = ycom + yc3 - apos2.y;
xp2.z = zcom + zc3 - apos2.z;
cSim.pPosqP[atomID.x] = xp0;
cSim.pPosqP[atomID.y] = xp1;
cSim.pPosqP[atomID.z] = xp2;
pos += blockDim.x * gridDim.x;
}
}
void kApplySettle(gpuContext gpu)
{
// printf("kApplySettle\n");
if (gpu->sim.settleConstraints > 0)
{
kApplySettle_kernel<<<gpu->sim.blocks, gpu->sim.settle_threads_per_block>>>();
LAUNCHERROR("kApplySettle");
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
using namespace std;
#include "gputypes.h"
struct Atom
{
float3 rij1;
float3 rij2;
float3 rij3;
float M;
float d2;
float InvMassI;
float rij1sq;
float rij2sq;
float rij3sq;
};
static __constant__ cudaGmxSimulation cSim;
void SetShakeHSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetShakeHSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplyShake_kernel()
{
extern __shared__ Atom sA[];
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
int4 atomID = cSim.pShakeID[pos];
float4 params = cSim.pShakeParameter[pos];
float4 apos = cSim.pOldPosq[atomID.x];
float4 xpi = cSim.pPosqP[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xpj1 = cSim.pPosqP[atomID.y];
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
psA->InvMassI = params.x;
psA->M = params.y;
psA->d2 = params.z;
float invMassJ = params.w;
if (atomID.z != -1)
{
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosqP[atomID.z];
}
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atomID.w != -1)
{
apos3 = cSim.pOldPosq[atomID.w];
xpj3 = cSim.pPosqP[atomID.w];
}
float3 xi, xj1, xj2, xj3;
xi.x = apos.x;
xi.y = apos.y;
xi.z = apos.z;
xj1.x = apos1.x;
xj1.y = apos1.y;
xj1.z = apos1.z;
xj2.x = apos2.x;
xj2.y = apos2.y;
xj2.z = apos2.z;
xj3.x = apos3.x;
xj3.y = apos3.y;
xj3.z = apos3.z;
psA->rij1.x = xi.x - xj1.x;
psA->rij1.y = xi.y - xj1.y;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x;
psA->rij2.y = xi.y - xj2.y;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x;
psA->rij3.y = xi.y - xj3.y;
psA->rij3.z = xi.z - xj3.z;
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
float ld1 = psA->d2 - psA->rij1sq;
float ld2 = psA->d2 - psA->rij2sq;
float ld3 = psA->d2 - psA->rij3sq;
bool converged = false;
int iteration = 0;
while (iteration < 15 && !converged)
{
converged = true;
float3 rpij;
rpij.x = xpi.x - xpj1.x;
rpij.y = xpi.y - xpj1.y;
rpij.z = xpi.z - xpj1.z;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor;
dr.z = psA->rij1.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj1.x -= dr.x * invMassJ;
xpj1.y -= dr.y * invMassJ;
xpj1.z -= dr.z * invMassJ;
converged = false;
}
if (atomID.z != -1)
{
rpij.x = xpi.x - xpj2.x;
rpij.y = xpi.y - xpj2.y;
rpij.z = xpi.z - xpj2.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
cSim.pPosqP[atomID.x] = xpi;
cSim.pPosqP[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosqP[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosqP[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
}
void kApplyShake(gpuContext gpu)
{
// printf("kApplyShake\n");
if (gpu->sim.ShakeConstraints > 0)
{
kApplyShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block, sizeof(Atom)*gpu->sim.shake_threads_per_block>>>();
LAUNCHERROR("kApplyShake");
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
using namespace std;
#include "gputypes.h"
static __constant__ cudaGmxSimulation cSim;
void SetVerletUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetVerletUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernels with and with center of mass motion removal.
#include "kVerletUpdate.h"
#define REMOVE_CM
#include "kVerletUpdate.h"
void kVerletUpdatePart1(gpuContext gpu)
{
// printf("kVerletUpdatePart1\n");
if (gpu->bRemoveCM)
{
kVerletUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
LAUNCHERROR("kVerletUpdatePart1CM");
gpu->bRemoveCM = false;
}
else
{
kVerletUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kVerletUpdatePart1");
}
}
void kVerletUpdatePart2(gpuContext gpu)
{
// printf("kVerletUpdatePart2\n");
if (gpu->bCalculateCM)
{
kVerletUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
LAUNCHERROR("kVerletUpdatePart2CM");
gpu->bCalculateCM = false;
gpu->bRemoveCM = true;
}
else
{
kVerletUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kVerletUpdatePart2");
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kSelectVerletStepSize_kernel(float maxStepSize)
{
// Calculate the error.
extern __shared__ float error[];
error[threadIdx.x] = 0.0f;
unsigned int pos = threadIdx.x;
while (pos < cSim.atoms)
{
float4 force = cSim.pForce4[pos];
float invMass = cSim.pVelm4[pos].w;
error[threadIdx.x] += (force.x*force.x + force.y*force.y + force.z*force.z)*invMass;
pos += blockDim.x * gridDim.x;
}
__syncthreads();
// Sum the errors from all threads.
for (int offset = 1; offset < blockDim.x; offset *= 2)
{
if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
error[threadIdx.x] += error[threadIdx.x+offset];
__syncthreads();
}
if (threadIdx.x == 0)
{
float totalError = sqrt(error[0]/(cSim.atoms*3));
float newStepSize = sqrt(cSim.errorTol/totalError);
float oldStepSize = cSim.pStepSize[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
if (newStepSize > maxStepSize)
newStepSize = maxStepSize;
cSim.pStepSize[0].y = newStepSize;
}
}
void kSelectVerletStepSize(gpuContext gpu, float maxTimeStep)
{
// printf("kSelectVerletStepSize\n");
kSelectVerletStepSize_kernel<<<1, gpu->sim.update_threads_per_block, sizeof(float)*gpu->sim.update_threads_per_block>>>(maxTimeStep);
LAUNCHERROR("kSelectVerletStepSize");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for Verlet integration. It is included
* several times in kVerletUpdate.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
void kVerletUpdatePart1CM_kernel()
#else
void kVerletUpdatePart1_kernel()
#endif
{
// Load the step size to take.
__shared__ volatile float dtPos;
__shared__ volatile float dtVel;
if (threadIdx.x == 0)
{
float2 stepSize = cSim.pStepSize[0];
dtPos = stepSize.y;
dtVel = 0.5f*(stepSize.x+stepSize.y);
}
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = { 0.0f, 0.0f, 0.0f};
float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f };
// Read CM outputs from previous step
unsigned int cpos = threadIdx.x;
while (cpos < gridDim.x)
{
CM1 = cSim.pLinearMomentum[cpos];
CM.x += CM1.x;
CM.y += CM1.y;
CM.z += CM1.z;
cpos += blockDim.x;
}
sCM[threadIdx.x].x = CM.x;
sCM[threadIdx.x].y = CM.y;
sCM[threadIdx.x].z = CM.z;
__syncthreads();
// Reduce CM
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
#else
__syncthreads();
#endif
while (pos < cSim.atoms)
{
float4 apos = cSim.pPosq[pos];
float4 velocity = cSim.pVelm4[pos];
float4 force = cSim.pForce4[pos];
float dtOverMass = dtVel*velocity.w;
cSim.pOldPosq[pos] = apos;
velocity.x += dtOverMass*force.x;
velocity.y += dtOverMass*force.y;
velocity.z += dtOverMass*force.z;
#ifdef REMOVE_CM
velocity.x -= sCM[0].x;
velocity.y -= sCM[0].y;
velocity.z -= sCM[0].z;
#endif
apos.x = velocity.x*dtPos;
apos.y = velocity.y*dtPos;
apos.z = velocity.z*dtPos;
cSim.pPosqP[pos] = apos;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM
void kVerletUpdatePart2CM_kernel()
#else
void kVerletUpdatePart2_kernel()
#endif
{
// Load the step size to take.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ double oneOverDeltaT;
if (threadIdx.x == 0)
{
float dt = cSim.pStepSize[0].y;
oneOverDeltaT = 1.0/dt;
if (pos == 0)
cSim.pStepSize[0].x = dt;
}
__syncthreads();
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = {0.0f, 0.0f, 0.0f};
#endif
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 apos = cSim.pPosq[pos];
float4 xPrime = cSim.pPosqP[pos];
velocity.x = (float) (oneOverDeltaT*(double)xPrime.x);
velocity.y = (float) (oneOverDeltaT*(double)xPrime.y);
velocity.z = (float) (oneOverDeltaT*(double)xPrime.z);
xPrime.x += apos.x;
xPrime.y += apos.y;
xPrime.z += apos.z;
#ifdef REMOVE_CM
float mass = 1.0f / velocity.w;
CM.x += mass * velocity.x;
CM.y += mass * velocity.y;
CM.z += mass * velocity.z;
#endif
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
#ifdef REMOVE_CM
// Scale CM
CM.x *= cSim.inverseTotalMass;
CM.y *= cSim.inverseTotalMass;
CM.z *= cSim.inverseTotalMass;
sCM[threadIdx.x] = CM;
__syncthreads();
// Reduce CM for CTA
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
if (threadIdx.x == 0)
{
float4 CM;
CM.x = sCM[0].x;
CM.y = sCM[0].y;
CM.z = sCM[0].z;
CM.w = 0.0f;
cSim.pLinearMomentum[blockIdx.x] = CM;
}
#endif
}
//________________________________________________________________________
// See the header file rng.h for a description of the contents of this
// file as well as references and credits.
#include "rng.h"
#include <cmath>
#include <iostream>
inline ulong32 ULONG32(slong x) { return (ulong32(x)); }
inline ulong32 ULONG32(ulong32 x) { return (ulong32(x)); }
inline ulong32 ULONG32(double x) { return (ulong32(x)); }
static const double PI = 3.1415926535897932;
//________________________________________________________________________
// Initialize the static component of RNG
ulong32 RNG::tm = 1234567;
ulong32 RNG::kn[128], RNG::ke[256];
double RNG::wn[128], RNG::fn[128], RNG::we[256], RNG::fe[256];
//________________________________________________________________________
// RNG::RNOR generates normal variates with rejection.
// nfix() generates variates after rejection in RNOR.
// Despite rejection, this method is faster than Box-Muller.
double RNG::nfix(slong h, ulong32 i)
{
const double r = 3.442620; // The starting of the right tail
double x, y;
for(;;) {
x = h * wn[i];
// If i == 0, handle the base strip
if (i == 0) {
do {
x = -log(rand_open01()) * 0.2904764; // .2904764 is 1/r
y = -log(rand_open01());
} while (y + y < x * x);
return ((h > 0) ? r + x : -r - x);
}
// If i > 0, handle the wedges of other strips
if (fn[i] + rand_open01() * (fn[i - 1] - fn[i]) < exp(-.5 * x * x))
return x;
// start all over
h = UL32toSL32(rand_int32());
i = h & 127;
if (ULONG32(std::abs(h)) < kn[i])
return (h * wn[i]);
}
} // RNG::nfix
// __________________________________________________________________________
// RNG::REXP generates exponential variates with rejection.
// efix() generates variates after rejection in REXP.
double RNG::efix(ulong32 j, ulong32 i)
{
for (;;) {
if (i == 0)
return (7.69711 - log(rand_open01()));
const double x = j * we[i];
if (fe[i] + rand_open01() * (fe[i - 1] - fe[i]) < exp(-x))
return x;
j = rand_int32();
i = (j & 255);
if (j < ke[i])
return (j * we[i]);
}
} // RNG::efix
// __________________________________________________________________________
// This procedure creates the tables used by RNOR and REXP
void RNG::zigset()
{
static bool inited = 0;
if (inited)
return;
inited = 1;
// Set up tables for RNOR
const double m1 = 2147483648.0; // 2^31
const double vn = 9.91256303526217e-3;
double tn = 3.442619855899;
double q = vn / exp(-.5 * tn * tn);
kn[0] = ULONG32((tn / q) * m1); kn[1] = 0;
wn[0] = q / m1; wn[127] = tn / m1;
fn[0]=1.; fn[127] = exp(-.5 * tn * tn);
for (uint i = 126; i > 0; i--) {
const double dn = sqrtf(-2 * log(vn / tn + exp(-.5 * tn * tn)));
kn[i + 1] = ULONG32((dn / tn) * m1);
fn[i] = exp(-.5 * dn * dn);
wn[i] = dn / m1;
tn = dn;
}
// Set up tables for REXP
const double m2 = 4294967296.0; // 2^32
const double ve = 3.949659822581572e-3;
double te = 7.697117470131487;
q = ve / exp(-te);
ke[0] = ULONG32((te / q) * m2); ke[1] = 0;
we[0] = q / m2; we[255] = te / m2;
fe[0] = 1.; fe[255] = exp(-te);
for (uint i = 254; i > 0; i--) {
const double de = -log(ve / te + exp(-te));
ke[i+1] = ULONG32((de / te) * m2);
fe[i] = exp(-de);
we[i] = de / m2;
te = de;
}
} // RNG::zigset
// __________________________________________________________________________
// Generate a gamma variate with parameters 'shape' and 'scale'
double RNG::gamma(double shape, double scale)
{
if (shape < 1.)
return gamma(shape + 1., scale) * pow(rand_open01(), 1.0 / shape);
const double d = shape - 1. / 3.;
const double c = 1. / sqrtf(9. * d);
double x, v;
for (;;) {
do {
x = RNOR();
v = 1.0 + c * x;
} while (v <= 0.0);
v = v * v * v;
const double u = rand_open01();
const double x2 = x * x;
if (u < 1.0 - 0.0331 * x2 * x2)
return (d * v / scale);
if (log(u) < 0.5 * x2 + d * (1.0 - v + log(v)))
return (d * v / scale);
}
} // RNG::gamma
// __________________________________________________________________________
// Generate a Poisson variate
// Code essentially copied from R source that is essentially
// ACM Algorithm 599 KPOISS converted to C.
int RNG::poisson(double mu)
{
const double a0=-0.5, a1= 0.3333333, a2=-0.2500068, a3= 0.2000118,
a4=-0.1661269, a5= 0.1421878, a6=-0.1384794, a7= 0.1250060;
const double one_7 = 1.0 / 7.0, one_12 = 1.0 / 12.0, one_24 = 1.0 / 24.0;
const double fact[10] =
{ 1., 1., 2., 6., 24., 120., 720., 5040., 40320., 362880. };
static int l, m;
static double b1, b2, c, c0, c1, c2, c3;
static double pp[36], p0, p, q, s, d, omega;
static double big_l;/* integer "w/o overflow" */
static double muprev = 0., muprev2 = 0.;/*, muold = 0.*/
double del, difmuk= 0., E= 0., fk= 0., fx, fy, g, px, py, t, u= 0., v, x;
int pois = -1;
int k, kflag, big_mu, new_big_mu = 0;
if (mu <= 0.)
return 0;
big_mu = mu >= 10.;
if(big_mu)
new_big_mu = 0;
if (!(big_mu && mu == muprev)) {
if (big_mu) {
new_big_mu = 1;
muprev = mu;
s = sqrtf(mu);
d = 6. * mu * mu;
big_l = floor(mu - 1.1484);
}
else {
if (mu != muprev) {
muprev = mu;
m = (int) ((1.0 < mu) ? mu : 1.0);
l = 0; /* pp[] is already ok up to pp[l] */
q = p0 = p = exp(-mu);
}
for (;;) {
u = rand_open01();
if (u <= p0)
return 0;
if (l != 0) {
for (k = (u <= 0.458) ? 1 : ((l < m) ? l : m); k <= l; k++)
if (u <= pp[k])
return k;
if (l == 35) /* u > pp[35] */
continue;
}
l++;
for (k = l; k <= 35; k++) {
p *= mu / k;
q += p;
pp[k] = q;
if (u <= q) {
l = k;
return k;
}
}
l = 35;
}
}
}
g = mu + s * RNOR();
if (g >= 0.) {
pois = int(g);
if (pois >= big_l)
return pois;
fk = pois;
difmuk = mu - fk;
u = rand_open01();
if (d * u >= difmuk * difmuk * difmuk)
return pois;
}
if (new_big_mu || mu != muprev2) {
muprev2 = mu;
omega = 1.0 / (sqrtf(2.0 * PI) * s);
b1 = one_24 / mu;
b2 = 0.3 * b1 * b1;
c3 = one_7 * b1 * b2;
c2 = b2 - 15. * c3;
c1 = b1 - 6. * b2 + 45. * c3;
c0 = 1. - b1 + 3. * b2 - 15. * c3;
c = 0.1069 / mu;
}
if (g >= 0.) {
kflag = 0;
goto Step_F;
}
for (;;) {
E = REXP();
u = 2 * rand_open01() - 1.;
t = 1.8 + ((u > 0) ? std::abs(E) : -std::abs(E));
if (t > -0.6744) {
pois = int(mu + s * t);
fk = pois;
difmuk = mu - fk;
kflag = 1;
Step_F:
if (pois < 10) {
px = -mu;
py = pow(mu, pois) / fact[pois];
}
else {
del = one_12 / fk;
del = del * (1. - 4.8 * del * del);
v = difmuk / fk;
if (std::abs(v) <= 0.25)
px = fk * v * v * (((((((a7 * v + a6) * v + a5) * v + a4) *
v + a3) * v + a2) * v + a1) * v + a0)
- del;
else
px = fk * log(1. + v) - difmuk - del;
py = 1.0 / (sqrtf(2.0 * PI) * sqrtf(fk));
}
x = (0.5 - difmuk) / s;
x *= x;/* x^2 */
fx = -0.5 * x;
fy = omega * (((c3 * x + c2) * x + c1) * x + c0);
if (kflag > 0) {
if (c * std::abs(u) <= py * exp(px + E) - fy * exp(fx + E))
break;
} else
if (fy - u * fy <= py * exp(px - fx))
break;
}
}
return pois;
} // RNG::poisson
// __________________________________________________________________________
// Generate a binomial variate
// Code essentially copied from R source that is essentially
// ACM Algorithm 678 BTPEC converted to C.
int RNG::binomial(double pp, int n)
{
static double c, fm, npq, p1, p2, p3, p4, qn, xl, xll, xlr, xm, xr;
static double psave = -1.0;
static int nsave = -1, m = 0;
double f, x;
if (n <= 0 || pp <= 0.) return 0;
if (pp >= 1.) return n;
const double p = (pp < 1. - pp) ? pp : 1. - pp;
const double q = 1. - p;
const double np = n * p;
const double r = p / q;
const double g = r * (n + 1);
if (pp != psave || n != nsave) {
psave = pp;
nsave = n;
if (np < 30.0) {
qn = pow(q, (double) n);
goto L_np_small;
} else {
double ffm = np + p;
m = (int) ffm;
fm = m;
npq = np * q;
p1 = (int)(2.195 * sqrtf(npq) - 4.6 * q) + 0.5;
xm = fm + 0.5;
xl = xm - p1;
xr = xm + p1;
c = 0.134 + 20.5 / (15.3 + fm);
double al = (ffm - xl) / (ffm - xl * p);
xll = al * (1.0 + 0.5 * al);
al = (xr - ffm) / (xr * q);
xlr = al * (1.0 + 0.5 * al);
p2 = p1 * (1.0 + c + c);
p3 = p2 + c / xll;
p4 = p3 + c / xlr;
}
} else if (n == nsave) {
if (np < 30.0)
goto L_np_small;
}
int i, ix;
for (;;) {
double u = rand_open01() * p4;
double v = rand_open01();
if (u <= p1) {
ix = (int) (xm - p1 * v + u);
goto finish;
}
if (u <= p2) {
x = xl + (u - p1) / c;
v = v * c + 1.0 - std::abs(xm - x) / p1;
if (v > 1.0 || v <= 0.)
continue;
ix = (int) x;
} else {
if (u > p3) {
ix = (int) (xr - log(v) / xlr);
if (ix > n)
continue;
v *= (u - p3) * xlr;
} else {/* left tail */
ix = (int) (xl + log(v) / xll);
if (ix < 0)
continue;
v *= (u - p2) * xll;
}
}
int k = std::abs(ix - m);
if (k <= 20 || k >= npq / 2 - 1) {
f = 1.0;
if (m < ix) {
for (i = m + 1; i <= ix; i++)
f *= (g / i - r);
} else if (m != ix) {
for (i = ix + 1; i <= m; i++)
f /= (g / i - r);
}
if (v <= f)
goto finish;
} else {
double amaxp, ynorm, alv;
amaxp = (k / npq) * ((k * (k / 3. + 0.625) + 0.1666666666666) /
npq + 0.5);
ynorm = -k * k / (2.0 * npq);
alv = log(v);
if (alv < ynorm - amaxp)
goto finish;
if (alv <= ynorm + amaxp) {
double x1, f1, z, w, z2, x2, f2, w2;
x1 = ix + 1;
f1 = fm + 1.0;
z = n + 1 - fm;
w = n - ix + 1.0;
z2 = z * z;
x2 = x1 * x1;
f2 = f1 * f1;
w2 = w * w;
if (alv <= xm * log(f1 / x1) + (n - m + 0.5) * log(z / w) +
(ix - m) * log(w * p / (x1 * q)) + (13860.0 - (462.0 -
(132.0 - (99.0 - 140.0 / f2) / f2) / f2) / f2) / f1 / 166320.0 +
(13860.0 - (462.0 - (132.0 - (99.0 - 140.0 / z2) / z2) / z2) / z2) /
z / 166320.0 + (13860.0 - (462.0 - (132.0 - (99.0 - 140.0 / x2) /
x2) / x2) / x2) / x1 / 166320.0 + (13860.0 - (462.0 - (132.0 -
(99.0 - 140.0 / w2) / w2) / w2) / w2) / w / 166320.)
goto finish;
}
}
}
L_np_small:
for (;;) {
ix = 0;
f = qn;
double u = rand_open01();
for (;;) {
if (u < f)
goto finish;
if (ix > 110)
break;
u -= f;
ix++;
f *= (g / ix - r);
}
}
finish:
if (psave > 0.5)
ix = n - ix;
return ix;
} // RNG::binomial
// __________________________________________________________________________
// Generate a sample of size 'n' from a multinomial distribution with
// probabilities given in 'probs'. Inspired by R source code.
void RNG::multinom(uint n, const vector<double>& probs, vector<uint>& samp)
{
samp.resize(probs.size());
RNG::multinom(n, &probs[0], (uint) probs.size(), &samp[0]);
}
void RNG::multinom(uint size, const double* probs, uint num_probs, uint* samp)
{
if (num_probs == 0) return;
for (uint i = 0; i < num_probs; i++) samp[i] = 0;
if (size == 0) return;
vector<double> fixed_probs(num_probs);
double total_prob = 0.;
for (uint i = 0; i < num_probs; i++) {
const double pp = probs[i];
//if (std::isfinite(pp) && pp >= 0)
if ((pp == pp) && pp >= 0)
total_prob += (fixed_probs[i] = pp);
}
if (total_prob == 0.) return;
for (uint i = 0; i < num_probs-1; i++) {
if (fixed_probs[i] > 0.) {
samp[i] = binomial(fixed_probs[i] / total_prob, size);
size -= samp[i];
}
if (size == 0) return;
total_prob -= fixed_probs[i];
}
samp[num_probs - 1] = size;
} // RNG::multinomial
// __________________________________________________________________________
// rng.C
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment