/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#include
#include
#include
#include
#include
#include
using namespace std;
#include "gputypes.h"
struct Atom
{
float3 rij1;
float3 rij2;
float3 rij3;
float M;
float d2;
float InvMassI;
float rij1sq;
float rij2sq;
float rij3sq;
};
static __constant__ cudaGmxSimulation cSim;
void SetShakeHSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetShakeHSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplyFirstShake_kernel()
{
extern __shared__ Atom sA[];
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
int4 atomID = cSim.pShakeID[pos];
float4 params = cSim.pShakeParameter[pos];
float4 apos = cSim.pOldPosq[atomID.x];
float4 xpi = cSim.pPosqP[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xpj1 = cSim.pPosqP[atomID.y];
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
psA->InvMassI = params.x;
psA->M = params.y;
psA->d2 = params.z;
float invMassJ = params.w;
if (atomID.z != -1)
{
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosqP[atomID.z];
}
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atomID.w != -1)
{
apos3 = cSim.pOldPosq[atomID.w];
xpj3 = cSim.pPosqP[atomID.w];
}
float3 xi, xj1, xj2, xj3;
xi.x = apos.x;
xi.y = apos.y;
xi.z = apos.z;
xj1.x = apos1.x;
xj1.y = apos1.y;
xj1.z = apos1.z;
xj2.x = apos2.x;
xj2.y = apos2.y;
xj2.z = apos2.z;
xj3.x = apos3.x;
xj3.y = apos3.y;
xj3.z = apos3.z;
psA->rij1.x = xi.x - xj1.x;
psA->rij1.y = xi.y - xj1.y;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x;
psA->rij2.y = xi.y - xj2.y;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x;
psA->rij3.y = xi.y - xj3.y;
psA->rij3.z = xi.z - xj3.z;
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
float ld1 = psA->d2 - psA->rij1sq;
float ld2 = psA->d2 - psA->rij2sq;
float ld3 = psA->d2 - psA->rij3sq;
bool converged = false;
int iteration = 0;
while (iteration < 15 && !converged)
{
converged = true;
float3 rpij;
rpij.x = xpi.x - xpj1.x;
rpij.y = xpi.y - xpj1.y;
rpij.z = xpi.z - xpj1.z;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor;
dr.z = psA->rij1.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj1.x -= dr.x * invMassJ;
xpj1.y -= dr.y * invMassJ;
xpj1.z -= dr.z * invMassJ;
converged = false;
}
if (atomID.z != -1)
{
rpij.x = xpi.x - xpj2.x;
rpij.y = xpi.y - xpj2.y;
rpij.z = xpi.z - xpj2.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
cSim.pPosqP[atomID.x] = xpi;
cSim.pPosqP[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosqP[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosqP[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
}
void kApplyFirstShake(gpuContext gpu)
{
// printf("kApplyFirstShake\n");
if (gpu->sim.ShakeConstraints > 0)
{
kApplyFirstShake_kernel<<sim.blocks, gpu->sim.shake_threads_per_block, sizeof(Atom)*gpu->sim.shake_threads_per_block>>>();
LAUNCHERROR("kApplyFirstShake");
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplySecondShake_kernel()
{
extern __shared__ Atom sA[];
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
int4 atomID = cSim.pShakeID[pos];
float4 params = cSim.pShakeParameter[pos];
float4 apos = cSim.pOldPosq[atomID.x];
float4 xpi = cSim.pPosq[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xpj1 = cSim.pPosq[atomID.y];
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
psA->InvMassI = params.x;
psA->M = params.y;
psA->d2 = params.z;
float invMassJ = params.w;
if (atomID.z != -1)
{
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosq[atomID.z];
}
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atomID.w != -1)
{
apos3 = cSim.pOldPosq[atomID.w];
xpj3 = cSim.pPosq[atomID.w];
}
float3 xi, xj1, xj2, xj3;
xi.x = apos.x;
xi.y = apos.y;
xi.z = apos.z;
xj1.x = apos1.x;
xj1.y = apos1.y;
xj1.z = apos1.z;
xj2.x = apos2.x;
xj2.y = apos2.y;
xj2.z = apos2.z;
xj3.x = apos3.x;
xj3.y = apos3.y;
xj3.z = apos3.z;
psA->rij1.x = xi.x - xj1.x;
psA->rij1.y = xi.y - xj1.y;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x;
psA->rij2.y = xi.y - xj2.y;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x;
psA->rij3.y = xi.y - xj3.y;
psA->rij3.z = xi.z - xj3.z;
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
float ld1 = psA->d2 - psA->rij1sq;
float ld2 = psA->d2 - psA->rij2sq;
float ld3 = psA->d2 - psA->rij3sq;
bool converged = false;
int iteration = 0;
while (iteration < 15 && !converged)
{
converged = true;
float3 rpij;
rpij.x = xpi.x - xpj1.x;
rpij.y = xpi.y - xpj1.y;
rpij.z = xpi.z - xpj1.z;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor;
dr.z = psA->rij1.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj1.x -= dr.x * invMassJ;
xpj1.y -= dr.y * invMassJ;
xpj1.z -= dr.z * invMassJ;
converged = false;
}
if (atomID.z != -1)
{
rpij.x = xpi.x - xpj2.x;
rpij.y = xpi.y - xpj2.y;
rpij.z = xpi.z - xpj2.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
xpi.x += xi.x;
xpi.y += xi.y;
xpi.z += xi.z;
xpj1.x += xj1.x;
xpj1.y += xj1.y;
xpj1.z += xj1.z;
xpj2.x += xj2.x;
xpj2.y += xj2.y;
xpj2.z += xj2.z;
xpj3.x += xj3.x;
xpj3.y += xj3.y;
xpj3.z += xj3.z;
cSim.pPosq[atomID.x] = xpi;
cSim.pPosq[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosq[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosq[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
// Update any atoms that SHAKE is not applied to.
pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.NonShakeConstraints)
{
int atomID = cSim.pNonShakeID[pos];
float4 apos = cSim.pOldPosq[atomID];
float4 xpi = cSim.pPosq[atomID];
xpi.x += apos.x;
xpi.y += apos.y;
xpi.z += apos.z;
cSim.pPosq[atomID] = xpi;
pos += blockDim.x * gridDim.x;
}
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
kApplyNoShake_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.NonShakeConstraints)
{
int atomID = cSim.pNonShakeID[pos];
float4 apos = cSim.pOldPosq[atomID];
float4 xpi = cSim.pPosq[atomID];
xpi.x += apos.x;
xpi.y += apos.y;
xpi.z += apos.z;
cSim.pPosq[atomID] = xpi;
pos += blockDim.x * gridDim.x;
}
}
void kApplySecondShake(gpuContext gpu)
{
// printf("kApplySecondShake\n");
if (gpu->sim.ShakeConstraints > 0)
{
kApplySecondShake_kernel<<sim.blocks, gpu->sim.shake_threads_per_block, sizeof(Atom)*gpu->sim.shake_threads_per_block>>>();
LAUNCHERROR("kApplySecondShake");
}
else if (gpu->sim.NonShakeConstraints > 0)
{
// handle non-Shake atoms
kApplyNoShake_kernel<<sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
LAUNCHERROR("kApplyNoShake");
}
}