/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#include
#include
#include
#include "gputypes.h"
using namespace std;
static __constant__ cudaGmxSimulation cSim;
void SetCCMASim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCCMASim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kComputeCCMAConstraintDirections()
{
// Calculate the direction of each constraint.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
{
int2 atoms = cSim.pCcmaAtoms[index];
float4 dir = cSim.pCcmaDistance[index];
float4 oldPos1 = cSim.pOldPosq[atoms.x];
float4 oldPos2 = cSim.pOldPosq[atoms.y];
dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z;
cSim.pCcmaDistance[index] = dir;
}
// Mark that no blocks have converged yet.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < gridDim.x; index += blockDim.x*gridDim.x)
cSim.pCcmaConverged[index] = 0;
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
{
if (cSim.pCcmaConverged[blockIdx.x])
return; // The constraint iteration has already converged.
extern __shared__ int convergedBuffer[];
float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
int threadConverged = 1;
// Calculate the constraint force for each constraint.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
{
int2 atoms = cSim.pCcmaAtoms[index];
float4 delta1 = atomPositions[atoms.x];
float4 delta2 = atomPositions[atoms.y];
float4 dir = cSim.pCcmaDistance[index];
float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
if (addOldPosition)
{
rp_ij.x += dir.x;
rp_ij.y += dir.y;
rp_ij.z += dir.z;
}
float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
float dist2 = dir.w*dir.w;
float diff = dist2 - rp2;
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
float reducedMass = cSim.pCcmaReducedMass[index];
cSim.pCcmaDelta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
// See whether it has converged.
threadConverged &= (rp2 >= lowerTol*dist2 && rp2 <= upperTol*dist2);
}
// Perform a parallel reduction to see if all constraints handled by this block have converged.
convergedBuffer[threadIdx.x] = threadConverged;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads();
}
if (threadIdx.x == 0)
cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kMultiplyByCCMAConstraintMatrix()
{
extern __shared__ int convergedBuffer[];
// First see whether all work groups have converged.
convergedBuffer[threadIdx.x] = true;
for (int index = threadIdx.x; index < gridDim.x; index += blockDim.x)
convergedBuffer[threadIdx.x] &= cSim.pCcmaConverged[index];
__syncthreads();
for (int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads();
}
if (threadIdx.x == 0)
cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
if (cSim.pCcmaConverged[0])
return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
{
float sum = 0.0f;
for (unsigned int i = 0; ; i++)
{
unsigned int element = index+i*cSim.ccmaConstraints;
unsigned int column = cSim.pConstraintMatrixColumn[element];
if (column >= cSim.ccmaConstraints)
break;
sum += cSim.pCcmaDelta1[column]*cSim.pConstraintMatrixValue[element];
}
cSim.pCcmaDelta2[index] = sum;
}
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
{
if (cSim.pCcmaConverged[blockIdx.x])
return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f);
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
{
float4 atomPos = atomPositions[index];
float invMass = cSim.pVelm4[index].w;
int num = cSim.pCcmaNumAtomConstraints[index];
for (int i = 0; i < num; i++)
{
int constraint = cSim.pCcmaAtomConstraints[index+i*cSim.atoms];
bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*cSim.pCcmaDelta2[constraint];
constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = cSim.pCcmaDistance[constraint];
atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z;
}
atomPositions[index] = atomPos;
}
}
void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
{
kComputeCCMAConstraintDirections<<sim.blocks, gpu->sim.ccma_threads_per_block>>>();
LAUNCHERROR("kComputeCCMAConstraintDirections");
for (int i = 0; i < 150; i++) {
kComputeCCMAConstraintForces<<sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(posq, addOldPosition);
gpu->psCcmaConverged->Download();
kMultiplyByCCMAConstraintMatrix<<sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>();
if ((*gpu->psCcmaConverged)[0])
break;
kUpdateCCMAAtomPositions<<sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, i);
}
}
void kApplyFirstCCMA(gpuContext gpu)
{
// printf("kApplyFirstCCMA\n");
if (gpu->sim.ccmaConstraints > 0)
kApplyCCMA(gpu, gpu->sim.pPosqP, true);
}
void kApplySecondCCMA(gpuContext gpu)
{
// printf("kApplySecondCCMA\n");
if (gpu->sim.ccmaConstraints > 0)
kApplyCCMA(gpu, gpu->sim.pPosq, false);
}