/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Scott Le Grand, Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include #include #include #include "jama_svd.h" #include "gputypes.h" using namespace std; using TNT::Array2D; using JAMA::SVD; static __constant__ cudaGmxSimulation cSim; void SetCShakeSim(gpuContext gpu) { cudaError_t status; status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed"); } void GetCShakeSim(gpuContext gpu) { cudaError_t status; status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); } /** * Synchronize all threads across all blocks. */ __device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount) { __syncthreads(); if (threadIdx.x == 0) syncCounter[blockIdx.x] = newCount; if (threadIdx.x < gridDim.x) { volatile short* counter = &syncCounter[threadIdx.x]; do { } while (*counter != newCount); } __syncthreads(); } __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) { extern __shared__ float temp[]; // Initialize counters used for monitoring convergence and doing global thread synchronization. __shared__ unsigned int requiredIterations; if (threadIdx.x == 0) { requiredIterations = 0; cSim.pSyncCounter[gridDim.x+blockIdx.x] = -1; cSim.pSyncCounter[2*gridDim.x+blockIdx.x] = -1; cSim.pRequiredIterations[0] = 0; } // Calculate the direction of each constraint. unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; while (pos < cSim.lincsConstraints) { int2 atoms = cSim.pLincsAtoms[pos]; float4 dir = cSim.pLincsDistance[pos]; float4 oldPos1 = cSim.pOldPosq[atoms.x]; float4 oldPos2 = cSim.pOldPosq[atoms.y]; dir.x = oldPos1.x-oldPos2.x; dir.y = oldPos1.y-oldPos2.y; dir.z = oldPos1.z-oldPos2.z; cSim.pLincsDistance[pos] = dir; pos += blockDim.x*gridDim.x; } __syncthreads(); // Iteratively update the atom positions. unsigned int maxIterations = 150; float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance; float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance; for (unsigned int iteration = 0; iteration < maxIterations && iteration == requiredIterations; iteration++) { // Calculate the constraint force for each constraint. pos = threadIdx.x + blockIdx.x * blockDim.x; while (pos < cSim.lincsConstraints) { int2 atoms = cSim.pLincsAtoms[pos]; float4 delta1 = atomPositions[atoms.x]; float4 delta2 = atomPositions[atoms.y]; float4 dir = cSim.pLincsDistance[pos]; float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z); if (addOldPosition) { rp_ij.x += dir.x; rp_ij.y += dir.y; rp_ij.z += dir.z; } float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z; float dist2 = dir.w*dir.w; float diff = dist2 - rp2; float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z; float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z; float reducedMass = cSim.pShakeReducedMass[pos]; cSim.pLincsSolution[pos] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f); if (requiredIterations == iteration && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) requiredIterations = iteration+1; pos += blockDim.x * gridDim.x; } kSyncAllThreads_kernel(cSim.pSyncCounter, iteration); if (threadIdx.x == 0 && requiredIterations > iteration) cSim.pRequiredIterations[0] = requiredIterations; // Multiply by the inverse constraint matrix for each rigid cluster. if (cSim.rigidClusters > 0) { pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int block = pos/cSim.clusterShakeBlockSize; unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize; while (block < cSim.rigidClusters) { unsigned int firstIndex = cSim.pRigidClusterConstraintIndex[block]; unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstIndex; if (indexInBlock < blockSize) { // Load the constraint forces and matrix. unsigned int constraint = cSim.pRigidClusterConstraints[firstIndex+indexInBlock]; temp[threadIdx.x] = cSim.pLincsSolution[constraint]; unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block]; // Multiply by the matrix. float sum = 0.0f; for (unsigned int i = 0; i < blockSize; i++) sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock]; cSim.pLincsSolution[constraint] = sum; } block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize; } kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration); } // Update the position of each atom. pos = threadIdx.x + blockIdx.x * blockDim.x; while (pos < cSim.atoms) { float4 atomPos = atomPositions[pos]; float invMass = cSim.pVelm4[pos].w; int num = cSim.pLincsNumAtomConstraints[pos]; for (int i = 0; i < num; i++) { int index = pos+i*cSim.atoms; int constraint = cSim.pLincsAtomConstraints[index]; float constraintForce = invMass*cSim.pLincsSolution[constraint]; constraintForce = (cSim.pLincsAtoms[constraint].x == pos ? constraintForce : -constraintForce); float4 dir = cSim.pLincsDistance[constraint]; atomPos.x += constraintForce*dir.x; atomPos.y += constraintForce*dir.y; atomPos.z += constraintForce*dir.z; } atomPositions[pos] = atomPos; pos += blockDim.x*gridDim.x; } kSyncAllThreads_kernel(&cSim.pSyncCounter[2*gridDim.x], iteration); requiredIterations = cSim.pRequiredIterations[0]; } // Reset the initial sync counter to be ready for the next call. if (threadIdx.x == 0) cSim.pSyncCounter[blockIdx.x] = -1; } static void initInverseMatrices(gpuContext gpu) { // Build the inverse constraint matrix for each cluster. gpu->psPosq4->Download(); gpu->psVelm4->Download(); unsigned int elementIndex = 0; for (unsigned int i = 0; i < gpu->sim.rigidClusters; i++) { // Compute the constraint coupling matrix for this cluster. unsigned int startIndex = (*gpu->psRigidClusterConstraintIndex)[i]; unsigned int endIndex = (*gpu->psRigidClusterConstraintIndex)[i+1]; unsigned int size = endIndex-startIndex; vector r(size); for (unsigned int j = 0; j < size; j++) { int2 atoms = (*gpu->psLincsAtoms)[(*gpu->psRigidClusterConstraints)[startIndex+j]]; float4 pos1 = (*gpu->psPosq4)[atoms.x]; float4 pos2 = (*gpu->psPosq4)[atoms.y]; r[j] = make_float3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z); float invLength = 1.0f/sqrt(r[j].x*r[j].x + r[j].y*r[j].y + r[j].z*r[j].z); r[j].x *= invLength; r[j].y *= invLength; r[j].z *= invLength; } Array2D matrix(size, size); for (unsigned int j = 0; j < size; j++) { int constraintj = (*gpu->psRigidClusterConstraints)[startIndex+j]; int2 atomsj = (*gpu->psLincsAtoms)[constraintj]; for (unsigned int k = 0; k < size; k++) { int constraintk = (*gpu->psRigidClusterConstraints)[startIndex+k]; int2 atomsk = (*gpu->psLincsAtoms)[constraintk]; float invMassj0 = (*gpu->psVelm4)[atomsj.x].w; float invMassj1 = (*gpu->psVelm4)[atomsj.y].w; double dot = r[j].x*r[k].x + r[j].y*r[k].y + r[j].z*r[k].z; if (atomsj.x == atomsk.x) dot *= invMassj0/(invMassj0+invMassj1); else if (atomsj.y == atomsk.y) dot *= invMassj1/(invMassj0+invMassj1); else if (atomsj.x == atomsk.y) dot *= -invMassj0/(invMassj0+invMassj1); else if (atomsj.y == atomsk.x) dot *= -invMassj1/(invMassj0+invMassj1); else dot = 0.0; matrix[j][k] = dot; } matrix[j][j] = 1.0; } // Invert it using SVD. Array2D u, v; Array1D w; SVD svd(matrix); svd.getU(u); svd.getV(v); svd.getSingularValues(w); double singularValueCutoff = 0.01*w[0]; for (unsigned int j = 0; j < size; j++) w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]); for (unsigned int j = 0; j < size; j++) { for (unsigned int k = 0; k < size; k++) { matrix[j][k] = 0.0; for (unsigned int m = 0; m < size; m++) matrix[j][k] += v[j][m]*w[m]*u[k][m]; } } // Record the inverted matrix. (*gpu->psRigidClusterMatrixIndex)[i] = elementIndex; for (unsigned int j = 0; j < size; j++) { float distance1 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+j]].w; for (unsigned int k = 0; k < size; k++) { float distance2 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+k]].w; (*gpu->psRigidClusterMatrix)[elementIndex++] = matrix[k][j]*distance1/distance2; } } } (*gpu->psRigidClusterMatrixIndex)[gpu->sim.rigidClusters] = elementIndex; gpu->psRigidClusterMatrix->Upload(); gpu->psRigidClusterMatrixIndex->Upload(); gpu->hasInitializedRigidClusters = true; } void kApplyFirstCShake(gpuContext gpu) { // printf("kApplyFirstCShake\n"); if (gpu->sim.lincsConstraints > 0) { if (!gpu->hasInitializedRigidClusters) initInverseMatrices(gpu); kApplyCShake_kernel<<sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true); LAUNCHERROR("kApplyCShake"); } } void kApplySecondCShake(gpuContext gpu) { // printf("kApplySecondCShake\n"); if (gpu->sim.lincsConstraints > 0) { kApplyCShake_kernel<<sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosq, false); LAUNCHERROR("kApplyCShake"); } }