Commit f329a047 authored by Peter Eastman's avatar Peter Eastman
Browse files

Completed Cuda implementation of CSHAKE algorithm

parent 4ef9536e
...@@ -13,6 +13,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) ...@@ -13,6 +13,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include) CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include)
CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src) CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src)
ENDFOREACH(subdir) ENDFOREACH(subdir)
CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
......
...@@ -356,6 +356,9 @@ struct cudaGmxSimulation { ...@@ -356,6 +356,9 @@ struct cudaGmxSimulation {
unsigned int ShakeConstraints; // Total number of Shake constraints unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int settleConstraints; // Total number of Settle constraints unsigned int settleConstraints; // Total number of Settle constraints
unsigned int lincsConstraints; // Total number of LINCS constraints. unsigned int lincsConstraints; // Total number of LINCS constraints.
unsigned int rigidClusters; // Total number of rigid clusters
unsigned int maxRigidClusterSize; // The size of the largest rigid cluster
unsigned int clusterShakeBlockSize; // The number of threads to process each rigid cluster
unsigned int NonShakeConstraints; // Total number of NonShake atoms unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system unsigned int degreesOfFreedom; // Number of degrees of freedom in system
...@@ -391,6 +394,10 @@ struct cudaGmxSimulation { ...@@ -391,6 +394,10 @@ struct cudaGmxSimulation {
short* pSyncCounter; // Used for global thread synchronization short* pSyncCounter; // Used for global thread synchronization
unsigned int* pRequiredIterations; // Used by SHAKE to communicate whether iteration has converged unsigned int* pRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
float* pShakeReducedMass; // The reduced mass for each SHAKE constraint float* pShakeReducedMass; // The reduced mass for each SHAKE constraint
int* pRigidClusterConstraints; // The constraints in each rigid cluster
float* pRigidClusterMatrix; // The inverse constraint matrix for each rigid cluster
unsigned int* pRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
unsigned int* pRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
// Mutable stuff // Mutable stuff
float4* pPosq; // Pointer to atom positions and charges float4* pPosq; // Pointer to atom positions and charges
......
...@@ -40,6 +40,7 @@ ...@@ -40,6 +40,7 @@
#include <sstream> #include <sstream>
#include <cmath> #include <cmath>
#include <map> #include <map>
#include <set>
#include <algorithm> #include <algorithm>
#ifdef WIN32 #ifdef WIN32
#include <windows.h> #include <windows.h>
...@@ -121,8 +122,6 @@ static const float BOLTZ = (RGAS / KILO); // (k ...@@ -121,8 +122,6 @@ static const float BOLTZ = (RGAS / KILO); // (k
#define DUMP_PARAMETERS 0 #define DUMP_PARAMETERS 0
#define DeltaShake
extern "C" extern "C"
void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k) void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
{ {
...@@ -458,7 +457,7 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie ...@@ -458,7 +457,7 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor; gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
} }
void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake) static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
{ {
cluster.valid = false; cluster.valid = false;
invalidForShake[cluster.centralID] = true; invalidForShake[cluster.centralID] = true;
...@@ -470,6 +469,116 @@ void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allC ...@@ -470,6 +469,116 @@ void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allC
} }
} }
static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, const vector<int>& constraintIndices)
{
vector<map<int, int> > atomConstraints(firstAtom.size());
for (int i = 0; i < constraintIndices.size(); i++) {
atomConstraints[firstAtom[i]][secondAtom[i]] = constraintIndices[i];
atomConstraints[secondAtom[i]][firstAtom[i]] = constraintIndices[i];
}
vector<vector<int> > rigidClusters;
int totalConstraints = 0;
int totalMatrixElements = 0;
for (int i = 0; i < firstAtom.size(); i++) {
if (atomConstraints[i].size() < 2)
continue;
// Begin by looking for a triangle this atom is part of.
set<int> atoms;
atoms.insert(i);
for (map<int, int>::const_iterator atom1 = atomConstraints[i].begin(); atom1 != atomConstraints[i].end() && atoms.size() == 1; ++atom1) {
for (map<int, int>::const_iterator atom2 = atomConstraints[atom1->first].begin(); atom2 != atomConstraints[atom1->first].end(); ++atom2) {
if (atomConstraints[i].count(atom2->first) != 0) {
atoms.insert(atom1->first);
atoms.insert(atom2->first);
break;
}
}
}
if (atoms.size() == 1)
continue;
// We have three atoms that are part of a cluster, so look for other atoms we can add.
bool done = false;
while (!done) {
done = true;
for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
if (atoms.find(atom2->first) != atoms.end())
continue; // This atom is already in the cluster.
// See if this atom is linked to three other atoms in the cluster.
int linkCount = 0;
for (map<int, int>::const_iterator atom3 = atomConstraints[atom2->first].begin(); atom3 != atomConstraints[atom2->first].end(); ++atom3)
if (atoms.find(atom3->first) != atoms.end())
linkCount++;
if (linkCount > 2) {
atoms.insert(atom2->first);
done = false;
}
}
}
}
// Record the cluster.
vector<int> constraints;
for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
if (*atom1 < atom2->first && atoms.find(atom2->first) != atoms.end())
constraints.push_back(atom2->second);
}
}
rigidClusters.push_back(constraints);
totalConstraints += constraints.size();
totalMatrixElements += constraints.size()*constraints.size();
for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2)
atomConstraints[atom2->first].erase(*atom1);
atomConstraints[*atom1].clear();
}
}
// Build the CUDA streams.
CUDAStream<int>* psRigidClusterConstraints = new CUDAStream<int>(totalConstraints, 1, "RigidClusterConstraints");
gpu->psRigidClusterConstraints = psRigidClusterConstraints;
gpu->sim.pRigidClusterConstraints = psRigidClusterConstraints->_pDevData;
CUDAStream<unsigned int>* psRigidClusterConstraintIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterConstraintIndex");
gpu->psRigidClusterConstraintIndex = psRigidClusterConstraintIndex;
gpu->sim.pRigidClusterConstraintIndex = psRigidClusterConstraintIndex->_pDevData;
CUDAStream<float>* psRigidClusterMatrix = new CUDAStream<float>(totalMatrixElements, 1, "RigidClusterMatrix");
gpu->psRigidClusterMatrix = psRigidClusterMatrix;
gpu->sim.pRigidClusterMatrix = psRigidClusterMatrix->_pDevData;
CUDAStream<unsigned int>* psRigidClusterMatrixIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterMatrixIndex");
gpu->psRigidClusterMatrixIndex = psRigidClusterMatrixIndex;
gpu->sim.pRigidClusterMatrixIndex = psRigidClusterMatrixIndex->_pDevData;
unsigned int constraintIndex = 0;
unsigned int maxClusterSize = 0;
for (unsigned int i = 0; i < rigidClusters.size(); i++) {
vector<int>& cluster = rigidClusters[i];
(*psRigidClusterConstraintIndex)[i] = constraintIndex;
for (unsigned int j = 0; j < cluster.size(); j++)
(*psRigidClusterConstraints)[constraintIndex++] = cluster[j];
if (cluster.size() > maxClusterSize)
maxClusterSize = cluster.size();
}
(*psRigidClusterConstraintIndex)[rigidClusters.size()] = constraintIndex;
gpu->sim.rigidClusters = rigidClusters.size();
gpu->sim.maxRigidClusterSize = maxClusterSize;
gpu->sim.clusterShakeBlockSize = 1;
while (gpu->sim.clusterShakeBlockSize < maxClusterSize)
gpu->sim.clusterShakeBlockSize *= 2;
if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
psRigidClusterConstraints->Upload();
psRigidClusterConstraintIndex->Upload();
gpu->hasInitializedRigidClusters = false;
}
extern "C" extern "C"
void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance, void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
const vector<float>& invMass1, const vector<float>& invMass2, float shakeTolerance, unsigned int lincsTerms) const vector<float>& invMass1, const vector<float>& invMass2, float shakeTolerance, unsigned int lincsTerms)
...@@ -730,7 +839,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -730,7 +839,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution"); CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution");
gpu->psLincsSolution = psLincsSolution; gpu->psLincsSolution = psLincsSolution;
gpu->sim.pLincsSolution = psLincsSolution->_pDevData; gpu->sim.pLincsSolution = psLincsSolution->_pDevData;
CUDAStream<short>* psSyncCounter = new CUDAStream<short>(2*gpu->sim.blocks, 1, "SyncCounter"); CUDAStream<short>* psSyncCounter = new CUDAStream<short>(3*gpu->sim.blocks, 1, "SyncCounter");
gpu->psSyncCounter = psSyncCounter; gpu->psSyncCounter = psSyncCounter;
gpu->sim.pSyncCounter = psSyncCounter->_pDevData; gpu->sim.pSyncCounter = psSyncCounter->_pDevData;
CUDAStream<unsigned int>* psRequiredIterations = new CUDAStream<unsigned int>(1, 1, "RequiredIterations"); CUDAStream<unsigned int>* psRequiredIterations = new CUDAStream<unsigned int>(1, 1, "RequiredIterations");
...@@ -774,14 +883,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -774,14 +883,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks) if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
gpu->sim.lincs_threads_per_block = gpu->sim.blocks; gpu->sim.lincs_threads_per_block = gpu->sim.blocks;
// Identify rigid clusters of atoms.
gpu->psLincsNumConnections->Download(); findRigidClusters(gpu, atom1, atom2, lincsConstraints);
gpu->psLincsConnections->Download();
gpu->psLincsCoupling->Download();
#ifdef DeltaShake
// count number of atoms w/o constraint // count number of atoms w/o constraint
...@@ -820,7 +924,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -820,7 +924,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
} else { } else {
gpu->sim.nonshake_threads_per_block = 0; gpu->sim.nonshake_threads_per_block = 0;
} }
#endif
} }
extern "C" extern "C"
...@@ -1152,6 +1255,10 @@ void* gpuInit(int numAtoms) ...@@ -1152,6 +1255,10 @@ void* gpuInit(int numAtoms)
gpu->psSyncCounter = NULL; gpu->psSyncCounter = NULL;
gpu->psRequiredIterations = NULL; gpu->psRequiredIterations = NULL;
gpu->psShakeReducedMass = NULL; gpu->psShakeReducedMass = NULL;
gpu->psRigidClusterConstraints = NULL;
gpu->psRigidClusterConstraintIndex = NULL;
gpu->psRigidClusterMatrix = NULL;
gpu->psRigidClusterMatrixIndex = NULL;
// Initialize output buffer before reading parameters // Initialize output buffer before reading parameters
gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms]; gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms];
...@@ -1304,6 +1411,10 @@ void gpuShutDown(gpuContext gpu) ...@@ -1304,6 +1411,10 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psSyncCounter; delete gpu->psSyncCounter;
delete gpu->psRequiredIterations; delete gpu->psRequiredIterations;
delete gpu->psShakeReducedMass; delete gpu->psShakeReducedMass;
delete gpu->psRigidClusterConstraints;
delete gpu->psRigidClusterConstraintIndex;
delete gpu->psRigidClusterMatrix;
delete gpu->psRigidClusterMatrixIndex;
if (gpu->cudpp != 0) if (gpu->cudpp != 0)
cudppDestroyPlan(gpu->cudpp); cudppDestroyPlan(gpu->cudpp);
......
...@@ -80,6 +80,7 @@ struct _gpuContext { ...@@ -80,6 +80,7 @@ struct _gpuContext {
bool bRecalculateBornRadii; bool bRecalculateBornRadii;
bool bOutputBufferPerWarp; bool bOutputBufferPerWarp;
bool bIncludeGBSA; bool bIncludeGBSA;
bool hasInitializedRigidClusters;
unsigned long seed; unsigned long seed;
SM_VERSION sm_version; SM_VERSION sm_version;
CUDPPHandle cudpp; CUDPPHandle cudpp;
...@@ -143,6 +144,10 @@ struct _gpuContext { ...@@ -143,6 +144,10 @@ struct _gpuContext {
CUDAStream<short>* psSyncCounter; // Used for global thread synchronization CUDAStream<short>* psSyncCounter; // Used for global thread synchronization
CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
CUDAStream<float>* psShakeReducedMass; // The reduced mass for each SHAKE constraint CUDAStream<float>* psShakeReducedMass; // The reduced mass for each SHAKE constraint
CUDAStream<int>* psRigidClusterConstraints; // The constraints in each rigid cluster
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
}; };
typedef struct _gpuContext *gpuContext; typedef struct _gpuContext *gpuContext;
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <cuda.h>
#include <vector_functions.h>
#include <vector>
#include "jama_svd.h"
#include "gputypes.h"
using namespace std;
using TNT::Array2D;
using JAMA::SVD;
static __constant__ cudaGmxSimulation cSim;
void SetCShakeSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetCShakeSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
/**
* Synchronize all threads across all blocks.
*/
__device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
{
__syncthreads();
if (threadIdx.x == 0)
syncCounter[blockIdx.x] = newCount;
if (threadIdx.x < gridDim.x)
{
volatile short* counter = &syncCounter[threadIdx.x];
do
{
} while (*counter != newCount);
}
__syncthreads();
}
__global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
{
extern __shared__ float temp[];
// Initialize counters used for monitoring convergence and doing global thread synchronization.
__shared__ unsigned int requiredIterations;
if (threadIdx.x == 0)
{
requiredIterations = 0;
cSim.pSyncCounter[gridDim.x+blockIdx.x] = -1;
cSim.pSyncCounter[2*gridDim.x+blockIdx.x] = -1;
cSim.pRequiredIterations[0] = 0;
}
// Calculate the direction of each constraint.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.lincsConstraints)
{
int2 atoms = cSim.pLincsAtoms[pos];
float4 dir = cSim.pLincsDistance[pos];
float4 oldPos1 = cSim.pOldPosq[atoms.x];
float4 oldPos2 = cSim.pOldPosq[atoms.y];
dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z;
cSim.pLincsDistance[pos] = dir;
pos += blockDim.x*gridDim.x;
}
__syncthreads();
// Iteratively update the atom positions.
unsigned int maxIterations = 150;
float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
for (unsigned int iteration = 0; iteration < maxIterations && iteration == requiredIterations; iteration++)
{
// Calculate the constraint force for each constraint.
pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.lincsConstraints)
{
int2 atoms = cSim.pLincsAtoms[pos];
float4 delta1 = atomPositions[atoms.x];
float4 delta2 = atomPositions[atoms.y];
float4 dir = cSim.pLincsDistance[pos];
float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
if (addOldPosition)
{
rp_ij.x += dir.x;
rp_ij.y += dir.y;
rp_ij.z += dir.z;
}
float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
float dist2 = dir.w*dir.w;
float diff = dist2 - rp2;
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
float reducedMass = cSim.pShakeReducedMass[pos];
cSim.pLincsSolution[pos] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
if (requiredIterations == iteration && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
requiredIterations = iteration+1;
pos += blockDim.x * gridDim.x;
}
kSyncAllThreads_kernel(cSim.pSyncCounter, iteration);
if (threadIdx.x == 0 && requiredIterations > iteration)
cSim.pRequiredIterations[0] = requiredIterations;
// Multiply by the inverse constraint matrix for each rigid cluster.
if (cSim.rigidClusters > 0)
{
pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int block = pos/cSim.clusterShakeBlockSize;
unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize;
while (block < cSim.rigidClusters)
{
unsigned int firstIndex = cSim.pRigidClusterConstraintIndex[block];
unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstIndex;
if (indexInBlock < blockSize)
{
// Load the constraint forces and matrix.
unsigned int constraint = cSim.pRigidClusterConstraints[firstIndex+indexInBlock];
temp[threadIdx.x] = cSim.pLincsSolution[constraint];
unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block];
// Multiply by the matrix.
float sum = 0.0f;
for (unsigned int i = 0; i < blockSize; i++)
sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock];
cSim.pLincsSolution[constraint] = sum;
}
block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize;
}
kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration);
}
// Update the position of each atom.
pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.atoms)
{
float4 atomPos = atomPositions[pos];
float invMass = cSim.pVelm4[pos].w;
int num = cSim.pLincsNumAtomConstraints[pos];
for (int i = 0; i < num; i++)
{
int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index];
float constraintForce = invMass*cSim.pLincsSolution[constraint];
constraintForce = (cSim.pLincsAtoms[constraint].x == pos ? constraintForce : -constraintForce);
float4 dir = cSim.pLincsDistance[constraint];
atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z;
}
atomPositions[pos] = atomPos;
pos += blockDim.x*gridDim.x;
}
kSyncAllThreads_kernel(&cSim.pSyncCounter[2*gridDim.x], iteration);
requiredIterations = cSim.pRequiredIterations[0];
}
// Reset the initial sync counter to be ready for the next call.
if (threadIdx.x == 0)
cSim.pSyncCounter[blockIdx.x] = -1;
}
static void initInverseMatrices(gpuContext gpu)
{
// Build the inverse constraint matrix for each cluster.
gpu->psPosq4->Download();
gpu->psVelm4->Download();
unsigned int elementIndex = 0;
for (unsigned int i = 0; i < gpu->sim.rigidClusters; i++) {
// Compute the constraint coupling matrix for this cluster.
unsigned int startIndex = (*gpu->psRigidClusterConstraintIndex)[i];
unsigned int endIndex = (*gpu->psRigidClusterConstraintIndex)[i+1];
unsigned int size = endIndex-startIndex;
vector<float3> r(size);
for (unsigned int j = 0; j < size; j++) {
int2 atoms = (*gpu->psLincsAtoms)[(*gpu->psRigidClusterConstraints)[startIndex+j]];
float4 pos1 = (*gpu->psPosq4)[atoms.x];
float4 pos2 = (*gpu->psPosq4)[atoms.y];
r[j] = make_float3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
float invLength = 1.0f/sqrt(r[j].x*r[j].x + r[j].y*r[j].y + r[j].z*r[j].z);
r[j].x *= invLength;
r[j].y *= invLength;
r[j].z *= invLength;
}
Array2D<double> matrix(size, size);
for (unsigned int j = 0; j < size; j++) {
int constraintj = (*gpu->psRigidClusterConstraints)[startIndex+j];
int2 atomsj = (*gpu->psLincsAtoms)[constraintj];
for (unsigned int k = 0; k < size; k++) {
int constraintk = (*gpu->psRigidClusterConstraints)[startIndex+k];
int2 atomsk = (*gpu->psLincsAtoms)[constraintk];
float invMassj0 = (*gpu->psVelm4)[atomsj.x].w;
float invMassj1 = (*gpu->psVelm4)[atomsj.y].w;
double dot = r[j].x*r[k].x + r[j].y*r[k].y + r[j].z*r[k].z;
if (atomsj.x == atomsk.x)
dot *= invMassj0/(invMassj0+invMassj1);
else if (atomsj.y == atomsk.y)
dot *= invMassj1/(invMassj0+invMassj1);
else if (atomsj.x == atomsk.y)
dot *= -invMassj0/(invMassj0+invMassj1);
else if (atomsj.y == atomsk.x)
dot *= -invMassj1/(invMassj0+invMassj1);
else
dot = 0.0;
matrix[j][k] = dot;
}
matrix[j][j] = 1.0;
}
// Invert it using SVD.
Array2D<double> u, v;
Array1D<double> w;
SVD<double> svd(matrix);
svd.getU(u);
svd.getV(v);
svd.getSingularValues(w);
double singularValueCutoff = 0.01*w[0];
for (unsigned int j = 0; j < size; j++)
w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]);
for (unsigned int j = 0; j < size; j++) {
for (unsigned int k = 0; k < size; k++) {
matrix[j][k] = 0.0;
for (unsigned int m = 0; m < size; m++)
matrix[j][k] += v[j][m]*w[m]*u[k][m];
}
}
// Record the inverted matrix.
(*gpu->psRigidClusterMatrixIndex)[i] = elementIndex;
for (unsigned int j = 0; j < size; j++)
{
float distance1 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+j]].w;
for (unsigned int k = 0; k < size; k++)
{
float distance2 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+k]].w;
(*gpu->psRigidClusterMatrix)[elementIndex++] = matrix[k][j]*distance1/distance2;
}
}
}
(*gpu->psRigidClusterMatrixIndex)[gpu->sim.rigidClusters] = elementIndex;
gpu->psRigidClusterMatrix->Upload();
gpu->psRigidClusterMatrixIndex->Upload();
gpu->hasInitializedRigidClusters = true;
}
void kApplyFirstCShake(gpuContext gpu)
{
// printf("kApplyFirstCShake\n");
if (gpu->sim.lincsConstraints > 0)
{
if (!gpu->hasInitializedRigidClusters)
initInverseMatrices(gpu);
kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true);
LAUNCHERROR("kApplyCShake");
}
}
void kApplySecondCShake(gpuContext gpu)
{
// printf("kApplySecondCShake\n");
if (gpu->sim.lincsConstraints > 0)
{
kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosq, false);
LAUNCHERROR("kApplyCShake");
}
}
...@@ -13,6 +13,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) ...@@ -13,6 +13,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include) CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include)
CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src) CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src)
ENDFOREACH(subdir) ENDFOREACH(subdir)
CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
CUDA_ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) CUDA_ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
......
...@@ -317,8 +317,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo ...@@ -317,8 +317,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
static const RealOpenMM zero = 0.0; static const RealOpenMM zero = 0.0;
static const RealOpenMM one = 1.0; static const RealOpenMM one = 1.0;
static const RealOpenMM two = 2.0; static const RealOpenMM two = 2.0;
static const RealOpenMM three = 3.0;
static const RealOpenMM oneM = -1.0;
static const RealOpenMM half = 0.5; static const RealOpenMM half = 0.5;
static const RealOpenMM epsilon6 = (RealOpenMM) 1.0e-06; static const RealOpenMM epsilon6 = (RealOpenMM) 1.0e-06;
...@@ -327,8 +325,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo ...@@ -327,8 +325,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
int numberOfConstraints = getNumberOfConstraints();
// temp arrays // temp arrays
RealOpenMM** r_ij = _r_ij; RealOpenMM** r_ij = _r_ij;
...@@ -345,7 +341,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo ...@@ -345,7 +341,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
int atomJ = _atomIndices[ii][1]; int atomJ = _atomIndices[ii][1];
reducedMasses[ii] = half/( inverseMasses[atomI] + inverseMasses[atomJ] ); reducedMasses[ii] = half/( inverseMasses[atomI] + inverseMasses[atomJ] );
} }
vector<double> temp;
for (unsigned int i = 0; i < _rigidClusters.size(); i++) { for (unsigned int i = 0; i < _rigidClusters.size(); i++) {
// Compute the constraint coupling matrix for this cluster. // Compute the constraint coupling matrix for this cluster.
...@@ -364,8 +359,8 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo ...@@ -364,8 +359,8 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
r[j][2] *= invLength; r[j][2] *= invLength;
} }
Array2D<double> matrix(size, size); Array2D<double> matrix(size, size);
for (int j = 0; j < (int)size; j++) { for (unsigned int j = 0; j < size; j++) {
for (int k = 0; k < (int)size; k++) { for (unsigned int k = 0; k < size; k++) {
double dot; double dot;
int atomj0 = _atomIndices[cluster[j]][0]; int atomj0 = _atomIndices[cluster[j]][0];
int atomj1 = _atomIndices[cluster[j]][1]; int atomj1 = _atomIndices[cluster[j]][1];
...@@ -395,23 +390,21 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo ...@@ -395,23 +390,21 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
svd.getV(v); svd.getV(v);
svd.getSingularValues(w); svd.getSingularValues(w);
double singularValueCutoff = 0.01*w[0]; double singularValueCutoff = 0.01*w[0];
for (int j = 0; j < (int)size; j++) for (unsigned int j = 0; j < size; j++)
w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]); w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]);
if (temp.size() < size) for (unsigned int j = 0; j < size; j++) {
temp.resize(size); for (unsigned int k = 0; k < size; k++) {
for (int j = 0; j < (int)size; j++) {
for (int k = 0; k < (int)size; k++) {
matrix[j][k] = 0.0; matrix[j][k] = 0.0;
for (int m = 0; m < (int)size; m++) for (unsigned int m = 0; m < size; m++)
matrix[j][k] += v[j][m]*w[m]*u[k][m]; matrix[j][k] += v[j][m]*w[m]*u[k][m];
} }
} }
// Record the inverted matrix. // Record the inverted matrix.
for (int j = 0; j < (int)size; j++) for (unsigned int j = 0; j < size; j++)
for (int k = 0; k < (int)size; k++) for (unsigned int k = 0; k < size; k++)
_matrices[i][j][k] = (RealOpenMM)matrix[j][k]; _matrices[i][j][k] = (RealOpenMM)matrix[j][k]*_distance[cluster[k]]/_distance[cluster[j]];
} }
} }
...@@ -478,12 +471,12 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo ...@@ -478,12 +471,12 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
unsigned int size = cluster.size(); unsigned int size = cluster.size();
if (size > tempForce.size()) if (size > tempForce.size())
tempForce.resize(size); tempForce.resize(size);
for (int j = 0; j < (int)size; j++) { for (unsigned int j = 0; j < size; j++) {
tempForce[j] = zero; tempForce[j] = zero;
for (int k = 0; k < (int)size; k++) for (unsigned int k = 0; k < size; k++)
tempForce[j] += matrix[j][k]*constraintForce[cluster[k]]*_distance[cluster[k]]/_distance[cluster[j]]; tempForce[j] += matrix[j][k]*constraintForce[cluster[k]];
} }
for (int j = 0; j < (int)size; j++) for (unsigned int j = 0; j < size; j++)
constraintForce[cluster[j]] = tempForce[j]; constraintForce[cluster[j]] = tempForce[j];
} }
...@@ -499,15 +492,6 @@ RealOpenMM damping = one;//(RealOpenMM) (iterations%2 == 0 ? 0.5 : 1.0); ...@@ -499,15 +492,6 @@ RealOpenMM damping = one;//(RealOpenMM) (iterations%2 == 0 ? 0.5 : 1.0);
} }
} }
} }
// static int sum = 0;
// static int count = 0;
// sum += iterations;
// count++;
// if (count == 100) {
// printf("%d iterations\n", sum);
// sum = 0;
// count = 0;
// }
// diagnostics // diagnostics
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment