Commit 47a6fb1e authored by Peter Eastman's avatar Peter Eastman
Browse files

Created initial CUDA implementation of new constraint algorithm

parent fa66c669
...@@ -402,6 +402,8 @@ struct cudaGmxSimulation { ...@@ -402,6 +402,8 @@ struct cudaGmxSimulation {
float* pRigidClusterMatrix; // The inverse constraint matrix for each rigid cluster float* pRigidClusterMatrix; // The inverse constraint matrix for each rigid cluster
unsigned int* pRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints. unsigned int* pRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
unsigned int* pRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices. unsigned int* pRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix.
float* pConstraintMatrixValue; // The value of each element in the constraint matrix.
// Mutable stuff // Mutable stuff
float4* pPosq; // Pointer to atom positions and charges float4* pPosq; // Pointer to atom positions and charges
......
...@@ -49,6 +49,7 @@ using namespace std; ...@@ -49,6 +49,7 @@ using namespace std;
#include "hilbert.h" #include "hilbert.h"
#include "openmm/OpenMMException.h" #include "openmm/OpenMMException.h"
#include "jama_svd.h" #include "jama_svd.h"
#include "quern.h"
using OpenMM::OpenMMException; using OpenMM::OpenMMException;
using TNT::Array2D; using TNT::Array2D;
...@@ -562,22 +563,22 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons ...@@ -562,22 +563,22 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
// Reorder the constraints so those in a cluster are sequential. // Reorder the constraints so those in a cluster are sequential.
vector<int> constraintOrder(constraintIndices.size()); // vector<int> constraintOrder(constraintIndices.size());
vector<int> clusterStartIndex(rigidClusters.size()); // vector<int> clusterStartIndex(rigidClusters.size());
set<int> inCluster; // set<int> inCluster;
int nextIndex = 0; // int nextIndex = 0;
for (int i = 0; i < (int) rigidClusters.size(); ++i) { // for (int i = 0; i < (int) rigidClusters.size(); ++i) {
clusterStartIndex[i] = nextIndex; // clusterStartIndex[i] = nextIndex;
for (int j = 0; j < (int) rigidClusters[i].size(); ++j) { // for (int j = 0; j < (int) rigidClusters[i].size(); ++j) {
int constraint = rigidClusters[i][j]; // int constraint = rigidClusters[i][j];
constraintOrder[nextIndex++] = constraint; // constraintOrder[nextIndex++] = constraint;
inCluster.insert(constraint); // inCluster.insert(constraint);
} // }
} // }
for (int i = 0; i < (int) constraintIndices.size(); ++i) // for (int i = 0; i < (int) constraintIndices.size(); ++i)
if (inCluster.find(constraintIndices[i]) == inCluster.end()) // if (inCluster.find(constraintIndices[i]) == inCluster.end())
constraintOrder[nextIndex++] = constraintIndices[i]; // constraintOrder[nextIndex++] = constraintIndices[i];
constraintIndices = constraintOrder; // constraintIndices = constraintOrder;
// Build the CUDA streams. // Build the CUDA streams.
...@@ -931,6 +932,132 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -931,6 +932,132 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
for (unsigned i = 0; i < atomConstraints.size(); i++) for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size()); maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Compute the constraint coupling matrix
vector<vector<int> > atomAngles(gpu->natoms);
for (int i = 0; i < gpu->sim.bond_angles; i++)
atomAngles[(*gpu->psBondAngleID1)[i].y].push_back(i);
vector<vector<pair<int, double> > > matrix(numLincs);
if (numLincs > 0) {
for (int j = 0; j < numLincs; j++) {
for (int k = 0; k < numLincs; k++) {
if (j == k) {
matrix[j].push_back(pair<int, double>(j, 1.0));
continue;
}
double scale;
int atomj0 = atom1[j];
int atomj1 = atom2[j];
int atomk0 = atom1[k];
int atomk1 = atom2[k];
int atoma, atomb, atomc;
if (atomj0 == atomk0) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk1;
scale = invMass1[j]/(invMass1[j]+invMass2[j]);
}
else if (atomj1 == atomk1) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk0;
scale = invMass2[j]/(invMass1[j]+invMass2[j]);
}
else if (atomj0 == atomk1) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk0;
scale = invMass1[j]/(invMass1[j]+invMass2[j]);
}
else if (atomj1 == atomk0) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk1;
scale = invMass2[j]/(invMass1[j]+invMass2[j]);
}
else
continue; // These constraints are not connected.
// Look for a third constraint forming a triangle with these two.
bool foundConstraint = false;
for (int other = 0; other < numLincs; other++) {
if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
double d1 = distance[j];
double d2 = distance[k];
double d3 = distance[other];
matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
foundConstraint = true;
break;
}
}
if (!foundConstraint) {
// We didn't find one, so look for an angle force field term.
const vector<int>& angleCandidates = atomAngles[atomb];
for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
int4 atoms = (*gpu->psBondAngleID1)[*iter];
if ((atoms.x == atoma && atoms.z == atomc) || (atoms.z == atoma && atoms.x == atomc)) {
double angle = (*gpu->psBondAngleParameter)[*iter].x;
matrix[j].push_back(pair<int, double>(k, scale*cos(angle*PI/180.0)));
break;
}
}
}
}
}
// Invert it using QR.
vector<int> matrixRowStart;
vector<int> matrixColIndex;
vector<double> matrixValue;
for (int i = 0; i < numLincs; i++) {
matrixRowStart.push_back(matrixValue.size());
for (int j = 0; j < (int) matrix[i].size(); j++) {
pair<int, double> element = matrix[i][j];
matrixColIndex.push_back(element.first);
matrixValue.push_back(element.second);
}
}
matrixRowStart.push_back(matrixValue.size());
int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
double *qValue, *rValue;
int result = QUERN_compute_qr(numLincs, numLincs, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
&qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
vector<double> rhs(numLincs);
matrix.clear();
matrix.resize(numLincs);
for (int i = 0; i < numLincs; i++) {
// Extract column i of the inverse matrix.
for (int j = 0; j < numLincs; j++)
rhs[j] = (i == j ? 1.0 : 0.0);
result = QUERN_multiply_with_q_transpose(numLincs, qRowStart, qColIndex, qValue, &rhs[0]);
result = QUERN_solve_with_r(numLincs, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numLincs; j++) {
double value = rhs[j]*distance[i]/distance[j];
if (abs(value) > 0.02)
matrix[j].push_back(pair<int, double>(i, value));
}
}
QUERN_free_result(qRowStart, qColIndex, qValue);
QUERN_free_result(rRowStart, rColIndex, rValue);
}
int maxRowElements = 0;
for (unsigned i = 0; i < matrix.size(); i++)
maxRowElements = max(maxRowElements, (int) matrix[i].size());
maxRowElements++;
// Fill in the CUDA streams. // Fill in the CUDA streams.
CUDAStream<int2>* psLincsAtoms = new CUDAStream<int2>(numLincs, 1, "LincsAtoms"); CUDAStream<int2>* psLincsAtoms = new CUDAStream<int2>(numLincs, 1, "LincsAtoms");
...@@ -975,6 +1102,12 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -975,6 +1102,12 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psShakeReducedMass = new CUDAStream<float>(numLincs, 1, "LincsSolution"); CUDAStream<float>* psShakeReducedMass = new CUDAStream<float>(numLincs, 1, "LincsSolution");
gpu->psShakeReducedMass = psShakeReducedMass; gpu->psShakeReducedMass = psShakeReducedMass;
gpu->sim.pShakeReducedMass = psShakeReducedMass->_pDevData; gpu->sim.pShakeReducedMass = psShakeReducedMass->_pDevData;
CUDAStream<unsigned int>* psConstraintMatrixColumn = new CUDAStream<unsigned int>(numLincs*maxRowElements, 1, "ConstraintMatrixColumn");
gpu->psConstraintMatrixColumn = psConstraintMatrixColumn;
gpu->sim.pConstraintMatrixColumn = psConstraintMatrixColumn->_pDevData;
CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numLincs*maxRowElements, 1, "ConstraintMatrixValue");
gpu->psConstraintMatrixValue = psConstraintMatrixValue;
gpu->sim.pConstraintMatrixValue = psConstraintMatrixValue->_pDevData;
gpu->sim.lincsConstraints = numLincs; gpu->sim.lincsConstraints = numLincs;
for (int i = 0; i < numLincs; i++) { for (int i = 0; i < numLincs; i++) {
int c = lincsConstraints[i]; int c = lincsConstraints[i];
...@@ -986,6 +1119,11 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -986,6 +1119,11 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
(*psLincsNumConnections)[i] = linkedConstraints[i].size(); (*psLincsNumConnections)[i] = linkedConstraints[i].size();
for (unsigned int j = 0; j < linkedConstraints[i].size(); j++) for (unsigned int j = 0; j < linkedConstraints[i].size(); j++)
(*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j]; (*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j];
for (unsigned int j = 0; j < matrix[i].size(); j++) {
(*psConstraintMatrixColumn)[i+j*numLincs] = matrix[i][j].first;
(*psConstraintMatrixValue)[i+j*numLincs] = matrix[i][j].second;
}
(*psConstraintMatrixColumn)[i+matrix[i].size()*numLincs] = numLincs;
} }
for (unsigned int i = 0; i < psSyncCounter->_length; i++) for (unsigned int i = 0; i < psSyncCounter->_length; i++)
(*psSyncCounter)[i] = -1; (*psSyncCounter)[i] = -1;
...@@ -1005,6 +1143,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1005,6 +1143,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
psLincsAtomConstraints->Upload(); psLincsAtomConstraints->Upload();
psLincsNumAtomConstraints->Upload(); psLincsNumAtomConstraints->Upload();
psSyncCounter->Upload(); psSyncCounter->Upload();
psConstraintMatrixColumn->Upload();
psConstraintMatrixValue->Upload();
gpu->sim.lincsTerms = lincsTerms; gpu->sim.lincsTerms = lincsTerms;
gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks; gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.lincs_threads_per_block > gpu->sim.threads_per_block) if (gpu->sim.lincs_threads_per_block > gpu->sim.threads_per_block)
...@@ -1391,6 +1531,8 @@ void* gpuInit(int numAtoms) ...@@ -1391,6 +1531,8 @@ void* gpuInit(int numAtoms)
gpu->psRigidClusterConstraintIndex = NULL; gpu->psRigidClusterConstraintIndex = NULL;
gpu->psRigidClusterMatrix = NULL; gpu->psRigidClusterMatrix = NULL;
gpu->psRigidClusterMatrixIndex = NULL; gpu->psRigidClusterMatrixIndex = NULL;
gpu->psConstraintMatrixColumn = NULL;
gpu->psConstraintMatrixValue = NULL;
// Initialize output buffer before reading parameters // Initialize output buffer before reading parameters
gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms]; gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms];
...@@ -1492,9 +1634,9 @@ void gpuShutDown(gpuContext gpu) ...@@ -1492,9 +1634,9 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psxVector4; delete gpu->psxVector4;
delete gpu->psvVector4; delete gpu->psvVector4;
delete gpu->psSigEps2; delete gpu->psSigEps2;
delete gpu->psEwaldEikr; delete gpu->psEwaldEikr;
delete gpu->psEwaldStructureFactor; delete gpu->psEwaldStructureFactor;
delete gpu->psEwaldCosSinSum; delete gpu->psEwaldCosSinSum;
delete gpu->psObcData; delete gpu->psObcData;
delete gpu->psObcChain; delete gpu->psObcChain;
delete gpu->psBornForce; delete gpu->psBornForce;
...@@ -1549,6 +1691,8 @@ void gpuShutDown(gpuContext gpu) ...@@ -1549,6 +1691,8 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psRigidClusterConstraintIndex; delete gpu->psRigidClusterConstraintIndex;
delete gpu->psRigidClusterMatrix; delete gpu->psRigidClusterMatrix;
delete gpu->psRigidClusterMatrixIndex; delete gpu->psRigidClusterMatrixIndex;
delete gpu->psConstraintMatrixColumn;
delete gpu->psConstraintMatrixValue;
if (gpu->cudpp != 0) if (gpu->cudpp != 0)
cudppDestroyPlan(gpu->cudpp); cudppDestroyPlan(gpu->cudpp);
......
...@@ -144,6 +144,8 @@ struct _gpuContext { ...@@ -144,6 +144,8 @@ struct _gpuContext {
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints. CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices. CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
CUDAStream<unsigned int>* psConstraintMatrixColumn; // The column of each element in the constraint matrix.
CUDAStream<float>* psConstraintMatrixValue; // The value of each element in the constraint matrix.
}; };
typedef struct _gpuContext *gpuContext; typedef struct _gpuContext *gpuContext;
......
...@@ -127,7 +127,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) ...@@ -127,7 +127,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z; float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z; float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
float reducedMass = cSim.pShakeReducedMass[pos]; float reducedMass = cSim.pShakeReducedMass[pos];
cSim.pLincsSolution[pos] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f); cSim.pLincsRhs1[pos] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
if (requiredIterations == iteration && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) if (requiredIterations == iteration && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
requiredIterations = iteration+1; requiredIterations = iteration+1;
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
...@@ -138,33 +138,49 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) ...@@ -138,33 +138,49 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
// Multiply by the inverse constraint matrix for each rigid cluster. // Multiply by the inverse constraint matrix for each rigid cluster.
if (cSim.rigidClusters > 0) // if (cSim.rigidClusters > 0)
// {
// pos = threadIdx.x + blockIdx.x * blockDim.x;
// unsigned int block = pos/cSim.clusterShakeBlockSize;
// unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize;
// while (block < cSim.rigidClusters)
// {
// unsigned int firstConstraint = cSim.pRigidClusterConstraintIndex[block];
// unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstConstraint;
// if (indexInBlock < blockSize)
// {
// // Load the constraint forces and matrix.
//
// temp[threadIdx.x] = cSim.pLincsSolution[firstConstraint+indexInBlock];
// unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block];
//
// // Multiply by the matrix.
//
// float sum = 0.0f;
// for (unsigned int i = 0; i < blockSize; i++)
// sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock];
// cSim.pLincsSolution[firstConstraint+indexInBlock] = sum;
// }
// block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize;
// }
// kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration);
// }
pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.lincsConstraints)
{ {
pos = threadIdx.x + blockIdx.x * blockDim.x; float sum = 0.0f;
unsigned int block = pos/cSim.clusterShakeBlockSize; for (unsigned int i = 0; ; i++)
unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize;
while (block < cSim.rigidClusters)
{ {
unsigned int firstConstraint = cSim.pRigidClusterConstraintIndex[block]; int index = pos+i*cSim.lincsConstraints;
unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstConstraint; unsigned int column = cSim.pConstraintMatrixColumn[pos+i*cSim.lincsConstraints];
if (indexInBlock < blockSize) if (column >= cSim.lincsConstraints)
{ break;
// Load the constraint forces and matrix. sum += cSim.pLincsRhs1[column]*cSim.pConstraintMatrixValue[index];
temp[threadIdx.x] = cSim.pLincsSolution[firstConstraint+indexInBlock];
unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block];
// Multiply by the matrix.
float sum = 0.0f;
for (unsigned int i = 0; i < blockSize; i++)
sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock];
cSim.pLincsSolution[firstConstraint+indexInBlock] = sum;
}
block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize;
} }
kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration); cSim.pLincsSolution[pos] = sum;
pos += blockDim.x * gridDim.x;
} }
kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration);
// Update the position of each atom. // Update the position of each atom.
......
...@@ -201,7 +201,7 @@ ReferenceRigidShakeAlgorithm::ReferenceRigidShakeAlgorithm( int numberOfAtoms, ...@@ -201,7 +201,7 @@ ReferenceRigidShakeAlgorithm::ReferenceRigidShakeAlgorithm( int numberOfAtoms,
QUERN_solve_with_r(numberOfConstraints, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]); QUERN_solve_with_r(numberOfConstraints, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numberOfConstraints; j++) { for (int j = 0; j < numberOfConstraints; j++) {
double value = rhs[j]*_distance[i]/_distance[j]; double value = rhs[j]*_distance[i]/_distance[j];
if (abs(value) > 0.01) if (abs(value) > 0.02)
_matrix[j].push_back(pair<int, RealOpenMM>(i, (RealOpenMM) value)); _matrix[j].push_back(pair<int, RealOpenMM>(i, (RealOpenMM) value));
} }
} }
...@@ -600,7 +600,7 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo ...@@ -600,7 +600,7 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
numberConverged++; numberConverged++;
} }
} }
if( numberConverged == _numberOfConstraints ){ if( numberConverged == _numberOfConstraints ){
done = true; done = true;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment