Commit 043c7b6c authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to CUDA C-SHAKE implementation

parent fe1e6ffa
......@@ -389,7 +389,6 @@ struct cudaGmxSimulation {
short* pSyncCounter; // Used for global thread synchronization
unsigned int* pRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
float* pShakeReducedMass; // The reduced mass for each SHAKE constraint
int* pRigidClusterConstraints; // The constraints in each rigid cluster
float* pRigidClusterMatrix; // The inverse constraint matrix for each rigid cluster
unsigned int* pRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
unsigned int* pRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
......
......@@ -464,7 +464,7 @@ static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster
}
}
static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, const vector<int>& constraintIndices)
static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, vector<int>& constraintIndices)
{
vector<map<int, int> > atomConstraints(firstAtom.size());
for (int i = 0; i < (int)constraintIndices.size(); i++) {
......@@ -537,11 +537,27 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
}
}
// Reorder the constraints so those in a cluster are sequential.
vector<int> constraintOrder(constraintIndices.size());
vector<int> clusterStartIndex(rigidClusters.size());
set<int> inCluster;
int nextIndex = 0;
for (int i = 0; i < (int) rigidClusters.size(); ++i) {
clusterStartIndex[i] = nextIndex;
for (int j = 0; j < (int) rigidClusters[i].size(); ++j) {
int constraint = rigidClusters[i][j];
constraintOrder[nextIndex++] = constraint;
inCluster.insert(constraint);
}
}
for (int i = 0; i < (int) constraintIndices.size(); ++i)
if (inCluster.find(constraintIndices[i]) == inCluster.end())
constraintOrder[nextIndex++] = constraintIndices[i];
constraintIndices = constraintOrder;
// Build the CUDA streams.
CUDAStream<int>* psRigidClusterConstraints = new CUDAStream<int>(totalConstraints, 1, "RigidClusterConstraints");
gpu->psRigidClusterConstraints = psRigidClusterConstraints;
gpu->sim.pRigidClusterConstraints = psRigidClusterConstraints->_pDevData;
CUDAStream<unsigned int>* psRigidClusterConstraintIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterConstraintIndex");
gpu->psRigidClusterConstraintIndex = psRigidClusterConstraintIndex;
gpu->sim.pRigidClusterConstraintIndex = psRigidClusterConstraintIndex->_pDevData;
......@@ -556,8 +572,7 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
for (unsigned int i = 0; i < rigidClusters.size(); i++) {
vector<int>& cluster = rigidClusters[i];
(*psRigidClusterConstraintIndex)[i] = constraintIndex;
for (unsigned int j = 0; j < cluster.size(); j++)
(*psRigidClusterConstraints)[constraintIndex++] = cluster[j];
constraintIndex += cluster.size();
if (cluster.size() > maxClusterSize)
maxClusterSize = cluster.size();
}
......@@ -567,9 +582,6 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
gpu->sim.clusterShakeBlockSize = 1;
while (gpu->sim.clusterShakeBlockSize < maxClusterSize)
gpu->sim.clusterShakeBlockSize *= 2;
if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
psRigidClusterConstraints->Upload();
psRigidClusterConstraintIndex->Upload();
gpu->hasInitializedRigidClusters = false;
}
......@@ -776,6 +788,13 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
for (unsigned i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]])
lincsConstraints.push_back(i);
// Identify rigid clusters of atoms.
findRigidClusters(gpu, atom1, atom2, lincsConstraints);
// Record the connections between constraints.
int numLincs = (int) lincsConstraints.size();
vector<vector<int> > atomConstraints(gpu->natoms);
for (int i = 0; i < numLincs; i++) {
......@@ -859,8 +878,10 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
(*psSyncCounter)[i] = -1;
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
(*psLincsNumAtomConstraints)[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++)
(*psLincsAtomConstraints)[i+j*gpu->natoms] = atomConstraints[i][j];
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[lincsConstraints[atomConstraints[i][j]]] == i);
(*psLincsAtomConstraints)[i+j*gpu->natoms] = (forward ? atomConstraints[i][j]+1 : -atomConstraints[i][j]-1);
}
}
psLincsAtoms->Upload();
psLincsDistance->Upload();
......@@ -877,10 +898,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
gpu->sim.lincs_threads_per_block = gpu->sim.threads_per_block;
if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
gpu->sim.lincs_threads_per_block = gpu->sim.blocks;
// Identify rigid clusters of atoms.
findRigidClusters(gpu, atom1, atom2, lincsConstraints);
if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
// count number of atoms w/o constraint
......@@ -1250,7 +1269,6 @@ void* gpuInit(int numAtoms)
gpu->psSyncCounter = NULL;
gpu->psRequiredIterations = NULL;
gpu->psShakeReducedMass = NULL;
gpu->psRigidClusterConstraints = NULL;
gpu->psRigidClusterConstraintIndex = NULL;
gpu->psRigidClusterMatrix = NULL;
gpu->psRigidClusterMatrixIndex = NULL;
......@@ -1406,7 +1424,6 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psSyncCounter;
delete gpu->psRequiredIterations;
delete gpu->psShakeReducedMass;
delete gpu->psRigidClusterConstraints;
delete gpu->psRigidClusterConstraintIndex;
delete gpu->psRigidClusterMatrix;
delete gpu->psRigidClusterMatrixIndex;
......
......@@ -139,7 +139,6 @@ struct _gpuContext {
CUDAStream<short>* psSyncCounter; // Used for global thread synchronization
CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
CUDAStream<float>* psShakeReducedMass; // The reduced mass for each SHAKE constraint
CUDAStream<int>* psRigidClusterConstraints; // The constraints in each rigid cluster
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
......
......@@ -148,14 +148,13 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize;
while (block < cSim.rigidClusters)
{
unsigned int firstIndex = cSim.pRigidClusterConstraintIndex[block];
unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstIndex;
unsigned int firstConstraint = cSim.pRigidClusterConstraintIndex[block];
unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstConstraint;
if (indexInBlock < blockSize)
{
// Load the constraint forces and matrix.
unsigned int constraint = cSim.pRigidClusterConstraints[firstIndex+indexInBlock];
temp[threadIdx.x] = cSim.pLincsSolution[constraint];
temp[threadIdx.x] = cSim.pLincsSolution[firstConstraint+indexInBlock];
unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block];
// Multiply by the matrix.
......@@ -163,7 +162,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
float sum = 0.0f;
for (unsigned int i = 0; i < blockSize; i++)
sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock];
cSim.pLincsSolution[constraint] = sum;
cSim.pLincsSolution[firstConstraint+indexInBlock] = sum;
}
block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize;
}
......@@ -173,6 +172,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
// Update the position of each atom.
pos = threadIdx.x + blockIdx.x * blockDim.x;
float damping = (iteration < 2 ? 0.5f : 1.0f);
while (pos < cSim.atoms)
{
float4 atomPos = atomPositions[pos];
......@@ -182,8 +182,10 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
{
int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index];
float constraintForce = invMass*cSim.pLincsSolution[constraint];
constraintForce = (cSim.pLincsAtoms[constraint].x == pos ? constraintForce : -constraintForce);
bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*cSim.pLincsSolution[constraint];
constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = cSim.pLincsDistance[constraint];
atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y;
......@@ -202,12 +204,14 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
cSim.pSyncCounter[blockIdx.x] = -1;
}
static void initInverseMatrices(gpuContext gpu)
static void initInverseMatrices(gpuContext gpu, bool useNewPositions)
{
// Build the inverse constraint matrix for each cluster.
gpu->psPosq4->Download();
gpu->psVelm4->Download();
if (useNewPositions)
gpu->psPosqP4->Download();
unsigned int elementIndex = 0;
for (unsigned int i = 0; i < gpu->sim.rigidClusters; i++) {
// Compute the constraint coupling matrix for this cluster.
......@@ -217,9 +221,24 @@ static void initInverseMatrices(gpuContext gpu)
unsigned int size = endIndex-startIndex;
vector<float3> r(size);
for (unsigned int j = 0; j < size; j++) {
int2 atoms = (*gpu->psLincsAtoms)[(*gpu->psRigidClusterConstraints)[startIndex+j]];
float4 pos1 = (*gpu->psPosq4)[atoms.x];
float4 pos2 = (*gpu->psPosq4)[atoms.y];
int2 atoms = (*gpu->psLincsAtoms)[startIndex+j];
float4 pos1, pos2;
if (useNewPositions) {
float4 oldpos1 = (*gpu->psPosq4)[atoms.x];
float4 oldpos2 = (*gpu->psPosq4)[atoms.y];
pos1 = (*gpu->psPosqP4)[atoms.x];
pos2 = (*gpu->psPosqP4)[atoms.y];
pos1.x += oldpos1.x;
pos1.y += oldpos1.y;
pos1.z += oldpos1.z;
pos2.x += oldpos2.x;
pos2.y += oldpos2.y;
pos2.z += oldpos2.z;
}
else {
pos1 = (*gpu->psPosq4)[atoms.x];
pos2 = (*gpu->psPosq4)[atoms.y];
}
r[j] = make_float3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
float invLength = 1.0f/sqrt(r[j].x*r[j].x + r[j].y*r[j].y + r[j].z*r[j].z);
r[j].x *= invLength;
......@@ -228,11 +247,9 @@ static void initInverseMatrices(gpuContext gpu)
}
Array2D<double> matrix(size, size);
for (int j = 0; j < (int)size; j++) {
int constraintj = (*gpu->psRigidClusterConstraints)[startIndex+j];
int2 atomsj = (*gpu->psLincsAtoms)[constraintj];
int2 atomsj = (*gpu->psLincsAtoms)[startIndex+j];
for (int k = 0; k < (int)size; k++) {
int constraintk = (*gpu->psRigidClusterConstraints)[startIndex+k];
int2 atomsk = (*gpu->psLincsAtoms)[constraintk];
int2 atomsk = (*gpu->psLincsAtoms)[startIndex+k];
float invMassj0 = (*gpu->psVelm4)[atomsj.x].w;
float invMassj1 = (*gpu->psVelm4)[atomsj.y].w;
double dot = r[j].x*r[k].x + r[j].y*r[k].y + r[j].z*r[k].z;
......@@ -275,10 +292,10 @@ static void initInverseMatrices(gpuContext gpu)
(*gpu->psRigidClusterMatrixIndex)[i] = elementIndex;
for (int j = 0; j < (int)size; j++)
{
float distance1 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+j]].w;
float distance1 = (*gpu->psLincsDistance)[startIndex+j].w;
for (int k = 0; k < (int)size; k++)
{
float distance2 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+k]].w;
float distance2 = (*gpu->psLincsDistance)[startIndex+k].w;
(*gpu->psRigidClusterMatrix)[elementIndex++] = (float)(matrix[k][j]*distance1/distance2);
}
}
......@@ -286,7 +303,6 @@ static void initInverseMatrices(gpuContext gpu)
(*gpu->psRigidClusterMatrixIndex)[gpu->sim.rigidClusters] = elementIndex;
gpu->psRigidClusterMatrix->Upload();
gpu->psRigidClusterMatrixIndex->Upload();
gpu->hasInitializedRigidClusters = true;
}
void kApplyFirstCShake(gpuContext gpu)
......@@ -295,9 +311,20 @@ void kApplyFirstCShake(gpuContext gpu)
if (gpu->sim.lincsConstraints > 0)
{
if (!gpu->hasInitializedRigidClusters)
initInverseMatrices(gpu);
{
// Build preliminary constraint matrices for use on this call.
initInverseMatrices(gpu, false);
}
kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true);
LAUNCHERROR("kApplyCShake");
if (!gpu->hasInitializedRigidClusters)
{
// Rebuild the constraint matrices, now that we know all constraints are really satisfied.
initInverseMatrices(gpu, true);
gpu->hasInitializedRigidClusters = true;
}
}
}
......
......@@ -101,9 +101,11 @@ __global__ void kSolveLincsMatrix_kernel(float4* atomPositions)
{
int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index];
bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1);
float4 dir = cSim.pLincsDistance[constraint];
float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
c = (forward ? -c : c);
atomPos.x += c*dir.x;
atomPos.y += c*dir.y;
atomPos.z += c*dir.z;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment