Commit 95d1c382 authored by Peter Eastman's avatar Peter Eastman
Browse files

Improved LINCS performance by eliminating some uncoalesced memory access

parent 297538f7
...@@ -380,14 +380,14 @@ struct cudaGmxSimulation { ...@@ -380,14 +380,14 @@ struct cudaGmxSimulation {
int2* pLincsAtoms; // The atoms connected by each LINCS constraint int2* pLincsAtoms; // The atoms connected by each LINCS constraint
float4* pLincsDistance; // The displacement vector (x, y, z) and constraint distance (w) for each LINCS constraint float4* pLincsDistance; // The displacement vector (x, y, z) and constraint distance (w) for each LINCS constraint
int* pLincsConnections; // The indices of constraints that other constraints are connected to int* pLincsConnections; // The indices of constraints that other constraints are connected to
int* pLincsConnectionsIndex; // The index in pLincsConnections at which the connections for a particular constraint start int* pLincsNumConnections; // The number of other constraints that each constraint is linked to
float* pLincsS; // S matrix for LINCS float* pLincsS; // S matrix for LINCS
float* pLincsCoupling; // Coupling matrix for LINCS float* pLincsCoupling; // Coupling matrix for LINCS
float* pLincsRhs1; // Workspace for LINCS float* pLincsRhs1; // Workspace for LINCS
float* pLincsRhs2; // Workspace for LINCS float* pLincsRhs2; // Workspace for LINCS
float* pLincsSolution; // Workspace for LINCS float* pLincsSolution; // Workspace for LINCS
int* pLincsAtomConstraints; // The indices of constraints involving each atom int* pLincsAtomConstraints; // The indices of constraints involving each atom
int* pLincsAtomConstraintsIndex; // The index in psLincsAtomConstraints at which the constraints for a particular atom start int* pLincsNumAtomConstraints; // The number of constraints involving each atom
unsigned int* pSyncCounter; // Used for global thread synchronization unsigned int* pSyncCounter; // Used for global thread synchronization
// Mutable stuff // Mutable stuff
......
...@@ -670,12 +670,13 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -670,12 +670,13 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
for (unsigned int i = 0; i < atom1.size(); i++) for (unsigned int i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]]) if (!isShakeAtom[atom1[i]])
lincsConstraints.push_back(i); lincsConstraints.push_back(i);
int numLincs = (int) lincsConstraints.size();
vector<vector<int> > atomConstraints(gpu->natoms); vector<vector<int> > atomConstraints(gpu->natoms);
for (unsigned int i = 0; i < lincsConstraints.size(); i++) { for (int i = 0; i < numLincs; i++) {
atomConstraints[atom1[lincsConstraints[i]]].push_back(i); atomConstraints[atom1[lincsConstraints[i]]].push_back(i);
atomConstraints[atom2[lincsConstraints[i]]].push_back(i); atomConstraints[atom2[lincsConstraints[i]]].push_back(i);
} }
vector<vector<int> > linkedConstraints(lincsConstraints.size()); vector<vector<int> > linkedConstraints(numLincs);
for (unsigned int atom = 0; atom < atomConstraints.size(); atom++) { for (unsigned int atom = 0; atom < atomConstraints.size(); atom++) {
for (unsigned int i = 0; i < atomConstraints[atom].size(); i++) for (unsigned int i = 0; i < atomConstraints[atom].size(); i++)
for (int j = 0; j < i; j++) { for (int j = 0; j < i; j++) {
...@@ -685,77 +686,76 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -685,77 +686,76 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
linkedConstraints[c2].push_back(c1); linkedConstraints[c2].push_back(c1);
} }
} }
int totalLinks = 0; int maxLinks = 0;
for (unsigned int i = 0; i < linkedConstraints.size(); i++) for (unsigned int i = 0; i < linkedConstraints.size(); i++)
totalLinks += linkedConstraints[i].size(); maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
int maxAtomConstraints = 0;
for (unsigned int i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Fill in the CUDA streams. // Fill in the CUDA streams.
CUDAStream<int2>* psLincsAtoms = new CUDAStream<int2>((int) lincsConstraints.size(), 1, "LincsAtoms"); CUDAStream<int2>* psLincsAtoms = new CUDAStream<int2>(numLincs, 1, "LincsAtoms");
gpu->psLincsAtoms = psLincsAtoms; gpu->psLincsAtoms = psLincsAtoms;
gpu->sim.pLincsAtoms = psLincsAtoms->_pDevData; gpu->sim.pLincsAtoms = psLincsAtoms->_pDevData;
CUDAStream<float4>* psLincsDistance = new CUDAStream<float4>((int) lincsConstraints.size(), 1, "LincsDistance"); CUDAStream<float4>* psLincsDistance = new CUDAStream<float4>(numLincs, 1, "LincsDistance");
gpu->psLincsDistance = psLincsDistance; gpu->psLincsDistance = psLincsDistance;
gpu->sim.pLincsDistance = psLincsDistance->_pDevData; gpu->sim.pLincsDistance = psLincsDistance->_pDevData;
CUDAStream<int>* psLincsConnections = new CUDAStream<int>(totalLinks, 1, "LincsConnections"); CUDAStream<int>* psLincsConnections = new CUDAStream<int>(numLincs*maxLinks, 1, "LincsConnections");
gpu->psLincsConnections = psLincsConnections; gpu->psLincsConnections = psLincsConnections;
gpu->sim.pLincsConnections = psLincsConnections->_pDevData; gpu->sim.pLincsConnections = psLincsConnections->_pDevData;
CUDAStream<int>* psLincsConnectionsIndex = new CUDAStream<int>((int) lincsConstraints.size()+1, 1, "LincsConnectionsIndex"); CUDAStream<int>* psLincsNumConnections = new CUDAStream<int>(numLincs, 1, "LincsConnectionsIndex");
gpu->psLincsConnectionsIndex = psLincsConnectionsIndex; gpu->psLincsNumConnections = psLincsNumConnections;
gpu->sim.pLincsConnectionsIndex = psLincsConnectionsIndex->_pDevData; gpu->sim.pLincsNumConnections = psLincsNumConnections->_pDevData;
CUDAStream<int>* psLincsAtomConstraints = new CUDAStream<int>((int) lincsConstraints.size()*2, 1, "LincsAtomConstraints"); CUDAStream<int>* psLincsAtomConstraints = new CUDAStream<int>(gpu->natoms*maxAtomConstraints, 1, "LincsAtomConstraints");
gpu->psLincsAtomConstraints = psLincsAtomConstraints; gpu->psLincsAtomConstraints = psLincsAtomConstraints;
gpu->sim.pLincsAtomConstraints = psLincsAtomConstraints->_pDevData; gpu->sim.pLincsAtomConstraints = psLincsAtomConstraints->_pDevData;
CUDAStream<int>* psLincsAtomConstraintsIndex = new CUDAStream<int>(gpu->natoms+1, 1, "LincsAtomConstraintsIndex"); CUDAStream<int>* psLincsNumAtomConstraints = new CUDAStream<int>(gpu->natoms, 1, "LincsAtomConstraintsIndex");
gpu->psLincsAtomConstraintsIndex = psLincsAtomConstraintsIndex; gpu->psLincsNumAtomConstraints = psLincsNumAtomConstraints;
gpu->sim.pLincsAtomConstraintsIndex = psLincsAtomConstraintsIndex->_pDevData; gpu->sim.pLincsNumAtomConstraints = psLincsNumAtomConstraints->_pDevData;
CUDAStream<float>* psLincsS = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsS"); CUDAStream<float>* psLincsS = new CUDAStream<float>(numLincs, 1, "LincsS");
gpu->psLincsS = psLincsS; gpu->psLincsS = psLincsS;
gpu->sim.pLincsS = psLincsS->_pDevData; gpu->sim.pLincsS = psLincsS->_pDevData;
CUDAStream<float>* psLincsCoupling = new CUDAStream<float>(totalLinks, 1, "LincsCoupling"); CUDAStream<float>* psLincsCoupling = new CUDAStream<float>(numLincs*maxLinks, 1, "LincsCoupling");
gpu->psLincsCoupling = psLincsCoupling; gpu->psLincsCoupling = psLincsCoupling;
gpu->sim.pLincsCoupling = psLincsCoupling->_pDevData; gpu->sim.pLincsCoupling = psLincsCoupling->_pDevData;
CUDAStream<float>* psLincsRhs1 = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsRhs1"); CUDAStream<float>* psLincsRhs1 = new CUDAStream<float>(numLincs, 1, "LincsRhs1");
gpu->psLincsRhs1 = psLincsRhs1; gpu->psLincsRhs1 = psLincsRhs1;
gpu->sim.pLincsRhs1 = psLincsRhs1->_pDevData; gpu->sim.pLincsRhs1 = psLincsRhs1->_pDevData;
CUDAStream<float>* psLincsRhs2 = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsRhs2"); CUDAStream<float>* psLincsRhs2 = new CUDAStream<float>(numLincs, 1, "LincsRhs2");
gpu->psLincsRhs2 = psLincsRhs2; gpu->psLincsRhs2 = psLincsRhs2;
gpu->sim.pLincsRhs2 = psLincsRhs2->_pDevData; gpu->sim.pLincsRhs2 = psLincsRhs2->_pDevData;
CUDAStream<float>* psLincsSolution = new CUDAStream<float>((int) lincsConstraints.size(), 1, "LincsSolution"); CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution");
gpu->psLincsSolution = psLincsSolution; gpu->psLincsSolution = psLincsSolution;
gpu->sim.pLincsSolution = psLincsSolution->_pDevData; gpu->sim.pLincsSolution = psLincsSolution->_pDevData;
CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1, "SyncCounter"); CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1, "SyncCounter");
gpu->psSyncCounter = psSyncCounter; gpu->psSyncCounter = psSyncCounter;
gpu->sim.pSyncCounter = psSyncCounter->_pDevData; gpu->sim.pSyncCounter = psSyncCounter->_pDevData;
gpu->sim.lincsConstraints = lincsConstraints.size(); gpu->sim.lincsConstraints = numLincs;
index = 0; for (int i = 0; i < numLincs; i++) {
for (unsigned int i = 0; i < lincsConstraints.size(); i++) {
int c = lincsConstraints[i]; int c = lincsConstraints[i];
(*psLincsAtoms)[i].x = atom1[c]; (*psLincsAtoms)[i].x = atom1[c];
(*psLincsAtoms)[i].y = atom2[c]; (*psLincsAtoms)[i].y = atom2[c];
(*psLincsDistance)[i].w = distance[c]; (*psLincsDistance)[i].w = distance[c];
(*psLincsS)[i] = 1.0f/sqrt(invMass1[c]+invMass2[c]); (*psLincsS)[i] = 1.0f/sqrt(invMass1[c]+invMass2[c]);
(*psLincsConnectionsIndex)[i] = index; (*psLincsNumConnections)[i] = linkedConstraints[i].size();
for (unsigned int j = 0; j < linkedConstraints[i].size(); j++) for (unsigned int j = 0; j < linkedConstraints[i].size(); j++)
(*psLincsConnections)[index++] = linkedConstraints[i][j]; (*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j];
} }
(*psLincsConnectionsIndex)[lincsConstraints.size()] = index;
for (unsigned int i = 0; i < psSyncCounter->_length; i++) for (unsigned int i = 0; i < psSyncCounter->_length; i++)
(*psSyncCounter)[i] = 0; (*psSyncCounter)[i] = 0;
index = 0;
for (unsigned int i = 0; i < atomConstraints.size(); i++) { for (unsigned int i = 0; i < atomConstraints.size(); i++) {
(*psLincsAtomConstraintsIndex)[i] = index; (*psLincsNumAtomConstraints)[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) for (unsigned int j = 0; j < atomConstraints[i].size(); j++)
(*psLincsAtomConstraints)[index++] = atomConstraints[i][j]; (*psLincsAtomConstraints)[i+j*gpu->natoms] = atomConstraints[i][j];
} }
(*psLincsAtomConstraintsIndex)[atomConstraints.size()] = index;
psLincsAtoms->Upload(); psLincsAtoms->Upload();
psLincsDistance->Upload(); psLincsDistance->Upload();
psLincsS->Upload(); psLincsS->Upload();
psLincsConnections->Upload(); psLincsConnections->Upload();
psLincsConnectionsIndex->Upload(); psLincsNumConnections->Upload();
psLincsAtomConstraints->Upload(); psLincsAtomConstraints->Upload();
psLincsAtomConstraintsIndex->Upload(); psLincsNumAtomConstraints->Upload();
psSyncCounter->Upload(); psSyncCounter->Upload();
gpu->sim.lincsTerms = lincsTerms; gpu->sim.lincsTerms = lincsTerms;
gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks; gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
...@@ -765,7 +765,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -765,7 +765,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
gpu->sim.lincs_threads_per_block = 1; gpu->sim.lincs_threads_per_block = 1;
gpu->psLincsConnectionsIndex->Download(); gpu->psLincsNumConnections->Download();
gpu->psLincsConnections->Download(); gpu->psLincsConnections->Download();
gpu->psLincsCoupling->Download(); gpu->psLincsCoupling->Download();
...@@ -1131,9 +1131,9 @@ void* gpuInit(int numAtoms) ...@@ -1131,9 +1131,9 @@ void* gpuInit(int numAtoms)
gpu->psLincsAtoms = NULL; gpu->psLincsAtoms = NULL;
gpu->psLincsDistance = NULL; gpu->psLincsDistance = NULL;
gpu->psLincsConnections = NULL; gpu->psLincsConnections = NULL;
gpu->psLincsConnectionsIndex = NULL; gpu->psLincsNumConnections = NULL;
gpu->psLincsAtomConstraints = NULL; gpu->psLincsAtomConstraints = NULL;
gpu->psLincsAtomConstraintsIndex= NULL; gpu->psLincsNumAtomConstraints = NULL;
gpu->psLincsS = NULL; gpu->psLincsS = NULL;
gpu->psLincsCoupling = NULL; gpu->psLincsCoupling = NULL;
gpu->psLincsRhs1 = NULL; gpu->psLincsRhs1 = NULL;
...@@ -1281,9 +1281,9 @@ void gpuShutDown(gpuContext gpu) ...@@ -1281,9 +1281,9 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psLincsAtoms; delete gpu->psLincsAtoms;
delete gpu->psLincsDistance; delete gpu->psLincsDistance;
delete gpu->psLincsConnections; delete gpu->psLincsConnections;
delete gpu->psLincsConnectionsIndex; delete gpu->psLincsNumConnections;
delete gpu->psLincsAtomConstraints; delete gpu->psLincsAtomConstraints;
delete gpu->psLincsAtomConstraintsIndex; delete gpu->psLincsNumAtomConstraints;
delete gpu->psLincsS; delete gpu->psLincsS;
delete gpu->psLincsCoupling; delete gpu->psLincsCoupling;
delete gpu->psLincsRhs1; delete gpu->psLincsRhs1;
......
...@@ -132,9 +132,9 @@ struct _gpuContext { ...@@ -132,9 +132,9 @@ struct _gpuContext {
CUDAStream<int2>* psLincsAtoms; // The atoms connected by each LINCS constraint CUDAStream<int2>* psLincsAtoms; // The atoms connected by each LINCS constraint
CUDAStream<float4>* psLincsDistance; // The displacement vector (x, y, z) and constraint distance (w) for each LINCS constraint CUDAStream<float4>* psLincsDistance; // The displacement vector (x, y, z) and constraint distance (w) for each LINCS constraint
CUDAStream<int>* psLincsConnections; // The indices of constraints that other constraints are connected to CUDAStream<int>* psLincsConnections; // The indices of constraints that other constraints are connected to
CUDAStream<int>* psLincsConnectionsIndex; // The index in psLincsConnections at which the connections for a particular constraint start CUDAStream<int>* psLincsNumConnections; // The number of other constraints that each constraint is linked to
CUDAStream<int>* psLincsAtomConstraints; // The indices of constraints involving each atom CUDAStream<int>* psLincsAtomConstraints; // The indices of constraints involving each atom
CUDAStream<int>* psLincsAtomConstraintsIndex; // The index in psLincsAtomConstraints at which the constraints for a particular atom start CUDAStream<int>* psLincsNumAtomConstraints; // The number of constraints involving each atom
CUDAStream<float>* psLincsS; // S matrix for LINCS CUDAStream<float>* psLincsS; // S matrix for LINCS
CUDAStream<float>* psLincsCoupling; // Coupling matrix for LINCS CUDAStream<float>* psLincsCoupling; // Coupling matrix for LINCS
CUDAStream<float>* psLincsRhs1; // Workspace for LINCS CUDAStream<float>* psLincsRhs1; // Workspace for LINCS
......
...@@ -61,11 +61,11 @@ __global__ void kUpdateAtomPositions_kernel(float4* atomPositions) ...@@ -61,11 +61,11 @@ __global__ void kUpdateAtomPositions_kernel(float4* atomPositions)
{ {
float4 atomPos = atomPositions[pos]; float4 atomPos = atomPositions[pos];
float invMass = cSim.pVelm4[pos].w; float invMass = cSim.pVelm4[pos].w;
int start = cSim.pLincsAtomConstraintsIndex[pos]; int num = cSim.pLincsNumAtomConstraints[pos];
int end = cSim.pLincsAtomConstraintsIndex[pos+1]; for (int i = 0; i < num; i++)
for (int i = start; i < end; i++)
{ {
int constraint = cSim.pLincsAtomConstraints[i]; int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index];
float4 dir = cSim.pLincsDistance[constraint]; float4 dir = cSim.pLincsDistance[constraint];
float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint]; float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c); c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
...@@ -88,12 +88,12 @@ __global__ void kIterateLincsMatrix_kernel(int iteration) ...@@ -88,12 +88,12 @@ __global__ void kIterateLincsMatrix_kernel(int iteration)
while (pos < cSim.lincsConstraints) while (pos < cSim.lincsConstraints)
{ {
float rhs = 0.0f; float rhs = 0.0f;
int start = cSim.pLincsConnectionsIndex[pos]; int num = cSim.pLincsNumConnections[pos];
int end = cSim.pLincsConnectionsIndex[pos+1]; for (int i = 0; i < num; i++)
for (int i = start; i < end; i++)
{ {
int otherConstraint = cSim.pLincsConnections[i]; int index = pos+i*cSim.lincsConstraints;
rhs += cSim.pLincsCoupling[i]*rhs1[otherConstraint]; int otherConstraint = cSim.pLincsConnections[index];
rhs += cSim.pLincsCoupling[index]*rhs1[otherConstraint];
} }
rhs2[pos] = rhs; rhs2[pos] = rhs;
cSim.pLincsSolution[pos] += rhs; cSim.pLincsSolution[pos] += rhs;
...@@ -147,17 +147,17 @@ __global__ void kApplyLincsPart2_kernel() ...@@ -147,17 +147,17 @@ __global__ void kApplyLincsPart2_kernel()
{ {
float4 dir1 = cSim.pLincsDistance[pos]; float4 dir1 = cSim.pLincsDistance[pos];
int2 atoms1 = cSim.pLincsAtoms[pos]; int2 atoms1 = cSim.pLincsAtoms[pos];
int start = cSim.pLincsConnectionsIndex[pos]; int num = cSim.pLincsNumConnections[pos];
int end = cSim.pLincsConnectionsIndex[pos+1];
float s = cSim.pLincsS[pos]; float s = cSim.pLincsS[pos];
float invMass = cSim.pVelm4[atoms1.x].w; float invMass = cSim.pVelm4[atoms1.x].w;
for (int i = start; i < end; i++) for (int i = 0; i < num; i++)
{ {
int otherConstraint = cSim.pLincsConnections[i]; int index = pos+i*cSim.lincsConstraints;
int otherConstraint = cSim.pLincsConnections[index];
float4 dir2 = cSim.pLincsDistance[otherConstraint]; float4 dir2 = cSim.pLincsDistance[otherConstraint];
int2 atoms2 = cSim.pLincsAtoms[otherConstraint]; int2 atoms2 = cSim.pLincsAtoms[otherConstraint];
float signedMass = (atoms1.x == atoms2.x || atoms1.y == atoms2.y ? -invMass : cSim.pVelm4[atoms1.y].w); float signedMass = (atoms1.x == atoms2.x || atoms1.y == atoms2.y ? -invMass : cSim.pVelm4[atoms1.y].w);
cSim.pLincsCoupling[i] = signedMass*s*(dir1.x*dir2.x+dir1.y*dir2.y+dir1.z*dir2.z)*cSim.pLincsS[otherConstraint]; cSim.pLincsCoupling[index] = signedMass*s*(dir1.x*dir2.x+dir1.y*dir2.y+dir1.z*dir2.z)*cSim.pLincsS[otherConstraint];
} }
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment