Commit 043c7b6c authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to CUDA C-SHAKE implementation

parent fe1e6ffa
...@@ -389,7 +389,6 @@ struct cudaGmxSimulation { ...@@ -389,7 +389,6 @@ struct cudaGmxSimulation {
short* pSyncCounter; // Used for global thread synchronization short* pSyncCounter; // Used for global thread synchronization
unsigned int* pRequiredIterations; // Used by SHAKE to communicate whether iteration has converged unsigned int* pRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
float* pShakeReducedMass; // The reduced mass for each SHAKE constraint float* pShakeReducedMass; // The reduced mass for each SHAKE constraint
int* pRigidClusterConstraints; // The constraints in each rigid cluster
float* pRigidClusterMatrix; // The inverse constraint matrix for each rigid cluster float* pRigidClusterMatrix; // The inverse constraint matrix for each rigid cluster
unsigned int* pRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints. unsigned int* pRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
unsigned int* pRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices. unsigned int* pRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
......
...@@ -464,7 +464,7 @@ static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster ...@@ -464,7 +464,7 @@ static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster
} }
} }
static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, const vector<int>& constraintIndices) static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, vector<int>& constraintIndices)
{ {
vector<map<int, int> > atomConstraints(firstAtom.size()); vector<map<int, int> > atomConstraints(firstAtom.size());
for (int i = 0; i < (int)constraintIndices.size(); i++) { for (int i = 0; i < (int)constraintIndices.size(); i++) {
...@@ -537,11 +537,27 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons ...@@ -537,11 +537,27 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
} }
} }
// Reorder the constraints so those in a cluster are sequential.
vector<int> constraintOrder(constraintIndices.size());
vector<int> clusterStartIndex(rigidClusters.size());
set<int> inCluster;
int nextIndex = 0;
for (int i = 0; i < (int) rigidClusters.size(); ++i) {
clusterStartIndex[i] = nextIndex;
for (int j = 0; j < (int) rigidClusters[i].size(); ++j) {
int constraint = rigidClusters[i][j];
constraintOrder[nextIndex++] = constraint;
inCluster.insert(constraint);
}
}
for (int i = 0; i < (int) constraintIndices.size(); ++i)
if (inCluster.find(constraintIndices[i]) == inCluster.end())
constraintOrder[nextIndex++] = constraintIndices[i];
constraintIndices = constraintOrder;
// Build the CUDA streams. // Build the CUDA streams.
CUDAStream<int>* psRigidClusterConstraints = new CUDAStream<int>(totalConstraints, 1, "RigidClusterConstraints");
gpu->psRigidClusterConstraints = psRigidClusterConstraints;
gpu->sim.pRigidClusterConstraints = psRigidClusterConstraints->_pDevData;
CUDAStream<unsigned int>* psRigidClusterConstraintIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterConstraintIndex"); CUDAStream<unsigned int>* psRigidClusterConstraintIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterConstraintIndex");
gpu->psRigidClusterConstraintIndex = psRigidClusterConstraintIndex; gpu->psRigidClusterConstraintIndex = psRigidClusterConstraintIndex;
gpu->sim.pRigidClusterConstraintIndex = psRigidClusterConstraintIndex->_pDevData; gpu->sim.pRigidClusterConstraintIndex = psRigidClusterConstraintIndex->_pDevData;
...@@ -556,8 +572,7 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons ...@@ -556,8 +572,7 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
for (unsigned int i = 0; i < rigidClusters.size(); i++) { for (unsigned int i = 0; i < rigidClusters.size(); i++) {
vector<int>& cluster = rigidClusters[i]; vector<int>& cluster = rigidClusters[i];
(*psRigidClusterConstraintIndex)[i] = constraintIndex; (*psRigidClusterConstraintIndex)[i] = constraintIndex;
for (unsigned int j = 0; j < cluster.size(); j++) constraintIndex += cluster.size();
(*psRigidClusterConstraints)[constraintIndex++] = cluster[j];
if (cluster.size() > maxClusterSize) if (cluster.size() > maxClusterSize)
maxClusterSize = cluster.size(); maxClusterSize = cluster.size();
} }
...@@ -567,9 +582,6 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons ...@@ -567,9 +582,6 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
gpu->sim.clusterShakeBlockSize = 1; gpu->sim.clusterShakeBlockSize = 1;
while (gpu->sim.clusterShakeBlockSize < maxClusterSize) while (gpu->sim.clusterShakeBlockSize < maxClusterSize)
gpu->sim.clusterShakeBlockSize *= 2; gpu->sim.clusterShakeBlockSize *= 2;
if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
psRigidClusterConstraints->Upload();
psRigidClusterConstraintIndex->Upload(); psRigidClusterConstraintIndex->Upload();
gpu->hasInitializedRigidClusters = false; gpu->hasInitializedRigidClusters = false;
} }
...@@ -776,6 +788,13 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -776,6 +788,13 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
for (unsigned i = 0; i < atom1.size(); i++) for (unsigned i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]]) if (!isShakeAtom[atom1[i]])
lincsConstraints.push_back(i); lincsConstraints.push_back(i);
// Identify rigid clusters of atoms.
findRigidClusters(gpu, atom1, atom2, lincsConstraints);
// Record the connections between constraints.
int numLincs = (int) lincsConstraints.size(); int numLincs = (int) lincsConstraints.size();
vector<vector<int> > atomConstraints(gpu->natoms); vector<vector<int> > atomConstraints(gpu->natoms);
for (int i = 0; i < numLincs; i++) { for (int i = 0; i < numLincs; i++) {
...@@ -859,8 +878,10 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -859,8 +878,10 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
(*psSyncCounter)[i] = -1; (*psSyncCounter)[i] = -1;
for (unsigned int i = 0; i < atomConstraints.size(); i++) { for (unsigned int i = 0; i < atomConstraints.size(); i++) {
(*psLincsNumAtomConstraints)[i] = atomConstraints[i].size(); (*psLincsNumAtomConstraints)[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
(*psLincsAtomConstraints)[i+j*gpu->natoms] = atomConstraints[i][j]; bool forward = (atom1[lincsConstraints[atomConstraints[i][j]]] == i);
(*psLincsAtomConstraints)[i+j*gpu->natoms] = (forward ? atomConstraints[i][j]+1 : -atomConstraints[i][j]-1);
}
} }
psLincsAtoms->Upload(); psLincsAtoms->Upload();
psLincsDistance->Upload(); psLincsDistance->Upload();
...@@ -877,10 +898,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -877,10 +898,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
gpu->sim.lincs_threads_per_block = gpu->sim.threads_per_block; gpu->sim.lincs_threads_per_block = gpu->sim.threads_per_block;
if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks) if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
gpu->sim.lincs_threads_per_block = gpu->sim.blocks; gpu->sim.lincs_threads_per_block = gpu->sim.blocks;
if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
// Identify rigid clusters of atoms. gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
findRigidClusters(gpu, atom1, atom2, lincsConstraints);
// count number of atoms w/o constraint // count number of atoms w/o constraint
...@@ -1250,7 +1269,6 @@ void* gpuInit(int numAtoms) ...@@ -1250,7 +1269,6 @@ void* gpuInit(int numAtoms)
gpu->psSyncCounter = NULL; gpu->psSyncCounter = NULL;
gpu->psRequiredIterations = NULL; gpu->psRequiredIterations = NULL;
gpu->psShakeReducedMass = NULL; gpu->psShakeReducedMass = NULL;
gpu->psRigidClusterConstraints = NULL;
gpu->psRigidClusterConstraintIndex = NULL; gpu->psRigidClusterConstraintIndex = NULL;
gpu->psRigidClusterMatrix = NULL; gpu->psRigidClusterMatrix = NULL;
gpu->psRigidClusterMatrixIndex = NULL; gpu->psRigidClusterMatrixIndex = NULL;
...@@ -1406,7 +1424,6 @@ void gpuShutDown(gpuContext gpu) ...@@ -1406,7 +1424,6 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psSyncCounter; delete gpu->psSyncCounter;
delete gpu->psRequiredIterations; delete gpu->psRequiredIterations;
delete gpu->psShakeReducedMass; delete gpu->psShakeReducedMass;
delete gpu->psRigidClusterConstraints;
delete gpu->psRigidClusterConstraintIndex; delete gpu->psRigidClusterConstraintIndex;
delete gpu->psRigidClusterMatrix; delete gpu->psRigidClusterMatrix;
delete gpu->psRigidClusterMatrixIndex; delete gpu->psRigidClusterMatrixIndex;
......
...@@ -139,7 +139,6 @@ struct _gpuContext { ...@@ -139,7 +139,6 @@ struct _gpuContext {
CUDAStream<short>* psSyncCounter; // Used for global thread synchronization CUDAStream<short>* psSyncCounter; // Used for global thread synchronization
CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
CUDAStream<float>* psShakeReducedMass; // The reduced mass for each SHAKE constraint CUDAStream<float>* psShakeReducedMass; // The reduced mass for each SHAKE constraint
CUDAStream<int>* psRigidClusterConstraints; // The constraints in each rigid cluster
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints. CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices. CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
......
...@@ -148,14 +148,13 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) ...@@ -148,14 +148,13 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize; unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize;
while (block < cSim.rigidClusters) while (block < cSim.rigidClusters)
{ {
unsigned int firstIndex = cSim.pRigidClusterConstraintIndex[block]; unsigned int firstConstraint = cSim.pRigidClusterConstraintIndex[block];
unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstIndex; unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstConstraint;
if (indexInBlock < blockSize) if (indexInBlock < blockSize)
{ {
// Load the constraint forces and matrix. // Load the constraint forces and matrix.
unsigned int constraint = cSim.pRigidClusterConstraints[firstIndex+indexInBlock]; temp[threadIdx.x] = cSim.pLincsSolution[firstConstraint+indexInBlock];
temp[threadIdx.x] = cSim.pLincsSolution[constraint];
unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block]; unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block];
// Multiply by the matrix. // Multiply by the matrix.
...@@ -163,7 +162,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) ...@@ -163,7 +162,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
float sum = 0.0f; float sum = 0.0f;
for (unsigned int i = 0; i < blockSize; i++) for (unsigned int i = 0; i < blockSize; i++)
sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock]; sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock];
cSim.pLincsSolution[constraint] = sum; cSim.pLincsSolution[firstConstraint+indexInBlock] = sum;
} }
block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize; block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize;
} }
...@@ -173,6 +172,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) ...@@ -173,6 +172,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
// Update the position of each atom. // Update the position of each atom.
pos = threadIdx.x + blockIdx.x * blockDim.x; pos = threadIdx.x + blockIdx.x * blockDim.x;
float damping = (iteration < 2 ? 0.5f : 1.0f);
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
float4 atomPos = atomPositions[pos]; float4 atomPos = atomPositions[pos];
...@@ -182,8 +182,10 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) ...@@ -182,8 +182,10 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
{ {
int index = pos+i*cSim.atoms; int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index]; int constraint = cSim.pLincsAtomConstraints[index];
float constraintForce = invMass*cSim.pLincsSolution[constraint]; bool forward = (constraint > 0);
constraintForce = (cSim.pLincsAtoms[constraint].x == pos ? constraintForce : -constraintForce); constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*cSim.pLincsSolution[constraint];
constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = cSim.pLincsDistance[constraint]; float4 dir = cSim.pLincsDistance[constraint];
atomPos.x += constraintForce*dir.x; atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y; atomPos.y += constraintForce*dir.y;
...@@ -202,12 +204,14 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition) ...@@ -202,12 +204,14 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
cSim.pSyncCounter[blockIdx.x] = -1; cSim.pSyncCounter[blockIdx.x] = -1;
} }
static void initInverseMatrices(gpuContext gpu) static void initInverseMatrices(gpuContext gpu, bool useNewPositions)
{ {
// Build the inverse constraint matrix for each cluster. // Build the inverse constraint matrix for each cluster.
gpu->psPosq4->Download(); gpu->psPosq4->Download();
gpu->psVelm4->Download(); gpu->psVelm4->Download();
if (useNewPositions)
gpu->psPosqP4->Download();
unsigned int elementIndex = 0; unsigned int elementIndex = 0;
for (unsigned int i = 0; i < gpu->sim.rigidClusters; i++) { for (unsigned int i = 0; i < gpu->sim.rigidClusters; i++) {
// Compute the constraint coupling matrix for this cluster. // Compute the constraint coupling matrix for this cluster.
...@@ -217,9 +221,24 @@ static void initInverseMatrices(gpuContext gpu) ...@@ -217,9 +221,24 @@ static void initInverseMatrices(gpuContext gpu)
unsigned int size = endIndex-startIndex; unsigned int size = endIndex-startIndex;
vector<float3> r(size); vector<float3> r(size);
for (unsigned int j = 0; j < size; j++) { for (unsigned int j = 0; j < size; j++) {
int2 atoms = (*gpu->psLincsAtoms)[(*gpu->psRigidClusterConstraints)[startIndex+j]]; int2 atoms = (*gpu->psLincsAtoms)[startIndex+j];
float4 pos1 = (*gpu->psPosq4)[atoms.x]; float4 pos1, pos2;
float4 pos2 = (*gpu->psPosq4)[atoms.y]; if (useNewPositions) {
float4 oldpos1 = (*gpu->psPosq4)[atoms.x];
float4 oldpos2 = (*gpu->psPosq4)[atoms.y];
pos1 = (*gpu->psPosqP4)[atoms.x];
pos2 = (*gpu->psPosqP4)[atoms.y];
pos1.x += oldpos1.x;
pos1.y += oldpos1.y;
pos1.z += oldpos1.z;
pos2.x += oldpos2.x;
pos2.y += oldpos2.y;
pos2.z += oldpos2.z;
}
else {
pos1 = (*gpu->psPosq4)[atoms.x];
pos2 = (*gpu->psPosq4)[atoms.y];
}
r[j] = make_float3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z); r[j] = make_float3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
float invLength = 1.0f/sqrt(r[j].x*r[j].x + r[j].y*r[j].y + r[j].z*r[j].z); float invLength = 1.0f/sqrt(r[j].x*r[j].x + r[j].y*r[j].y + r[j].z*r[j].z);
r[j].x *= invLength; r[j].x *= invLength;
...@@ -228,11 +247,9 @@ static void initInverseMatrices(gpuContext gpu) ...@@ -228,11 +247,9 @@ static void initInverseMatrices(gpuContext gpu)
} }
Array2D<double> matrix(size, size); Array2D<double> matrix(size, size);
for (int j = 0; j < (int)size; j++) { for (int j = 0; j < (int)size; j++) {
int constraintj = (*gpu->psRigidClusterConstraints)[startIndex+j]; int2 atomsj = (*gpu->psLincsAtoms)[startIndex+j];
int2 atomsj = (*gpu->psLincsAtoms)[constraintj];
for (int k = 0; k < (int)size; k++) { for (int k = 0; k < (int)size; k++) {
int constraintk = (*gpu->psRigidClusterConstraints)[startIndex+k]; int2 atomsk = (*gpu->psLincsAtoms)[startIndex+k];
int2 atomsk = (*gpu->psLincsAtoms)[constraintk];
float invMassj0 = (*gpu->psVelm4)[atomsj.x].w; float invMassj0 = (*gpu->psVelm4)[atomsj.x].w;
float invMassj1 = (*gpu->psVelm4)[atomsj.y].w; float invMassj1 = (*gpu->psVelm4)[atomsj.y].w;
double dot = r[j].x*r[k].x + r[j].y*r[k].y + r[j].z*r[k].z; double dot = r[j].x*r[k].x + r[j].y*r[k].y + r[j].z*r[k].z;
...@@ -275,10 +292,10 @@ static void initInverseMatrices(gpuContext gpu) ...@@ -275,10 +292,10 @@ static void initInverseMatrices(gpuContext gpu)
(*gpu->psRigidClusterMatrixIndex)[i] = elementIndex; (*gpu->psRigidClusterMatrixIndex)[i] = elementIndex;
for (int j = 0; j < (int)size; j++) for (int j = 0; j < (int)size; j++)
{ {
float distance1 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+j]].w; float distance1 = (*gpu->psLincsDistance)[startIndex+j].w;
for (int k = 0; k < (int)size; k++) for (int k = 0; k < (int)size; k++)
{ {
float distance2 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+k]].w; float distance2 = (*gpu->psLincsDistance)[startIndex+k].w;
(*gpu->psRigidClusterMatrix)[elementIndex++] = (float)(matrix[k][j]*distance1/distance2); (*gpu->psRigidClusterMatrix)[elementIndex++] = (float)(matrix[k][j]*distance1/distance2);
} }
} }
...@@ -286,7 +303,6 @@ static void initInverseMatrices(gpuContext gpu) ...@@ -286,7 +303,6 @@ static void initInverseMatrices(gpuContext gpu)
(*gpu->psRigidClusterMatrixIndex)[gpu->sim.rigidClusters] = elementIndex; (*gpu->psRigidClusterMatrixIndex)[gpu->sim.rigidClusters] = elementIndex;
gpu->psRigidClusterMatrix->Upload(); gpu->psRigidClusterMatrix->Upload();
gpu->psRigidClusterMatrixIndex->Upload(); gpu->psRigidClusterMatrixIndex->Upload();
gpu->hasInitializedRigidClusters = true;
} }
void kApplyFirstCShake(gpuContext gpu) void kApplyFirstCShake(gpuContext gpu)
...@@ -295,9 +311,20 @@ void kApplyFirstCShake(gpuContext gpu) ...@@ -295,9 +311,20 @@ void kApplyFirstCShake(gpuContext gpu)
if (gpu->sim.lincsConstraints > 0) if (gpu->sim.lincsConstraints > 0)
{ {
if (!gpu->hasInitializedRigidClusters) if (!gpu->hasInitializedRigidClusters)
initInverseMatrices(gpu); {
// Build preliminary constraint matrices for use on this call.
initInverseMatrices(gpu, false);
}
kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true); kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true);
LAUNCHERROR("kApplyCShake"); LAUNCHERROR("kApplyCShake");
if (!gpu->hasInitializedRigidClusters)
{
// Rebuild the constraint matrices, now that we know all constraints are really satisfied.
initInverseMatrices(gpu, true);
gpu->hasInitializedRigidClusters = true;
}
} }
} }
......
...@@ -101,9 +101,11 @@ __global__ void kSolveLincsMatrix_kernel(float4* atomPositions) ...@@ -101,9 +101,11 @@ __global__ void kSolveLincsMatrix_kernel(float4* atomPositions)
{ {
int index = pos+i*cSim.atoms; int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index]; int constraint = cSim.pLincsAtomConstraints[index];
bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1);
float4 dir = cSim.pLincsDistance[constraint]; float4 dir = cSim.pLincsDistance[constraint];
float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint]; float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c); c = (forward ? -c : c);
atomPos.x += c*dir.x; atomPos.x += c*dir.x;
atomPos.y += c*dir.y; atomPos.y += c*dir.y;
atomPos.z += c*dir.z; atomPos.z += c*dir.z;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment