Commit afd06645 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimized LINCS by merging kernels and implementing a syncAllThreads() function.

parent bf7a968c
...@@ -117,7 +117,7 @@ ELSE(CUDA_BUILD_TYPE MATCHES "Emulation") ...@@ -117,7 +117,7 @@ ELSE(CUDA_BUILD_TYPE MATCHES "Emulation")
ENDIF(CUDA_BUILD_TYPE MATCHES "Emulation") ENDIF(CUDA_BUILD_TYPE MATCHES "Emulation")
SET(CUDA_BUILD_CUBIN TRUE CACHE BOOL "Generate and parse .cubin files in Device mode.") SET(CUDA_BUILD_CUBIN TRUE CACHE BOOL "Generate and parse .cubin files in Device mode.")
SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0;-arch=sm_11" CACHE STRING "Semi-colon delimit multiple arguments.") SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0" CACHE STRING "Semi-colon delimit multiple arguments.")
# Search for the cuda distribution. # Search for the cuda distribution.
IF(NOT CUDA_INSTALL_PREFIX) IF(NOT CUDA_INSTALL_PREFIX)
......
...@@ -388,7 +388,7 @@ struct cudaGmxSimulation { ...@@ -388,7 +388,7 @@ struct cudaGmxSimulation {
float* pLincsSolution; // Workspace for LINCS float* pLincsSolution; // Workspace for LINCS
int* pLincsAtomConstraints; // The indices of constraints involving each atom int* pLincsAtomConstraints; // The indices of constraints involving each atom
int* pLincsNumAtomConstraints; // The number of constraints involving each atom int* pLincsNumAtomConstraints; // The number of constraints involving each atom
unsigned int* pSyncCounter; // Used for global thread synchronization short* pSyncCounter; // Used for global thread synchronization
// Mutable stuff // Mutable stuff
float4* pPosq; // Pointer to atom positions and charges float4* pPosq; // Pointer to atom positions and charges
......
...@@ -728,7 +728,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -728,7 +728,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution"); CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution");
gpu->psLincsSolution = psLincsSolution; gpu->psLincsSolution = psLincsSolution;
gpu->sim.pLincsSolution = psLincsSolution->_pDevData; gpu->sim.pLincsSolution = psLincsSolution->_pDevData;
CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1, "SyncCounter"); CUDAStream<short>* psSyncCounter = new CUDAStream<short>(2*gpu->sim.blocks, 1, "SyncCounter");
gpu->psSyncCounter = psSyncCounter; gpu->psSyncCounter = psSyncCounter;
gpu->sim.pSyncCounter = psSyncCounter->_pDevData; gpu->sim.pSyncCounter = psSyncCounter->_pDevData;
gpu->sim.lincsConstraints = numLincs; gpu->sim.lincsConstraints = numLincs;
...@@ -743,7 +743,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -743,7 +743,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
(*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j]; (*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j];
} }
for (unsigned int i = 0; i < psSyncCounter->_length; i++) for (unsigned int i = 0; i < psSyncCounter->_length; i++)
(*psSyncCounter)[i] = 0; (*psSyncCounter)[i] = -1;
for (unsigned int i = 0; i < atomConstraints.size(); i++) { for (unsigned int i = 0; i < atomConstraints.size(); i++) {
(*psLincsNumAtomConstraints)[i] = atomConstraints[i].size(); (*psLincsNumAtomConstraints)[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) for (unsigned int j = 0; j < atomConstraints[i].size(); j++)
...@@ -761,8 +761,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -761,8 +761,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks; gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.lincs_threads_per_block > gpu->sim.max_shake_threads_per_block) if (gpu->sim.lincs_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.lincs_threads_per_block = gpu->sim.max_shake_threads_per_block; gpu->sim.lincs_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.lincs_threads_per_block < 1) if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
gpu->sim.lincs_threads_per_block = 1; gpu->sim.lincs_threads_per_block = gpu->sim.blocks;
gpu->psLincsNumConnections->Download(); gpu->psLincsNumConnections->Download();
......
...@@ -140,7 +140,7 @@ struct _gpuContext { ...@@ -140,7 +140,7 @@ struct _gpuContext {
CUDAStream<float>* psLincsRhs1; // Workspace for LINCS CUDAStream<float>* psLincsRhs1; // Workspace for LINCS
CUDAStream<float>* psLincsRhs2; // Workspace for LINCS CUDAStream<float>* psLincsRhs2; // Workspace for LINCS
CUDAStream<float>* psLincsSolution; // Workspace for LINCS CUDAStream<float>* psLincsSolution; // Workspace for LINCS
CUDAStream<unsigned int>* psSyncCounter;// Used for global thread synchronization CUDAStream<short>* psSyncCounter; // Used for global thread synchronization
}; };
typedef struct _gpuContext *gpuContext; typedef struct _gpuContext *gpuContext;
......
...@@ -52,36 +52,28 @@ void GetLincsSim(gpuContext gpu) ...@@ -52,36 +52,28 @@ void GetLincsSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kUpdateAtomPositions_kernel(float4* atomPositions) /**
* Synchronize all threads across all blocks.
*/
__device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
{ {
// Update the atom positions based on the solution to the matrix equations. // short* syncCounter = &cSim.pSyncCounter[newCount%2 == 0 ? 0 : gridDim.x];
__syncthreads();
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; if (threadIdx.x == 0)
while (pos < cSim.atoms) syncCounter[blockIdx.x] = newCount;
if (threadIdx.x < gridDim.x)
{ {
float4 atomPos = atomPositions[pos]; volatile short* counter = &syncCounter[threadIdx.x];
float invMass = cSim.pVelm4[pos].w; do
int num = cSim.pLincsNumAtomConstraints[pos];
for (int i = 0; i < num; i++)
{ {
int index = pos+i*cSim.atoms; } while (*counter != newCount);
int constraint = cSim.pLincsAtomConstraints[index];
float4 dir = cSim.pLincsDistance[constraint];
float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
atomPos.x += c*dir.x;
atomPos.y += c*dir.y;
atomPos.z += c*dir.z;
}
atomPositions[pos] = atomPos;
pos += blockDim.x * gridDim.x;
} }
__syncthreads();
} }
__global__ void kIterateLincsMatrix_kernel(int iteration) __global__ void kSolveLincsMatrix_kernel(float4* atomPositions)
{ {
// Perform one iteration of inverting the matrix. for (unsigned int iteration = 0; iteration < cSim.lincsTerms; iteration++) {
float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2); float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1); float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -99,6 +91,31 @@ __global__ void kIterateLincsMatrix_kernel(int iteration) ...@@ -99,6 +91,31 @@ __global__ void kIterateLincsMatrix_kernel(int iteration)
cSim.pLincsSolution[pos] += rhs; cSim.pLincsSolution[pos] += rhs;
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
kSyncAllThreads_kernel(&cSim.pSyncCounter[iteration%2 == 0 ? 0 : gridDim.x], iteration);
}
// Update the atom positions based on the solution to the matrix equations.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.atoms)
{
float4 atomPos = atomPositions[pos];
float invMass = cSim.pVelm4[pos].w;
int num = cSim.pLincsNumAtomConstraints[pos];
for (int i = 0; i < num; i++)
{
int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index];
float4 dir = cSim.pLincsDistance[constraint];
float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
atomPos.x += c*dir.x;
atomPos.y += c*dir.y;
atomPos.z += c*dir.z;
}
atomPositions[pos] = atomPos;
pos += blockDim.x * gridDim.x;
}
} }
__global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPosition) __global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPosition)
...@@ -136,13 +153,11 @@ __global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPositi ...@@ -136,13 +153,11 @@ __global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPositi
cSim.pLincsSolution[pos] = diff; cSim.pLincsSolution[pos] = diff;
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
} kSyncAllThreads_kernel(cSim.pSyncCounter, cSim.lincsTerms+1);
__global__ void kApplyLincsPart2_kernel()
{
// Build the coupling matrix. // Build the coupling matrix.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.lincsConstraints) while (pos < cSim.lincsConstraints)
{ {
float4 dir1 = cSim.pLincsDistance[pos]; float4 dir1 = cSim.pLincsDistance[pos];
...@@ -163,7 +178,7 @@ __global__ void kApplyLincsPart2_kernel() ...@@ -163,7 +178,7 @@ __global__ void kApplyLincsPart2_kernel()
} }
} }
__global__ void kApplyLincsPart3_kernel(float4* atomPositions, bool addOldPosition) __global__ void kApplyLincsPart2_kernel(float4* atomPositions, bool addOldPosition)
{ {
// Correct for rotational lengthening. // Correct for rotational lengthening.
...@@ -200,24 +215,12 @@ static void kApplyLincs(gpuContext gpu, float4* atomPositions, bool addOldPositi ...@@ -200,24 +215,12 @@ static void kApplyLincs(gpuContext gpu, float4* atomPositions, bool addOldPositi
{ {
kApplyLincsPart1_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition); kApplyLincsPart1_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
LAUNCHERROR("kApplyLincsPart1"); LAUNCHERROR("kApplyLincsPart1");
kApplyLincsPart2_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(); kSolveLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
LAUNCHERROR("kSolveLincsMatrix_kernel");
kApplyLincsPart2_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
LAUNCHERROR("kApplyLincsPart2"); LAUNCHERROR("kApplyLincsPart2");
for (int i = 0; i < gpu->sim.lincsTerms; ++i) kSolveLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
{ LAUNCHERROR("kSolveLincsMatrix_kernel");
kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
LAUNCHERROR("kIterateLincsMatrix_kernel");
}
kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
LAUNCHERROR("kUpdateAtomPositions");
kApplyLincsPart3_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
LAUNCHERROR("kApplyLincsPart3");
for (int i = 0; i < gpu->sim.lincsTerms; ++i)
{
kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
LAUNCHERROR("kIterateLincsMatrix_kernel");
}
kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
LAUNCHERROR("kUpdateAtomPositions");
} }
void kApplyFirstLincs(gpuContext gpu) void kApplyFirstLincs(gpuContext gpu)
......
...@@ -165,7 +165,7 @@ void testConstraints() { ...@@ -165,7 +165,7 @@ void testConstraints() {
Vec3 p1 = state.getPositions()[particle1]; Vec3 p1 = state.getPositions()[particle1];
Vec3 p2 = state.getPositions()[particle2]; Vec3 p2 = state.getPositions()[particle2];
double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2])); double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
ASSERT_EQUAL_TOL(distance, dist, 2e-5); ASSERT_EQUAL_TOL(distance, dist, 1e-4);
} }
integrator.step(1); integrator.step(1);
} }
......
...@@ -170,7 +170,7 @@ void testConstraints() { ...@@ -170,7 +170,7 @@ void testConstraints() {
Vec3 p1 = state.getPositions()[particle1]; Vec3 p1 = state.getPositions()[particle1];
Vec3 p2 = state.getPositions()[particle2]; Vec3 p2 = state.getPositions()[particle2];
double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2])); double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
ASSERT_EQUAL_TOL(distance, dist, 2e-5); ASSERT_EQUAL_TOL(distance, dist, 1e-4);
} }
integrator.step(1); integrator.step(1);
} }
......
...@@ -127,7 +127,7 @@ void testConstraints() { ...@@ -127,7 +127,7 @@ void testConstraints() {
Vec3 p1 = state.getPositions()[particle1]; Vec3 p1 = state.getPositions()[particle1];
Vec3 p2 = state.getPositions()[particle2]; Vec3 p2 = state.getPositions()[particle2];
double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2])); double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
ASSERT_EQUAL_TOL(distance, dist, 2e-5); ASSERT_EQUAL_TOL(distance, dist, 1e-4);
} }
double energy = state.getKineticEnergy()+state.getPotentialEnergy(); double energy = state.getKineticEnergy()+state.getPotentialEnergy();
if (i == 1) if (i == 1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment