Commit afd06645 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimized LINCS by merging kernels and implementing a syncAllThreads() function.

parent bf7a968c
......@@ -117,7 +117,7 @@ ELSE(CUDA_BUILD_TYPE MATCHES "Emulation")
ENDIF(CUDA_BUILD_TYPE MATCHES "Emulation")
SET(CUDA_BUILD_CUBIN TRUE CACHE BOOL "Generate and parse .cubin files in Device mode.")
SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0;-arch=sm_11" CACHE STRING "Semi-colon delimit multiple arguments.")
SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0" CACHE STRING "Semi-colon delimit multiple arguments.")
# Search for the cuda distribution.
IF(NOT CUDA_INSTALL_PREFIX)
......
......@@ -388,7 +388,7 @@ struct cudaGmxSimulation {
float* pLincsSolution; // Workspace for LINCS
int* pLincsAtomConstraints; // The indices of constraints involving each atom
int* pLincsNumAtomConstraints; // The number of constraints involving each atom
unsigned int* pSyncCounter; // Used for global thread synchronization
short* pSyncCounter; // Used for global thread synchronization
// Mutable stuff
float4* pPosq; // Pointer to atom positions and charges
......
......@@ -728,7 +728,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution");
gpu->psLincsSolution = psLincsSolution;
gpu->sim.pLincsSolution = psLincsSolution->_pDevData;
CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1, "SyncCounter");
CUDAStream<short>* psSyncCounter = new CUDAStream<short>(2*gpu->sim.blocks, 1, "SyncCounter");
gpu->psSyncCounter = psSyncCounter;
gpu->sim.pSyncCounter = psSyncCounter->_pDevData;
gpu->sim.lincsConstraints = numLincs;
......@@ -743,7 +743,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
(*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j];
}
for (unsigned int i = 0; i < psSyncCounter->_length; i++)
(*psSyncCounter)[i] = 0;
(*psSyncCounter)[i] = -1;
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
(*psLincsNumAtomConstraints)[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++)
......@@ -761,8 +761,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.lincs_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.lincs_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.lincs_threads_per_block < 1)
gpu->sim.lincs_threads_per_block = 1;
if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
gpu->sim.lincs_threads_per_block = gpu->sim.blocks;
gpu->psLincsNumConnections->Download();
......
......@@ -140,7 +140,7 @@ struct _gpuContext {
CUDAStream<float>* psLincsRhs1; // Workspace for LINCS
CUDAStream<float>* psLincsRhs2; // Workspace for LINCS
CUDAStream<float>* psLincsSolution; // Workspace for LINCS
CUDAStream<unsigned int>* psSyncCounter;// Used for global thread synchronization
CUDAStream<short>* psSyncCounter; // Used for global thread synchronization
};
typedef struct _gpuContext *gpuContext;
......
......@@ -52,36 +52,28 @@ void GetLincsSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__ void kUpdateAtomPositions_kernel(float4* atomPositions)
/**
* Synchronize all threads across all blocks.
*/
__device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
{
// Update the atom positions based on the solution to the matrix equations.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.atoms)
// short* syncCounter = &cSim.pSyncCounter[newCount%2 == 0 ? 0 : gridDim.x];
__syncthreads();
if (threadIdx.x == 0)
syncCounter[blockIdx.x] = newCount;
if (threadIdx.x < gridDim.x)
{
float4 atomPos = atomPositions[pos];
float invMass = cSim.pVelm4[pos].w;
int num = cSim.pLincsNumAtomConstraints[pos];
for (int i = 0; i < num; i++)
volatile short* counter = &syncCounter[threadIdx.x];
do
{
int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index];
float4 dir = cSim.pLincsDistance[constraint];
float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
atomPos.x += c*dir.x;
atomPos.y += c*dir.y;
atomPos.z += c*dir.z;
}
atomPositions[pos] = atomPos;
pos += blockDim.x * gridDim.x;
} while (*counter != newCount);
}
__syncthreads();
}
__global__ void kIterateLincsMatrix_kernel(int iteration)
__global__ void kSolveLincsMatrix_kernel(float4* atomPositions)
{
// Perform one iteration of inverting the matrix.
for (unsigned int iteration = 0; iteration < cSim.lincsTerms; iteration++) {
float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
......@@ -99,6 +91,31 @@ __global__ void kIterateLincsMatrix_kernel(int iteration)
cSim.pLincsSolution[pos] += rhs;
pos += blockDim.x * gridDim.x;
}
kSyncAllThreads_kernel(&cSim.pSyncCounter[iteration%2 == 0 ? 0 : gridDim.x], iteration);
}
// Update the atom positions based on the solution to the matrix equations.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.atoms)
{
float4 atomPos = atomPositions[pos];
float invMass = cSim.pVelm4[pos].w;
int num = cSim.pLincsNumAtomConstraints[pos];
for (int i = 0; i < num; i++)
{
int index = pos+i*cSim.atoms;
int constraint = cSim.pLincsAtomConstraints[index];
float4 dir = cSim.pLincsDistance[constraint];
float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
atomPos.x += c*dir.x;
atomPos.y += c*dir.y;
atomPos.z += c*dir.z;
}
atomPositions[pos] = atomPos;
pos += blockDim.x * gridDim.x;
}
}
__global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPosition)
......@@ -136,13 +153,11 @@ __global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPositi
cSim.pLincsSolution[pos] = diff;
pos += blockDim.x * gridDim.x;
}
}
kSyncAllThreads_kernel(cSim.pSyncCounter, cSim.lincsTerms+1);
__global__ void kApplyLincsPart2_kernel()
{
// Build the coupling matrix.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.lincsConstraints)
{
float4 dir1 = cSim.pLincsDistance[pos];
......@@ -163,7 +178,7 @@ __global__ void kApplyLincsPart2_kernel()
}
}
__global__ void kApplyLincsPart3_kernel(float4* atomPositions, bool addOldPosition)
__global__ void kApplyLincsPart2_kernel(float4* atomPositions, bool addOldPosition)
{
// Correct for rotational lengthening.
......@@ -200,24 +215,12 @@ static void kApplyLincs(gpuContext gpu, float4* atomPositions, bool addOldPositi
{
kApplyLincsPart1_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
LAUNCHERROR("kApplyLincsPart1");
kApplyLincsPart2_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>();
kSolveLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
LAUNCHERROR("kSolveLincsMatrix_kernel");
kApplyLincsPart2_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
LAUNCHERROR("kApplyLincsPart2");
for (int i = 0; i < gpu->sim.lincsTerms; ++i)
{
kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
LAUNCHERROR("kIterateLincsMatrix_kernel");
}
kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
LAUNCHERROR("kUpdateAtomPositions");
kApplyLincsPart3_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
LAUNCHERROR("kApplyLincsPart3");
for (int i = 0; i < gpu->sim.lincsTerms; ++i)
{
kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
LAUNCHERROR("kIterateLincsMatrix_kernel");
}
kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
LAUNCHERROR("kUpdateAtomPositions");
kSolveLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
LAUNCHERROR("kSolveLincsMatrix_kernel");
}
void kApplyFirstLincs(gpuContext gpu)
......
......@@ -165,7 +165,7 @@ void testConstraints() {
Vec3 p1 = state.getPositions()[particle1];
Vec3 p2 = state.getPositions()[particle2];
double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
ASSERT_EQUAL_TOL(distance, dist, 2e-5);
ASSERT_EQUAL_TOL(distance, dist, 1e-4);
}
integrator.step(1);
}
......
......@@ -170,7 +170,7 @@ void testConstraints() {
Vec3 p1 = state.getPositions()[particle1];
Vec3 p2 = state.getPositions()[particle2];
double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
ASSERT_EQUAL_TOL(distance, dist, 2e-5);
ASSERT_EQUAL_TOL(distance, dist, 1e-4);
}
integrator.step(1);
}
......
......@@ -127,7 +127,7 @@ void testConstraints() {
Vec3 p1 = state.getPositions()[particle1];
Vec3 p2 = state.getPositions()[particle2];
double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
ASSERT_EQUAL_TOL(distance, dist, 2e-5);
ASSERT_EQUAL_TOL(distance, dist, 1e-4);
}
double energy = state.getKineticEnergy()+state.getPotentialEnergy();
if (i == 1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment