typedef struct { mm_ulong state, increment; float next; bool nextIsValid; } RandomState; /** * Generate a random number with PCH-XSH-RR. */ DEVICE unsigned int getRandomInt(RandomState* random) { unsigned int xs = ((random->state>>18)^random->state)>>27; unsigned int rot = random->state>>59; random->state = random->state*6364136223846793005ULL + random->increment; return (xs>>rot) | (xs<<((-rot)&31)); } /** * Generate a normally distributed random number with Box-Muller. */ DEVICE float getRandomNormal(RandomState* random) { if (random->nextIsValid) { random->nextIsValid = false; return random->next; } float scale = 1/(float) 0x100000000; float x = scale*max(getRandomInt(random), 1u); float y = scale*getRandomInt(random); float multiplier = SQRT(-2*LOG(x)); float angle = 2*M_PI*y; random->next = multiplier*COS(angle); random->nextIsValid = true; return multiplier*SIN(angle); } /** * Perform the first part of integration: velocity step. */ KERNEL void integrateDPDPart1(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed2* RESTRICT dt, GLOBAL int* RESTRICT tileCounter) { mixed fscale = dt[0].y/(mixed) 0x100000000; for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) { mixed4 velocity = velm[index]; if (velocity.w != 0.0) { velocity.x += fscale*velocity.w*force[index]; velocity.y += fscale*velocity.w*force[index+paddedNumAtoms]; velocity.z += fscale*velocity.w*force[index+paddedNumAtoms*2]; velm[index] = velocity; } } if (GLOBAL_ID == 0) *tileCounter = 0; } /** * Load the position of a particle. */ inline DEVICE mixed3 loadPos(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, int index) { #ifdef USE_MIXED_PRECISION real4 pos1 = posq[index]; real4 pos2 = posqCorrection[index]; return make_mixed3(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z); #else return trimTo3(posq[index]); #endif } /** * Compute the friction and noise for a pair of particles. */ DEVICE void processPair(int i, int j, real3 delta, mixed4 vel1, mixed4 vel2, int type1, int type2, int paddedNumAtoms, GLOBAL mm_ulong* RESTRICT velDelta, mixed dt, float kT, int numTypes, GLOBAL const float2* RESTRICT params, RandomState* random, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { if (vel1.w == 0 && vel2.w == 0) return; float2 pairParams = params[type1+type2*numTypes]; float friction = pairParams.x; float cutoff = pairParams.y; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real invR = RSQRT(r2); real r = r2*invR; if (r >= cutoff) return; real mass1 = (vel1.w == 0 ? 0 : 1/vel1.w); real mass2 = (vel2.w == 0 ? 0 : 1/vel2.w); real m = mass1*mass2/(mass1+mass2); real omega = 1-(r/cutoff); real vscale = EXP(-dt*2*friction*omega*omega); real noisescale = SQRT(1-vscale*vscale); real3 dir = delta*invR; real3 v = make_real3(vel2.x-vel1.x, vel2.y-vel1.y, vel2.z-vel1.z); real dv = (1-vscale)*dot(v, dir) + noisescale*SQRT(kT/m)*getRandomNormal(random); ATOMIC_ADD(&velDelta[i], (mm_ulong) realToFixedPoint(m*vel1.w*dv*dir.x)); ATOMIC_ADD(&velDelta[i+paddedNumAtoms], (mm_ulong) realToFixedPoint(m*vel1.w*dv*dir.y)); ATOMIC_ADD(&velDelta[i+2*paddedNumAtoms], (mm_ulong) realToFixedPoint(m*vel1.w*dv*dir.z)); ATOMIC_ADD(&velDelta[j], (mm_ulong) realToFixedPoint(-m*vel2.w*dv*dir.x)); ATOMIC_ADD(&velDelta[j+paddedNumAtoms], (mm_ulong) realToFixedPoint(-m*vel2.w*dv*dir.y)); ATOMIC_ADD(&velDelta[j+2*paddedNumAtoms], (mm_ulong) realToFixedPoint(-m*vel2.w*dv*dir.z)); } /** * Perform the second part of integration: position half step, then interact with heat bath. */ KERNEL void integrateDPDPart2(int numAtoms, int paddedNumAtoms, GLOBAL const real4* RESTRICT posq, GLOBAL const mixed4* RESTRICT velm, GLOBAL mm_long* RESTRICT velDelta, GLOBAL const mixed2* RESTRICT dt, GLOBAL const int* particleType, int numTypes, GLOBAL const float2* RESTRICT params, mm_long seed, float kT, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, GLOBAL const int2* RESTRICT exclusionTiles, int numExclusionTiles, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, GLOBAL const real4* RESTRICT blockCenter, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms, GLOBAL int* RESTRICT tileCounter #ifdef USE_MIXED_PRECISION , GLOBAL const real4* RESTRICT posqCorrection #endif ) { const int totalWarps = GLOBAL_SIZE/TILE_SIZE; const int warp = GLOBAL_ID/TILE_SIZE; const int tgx = LOCAL_ID & (TILE_SIZE-1); const int tbx = LOCAL_ID - tgx; mixed halfdt = 0.5f*dt[0].y; LOCAL mixed3 localPos[WORK_GROUP_SIZE]; LOCAL mixed4 localVel[WORK_GROUP_SIZE]; LOCAL volatile int localType[WORK_GROUP_SIZE]; #ifndef USE_MIXED_PRECISION GLOBAL real4* posqCorrection = 0; #endif // Initialize the random generator for this thread. The seed is incremented each step, // and the stream ID is the global thread index. Skipping a variable number of values // also seems to be necessary to decorrelate the streams. RandomState random; random.state = 0; random.increment = (GLOBAL_ID<<1) | 1; random.nextIsValid = false; getRandomInt(&random); random.state += seed; getRandomInt(&random); for (int i = 0; i < LOCAL_ID%16; i++) getRandomInt(&random); // First loop: process fixed tiles (ones that contain exclusions). for (int tile = warp; tile < numExclusionTiles; tile += totalWarps) { const int2 tileIndices = exclusionTiles[tile]; const unsigned int x = tileIndices.x; const unsigned int y = tileIndices.y; int atom1 = x*TILE_SIZE + tgx; mixed4 vel1 = velm[atom1]; mixed3 pos1 = loadPos(posq, posqCorrection, atom1); if (vel1.w != 0) pos1 += halfdt*trimTo3(vel1); int type1 = particleType[atom1]; if (x == y) { localVel[LOCAL_ID] = vel1; localPos[LOCAL_ID] = pos1; localType[LOCAL_ID] = type1; } else { int atom2 = y*TILE_SIZE + tgx; mixed4 vel2 = velm[atom2]; mixed3 pos2 = loadPos(posq, posqCorrection, atom2); if (vel2.w != 0) pos2 += halfdt*trimTo3(vel2); localVel[LOCAL_ID] = vel2; localPos[LOCAL_ID] = pos2; localType[LOCAL_ID] = particleType[atom2]; } SYNC_WARPS; if (atom1 < numAtoms) { for (int i = 0; i < TILE_SIZE; i++) { #ifdef INTEL_WORKAROUND // Workaround for bug in Intel's OpenCL for CPUs. MEM_FENCE; #endif int atom2 = y*TILE_SIZE+i; if ((x != y || atom1 < atom2) && atom2 < numAtoms) { mixed3 pos2 = localPos[tbx+i]; real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z); #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(delta) #endif processPair(atom1, atom2, delta, vel1, localVel[tbx+i], type1, localType[tbx+i], paddedNumAtoms, (GLOBAL mm_ulong*) velDelta, dt[0].y, kT, numTypes, params, &random, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); } } } #ifdef NVIDIA_WORKAROUND SYNC_THREADS; #else SYNC_WARPS; #endif } // Second loop: process tiles from the neighbor list. unsigned int numTiles = interactionCount[0]; LOCAL int atomIndices[WORK_GROUP_SIZE]; LOCAL int nextTile[WORK_GROUP_SIZE/TILE_SIZE]; for (int tile = warp; tile < numTiles; tile += totalWarps) { if (tgx == 0) nextTile[tbx/TILE_SIZE] = ATOMIC_ADD(tileCounter, 1); SYNC_WARPS; int tileIndex = nextTile[tbx/TILE_SIZE]; int x = tiles[tileIndex]; real4 blockSizeX = blockSize[x]; bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF); int atom1 = x*TILE_SIZE + tgx; mixed4 vel1 = velm[atom1]; mixed3 pos1 = loadPos(posq, posqCorrection, atom1); if (vel1.w != 0) pos1 += halfdt*trimTo3(vel1); int type1 = particleType[atom1]; int atom2 = interactingAtoms[tileIndex*TILE_SIZE+tgx]; atomIndices[LOCAL_ID] = atom2; mixed4 vel2 = velm[atom2]; mixed3 pos2 = loadPos(posq, posqCorrection, atom2); if (vel2.w != 0) pos2 += halfdt*trimTo3(vel2); localVel[LOCAL_ID] = vel2; localPos[LOCAL_ID] = pos2; localType[LOCAL_ID] = particleType[atom2]; #ifdef USE_PERIODIC if (singlePeriodicCopy) { // The box is small enough that we can just translate all the atoms into a single periodic // box, then skip having to apply periodic boundary conditions later. real4 blockCenterX = blockCenter[x]; APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(localPos[LOCAL_ID], blockCenterX) SYNC_WARPS; if (atom1 < numAtoms) { for (int i = 0; i < TILE_SIZE; i++) { #ifdef INTEL_WORKAROUND // Workaround for bug in Intel's OpenCL for CPUs. MEM_FENCE; #endif int atom2 = atomIndices[tbx+i]; if (atom2 < numAtoms) { pos2 = localPos[tbx+i]; real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z); processPair(atom1, atom2, delta, vel1, localVel[tbx+i], type1, localType[tbx+i], paddedNumAtoms, (GLOBAL mm_ulong*) velDelta, dt[0].y, kT, numTypes, params, &random, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); } } } } else #endif { // We need to apply periodic boundary conditions separately for each interaction. SYNC_WARPS; if (atom1 < numAtoms) { for (int i = 0; i < TILE_SIZE; i++) { #ifdef INTEL_WORKAROUND // Workaround for bug in Intel's OpenCL for CPUs. MEM_FENCE; #endif int atom2 = atomIndices[tbx+i]; if (atom2 < numAtoms) { pos2 = localPos[tbx+i]; real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z); #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(delta) #endif processPair(atom1, atom2, delta, vel1, localVel[tbx+i], type1, localType[tbx+i], paddedNumAtoms, (GLOBAL mm_ulong*) velDelta, dt[0].y, kT, numTypes, params, &random, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); } } } } SYNC_WARPS; } } /** * Perform the third part of integration: update the velocities and the second position half step. */ KERNEL void integrateDPDPart3(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL mm_long* RESTRICT velDelta, GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt) { mixed halfdt = 0.5f*dt[0].y; mixed deltascale = 1/(mixed) 0x100000000; for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) { mixed4 velocity = velm[index]; if (velocity.w != 0.0) { mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0); velocity.x += deltascale*velDelta[index]; velocity.y += deltascale*velDelta[index+paddedNumAtoms]; velocity.z += deltascale*velDelta[index+paddedNumAtoms*2]; velm[index] = velocity; delta += make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0); posDelta[index] = delta; oldDelta[index] = delta; } } } /** * Perform the fourth part of integration: apply constraint forces to velocities, then record * the constrained positions. */ KERNEL void integrateDPDPart4(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt #ifdef USE_MIXED_PRECISION , GLOBAL real4* RESTRICT posqCorrection #endif ) { mixed invDt = 1/dt[0].y; for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) { mixed4 velocity = velm[index]; if (velocity.w != 0.0) { mixed4 delta = posDelta[index]; velocity.x += (delta.x-oldDelta[index].x)*invDt; velocity.y += (delta.y-oldDelta[index].y)*invDt; velocity.z += (delta.z-oldDelta[index].z)*invDt; velm[index] = velocity; #ifdef USE_MIXED_PRECISION real4 pos1 = posq[index]; real4 pos2 = posqCorrection[index]; mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); #else real4 pos = posq[index]; #endif pos.x += delta.x; pos.y += delta.y; pos.z += delta.z; #ifdef USE_MIXED_PRECISION posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); #else posq[index] = pos; #endif } } }