/** * Sum a value over all threads. */ DEVICE real reduceValue(real value, LOCAL_ARG volatile real* temp) { const int thread = LOCAL_ID; SYNC_THREADS; temp[thread] = value; SYNC_THREADS; for (int step = 1; step < 32; step *= 2) { if (thread+step < LOCAL_SIZE && thread%(2*step) == 0) temp[thread] = temp[thread] + temp[thread+step]; SYNC_WARPS; } for (int step = 32; step < LOCAL_SIZE; step *= 2) { if (thread+step < LOCAL_SIZE && thread%(2*step) == 0) temp[thread] = temp[thread] + temp[thread+step]; SYNC_THREADS; } return temp[0]; } /** * Compute the correlation matrix, used in finding the optimal rotation. This is executed as a single work group. */ KERNEL void computeCorrelationMatrix(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT referencePos, GLOBAL const int* RESTRICT particles, GLOBAL real* buffer) { LOCAL volatile real temp[THREAD_BLOCK_SIZE]; // Compute the center of the particle positions. real3 center = make_real3(0); for (int i = LOCAL_ID; i < numParticles; i += LOCAL_SIZE) center += trimTo3(posq[particles[i]]); center.x = reduceValue(center.x, temp)/numParticles; center.y = reduceValue(center.y, temp)/numParticles; center.z = reduceValue(center.z, temp)/numParticles; // Compute the correlation matrix. real R[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; for (int i = LOCAL_ID; i < numParticles; i += LOCAL_SIZE) { int index = particles[i]; real3 pos = trimTo3(posq[index]) - center; real3 refPos = trimTo3(referencePos[index]); R[0][0] += refPos.x*pos.x; R[0][1] += refPos.x*pos.y; R[0][2] += refPos.x*pos.z; R[1][0] += refPos.y*pos.x; R[1][1] += refPos.y*pos.y; R[1][2] += refPos.y*pos.z; R[2][0] += refPos.z*pos.x; R[2][1] += refPos.z*pos.y; R[2][2] += refPos.z*pos.z; } for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R[i][j] = reduceValue(R[i][j], temp); // Copy everything into the output buffer to send back to the host. if (LOCAL_ID == 0) for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) buffer[3*i+j] = R[i][j]; } /** * Apply forces based on the orientation. */ KERNEL void computeOrientationForces(int numParticles, int paddedNumAtoms, GLOBAL const real4* RESTRICT referencePos, GLOBAL const int* RESTRICT particles, const real dxdq, const real4 eigenvalues, GLOBAL const real4* RESTRICT eigenvectors, GLOBAL mm_long* RESTRICT forceBuffers) { const int3 dsIndex[4][4] = { {make_int3(0, 1, 2), make_int3(0, 2, 1), make_int3(2, 0, 0), make_int3(1, 0, 0)}, {make_int3(0, 2, 1), make_int3(0, 1, 2), make_int3(1, 0, 0), make_int3(2, 0, 0)}, {make_int3(2, 0, 0), make_int3(1, 0, 0), make_int3(0, 1, 2), make_int3(0, 2, 1)}, {make_int3(1, 0, 0), make_int3(2, 0, 0), make_int3(0, 2, 1), make_int3(0, 1, 2)} }; const int3 dsScale[4][4] = { {make_int3(1, 1, 1), make_int3(0, -1, 1), make_int3(1, 0, -1), make_int3(-1, 1, 0)}, {make_int3(0, -1, 1), make_int3(1, -1, -1), make_int3(1, 1, 0), make_int3(1, 0, 1)}, {make_int3(1, 0, -1), make_int3(1, 1, 0), make_int3(-1, 1, -1), make_int3(0, 1, 1)}, {make_int3(-1, 1, 0), make_int3(1, 0, 1), make_int3(0, 1, 1), make_int3(-1, -1, 1)} }; for (int index = GLOBAL_ID; index < numParticles; index += GLOBAL_SIZE) { int particle = particles[index]; real3 refPos = trimTo3(referencePos[particle]); real p[3] = {refPos.x, refPos.y, refPos.z}; real3 dq = make_real3(0); for (int i = 0; i < 4; i++) for (int j = 0; j < 4; j++) { real scale = ((eigenvectors[i].z*eigenvectors[0].z) / (eigenvalues.w-eigenvalues.z) + (eigenvectors[i].y*eigenvectors[0].y) / (eigenvalues.w-eigenvalues.y) + (eigenvectors[i].x*eigenvectors[0].x) / (eigenvalues.w-eigenvalues.x)) * eigenvectors[j].w; real3 ds = make_real3(dsScale[i][j].x*p[dsIndex[i][j].x], dsScale[i][j].y*p[dsIndex[i][j].y], dsScale[i][j].z*p[dsIndex[i][j].z]); dq += scale*ds; } real3 force = -dxdq*dq; forceBuffers[particle] += realToFixedPoint(force.x); forceBuffers[particle+paddedNumAtoms] += realToFixedPoint(force.y); forceBuffers[particle+2*paddedNumAtoms] += realToFixedPoint(force.z); } }