#define WARP_SIZE 32 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 #define WARP_SHUFFLE(local, index) __shfl_sync(0xffffffff, local, index) #define WARP_SHUFFLE_DOWN(local, offset) __shfl_down_sync(0xffffffff, local, offset) #elif defined(USE_HIP) #define WARP_SHUFFLE(local, index) __shfl(local, index) #define WARP_SHUFFLE_DOWN(local, offset) __shfl_down(local, offset) #endif #ifdef WARP_SHUFFLE_DOWN #define TEMP_SIZE WARP_SIZE #else #define TEMP_SIZE THREAD_BLOCK_SIZE #endif DEVICE real reduceValue(real value, LOCAL_ARG volatile real* temp) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value = lane < warpCount ? temp[lane] : 0; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_THREADS; } #endif return temp[0]; } KERNEL void checkSavedElectrodePositions(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT electrodePosData, GLOBAL int* RESTRICT elecToSys, GLOBAL int* RESTRICT result) { for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { real4 posqPosition = posq[elecToSys[ii]]; real4 savedPosition = electrodePosData[ii]; if (posqPosition.x != savedPosition.x || posqPosition.y != savedPosition.y || posqPosition.z != savedPosition.z) { *result = 1; break; } } } KERNEL void saveElectrodePositions(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT electrodePosData, GLOBAL int* RESTRICT elecToSys) { for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { electrodePosData[ii] = posq[elecToSys[ii]]; } } KERNEL void solve(GLOBAL real* RESTRICT electrodeCharges, GLOBAL real* RESTRICT chargeDerivatives, GLOBAL real* RESTRICT capacitance #ifdef USE_CHARGE_CONSTRAINT , GLOBAL real* RESTRICT constraintVector, real chargeTarget #endif ) { // This kernel expects to be executed in a single thread block. #if CHUNK_SIZE > 1 LOCAL volatile real chunkCharges[CHUNK_SIZE]; #endif for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { electrodeCharges[ii] = -chargeDerivatives[ii]; } SYNC_THREADS; // Cholesky solve step 1 (outer loop over chunks of rows). for (int jj = 0; jj < PADDED_PROBLEM_SIZE; jj += CHUNK_SIZE) { if (LOCAL_ID < CHUNK_SIZE) { #if CHUNK_SIZE > 1 #ifdef WARP_SHUFFLE real threadCharge = electrodeCharges[jj + LOCAL_ID]; for (int k = 0; k < CHUNK_SIZE - 1; k++) { const real chargeShuffled = WARP_SHUFFLE(threadCharge, k); if (LOCAL_ID > k) { threadCharge -= chargeShuffled * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)]; } } SYNC_WARPS; electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID] = threadCharge; #else chunkCharges[LOCAL_ID] = electrodeCharges[jj + LOCAL_ID]; for (int k = 0; k < CHUNK_SIZE - 1; k++) { SYNC_WARPS; if (LOCAL_ID > k) { chunkCharges[LOCAL_ID] -= chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)]; } } SYNC_WARPS; electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID]; #endif #endif } SYNC_THREADS; for (int ii = jj + CHUNK_SIZE + LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { #if CHUNK_SIZE > 1 real chargeOffset = 0; for (int k = 0; k < CHUNK_SIZE; k++) { chargeOffset += chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + ii]; } electrodeCharges[ii] -= chargeOffset; #else electrodeCharges[ii] -= electrodeCharges[jj] * capacitance[(mm_long) jj * PADDED_PROBLEM_SIZE + ii]; #endif } SYNC_THREADS; } for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { electrodeCharges[ii] *= capacitance[(mm_long) ii * PADDED_PROBLEM_SIZE + ii]; } SYNC_THREADS; // Cholesky solve step 2 (outer loop over chunks of columns). for (int jj = PADDED_PROBLEM_SIZE - CHUNK_SIZE; jj >= 0; jj -= CHUNK_SIZE) { if (LOCAL_ID < CHUNK_SIZE) { #if CHUNK_SIZE > 1 #ifdef WARP_SHUFFLE real threadCharge = electrodeCharges[jj + LOCAL_ID]; for (int k = CHUNK_SIZE - 1; k >= 0; k--) { const real chargeShuffled = WARP_SHUFFLE(threadCharge, k); if (LOCAL_ID < k) { threadCharge -= chargeShuffled * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)]; } } SYNC_WARPS; electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID] = threadCharge; #else chunkCharges[LOCAL_ID] = electrodeCharges[jj + LOCAL_ID]; for (int k = CHUNK_SIZE - 1; k >= 0; k--) { SYNC_WARPS; if (LOCAL_ID < k) { chunkCharges[LOCAL_ID] -= chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + (jj + LOCAL_ID)]; } } SYNC_WARPS; electrodeCharges[jj + LOCAL_ID] = chunkCharges[LOCAL_ID]; #endif #endif } SYNC_THREADS; for (int ii = LOCAL_ID; ii < jj; ii += LOCAL_SIZE) { #if CHUNK_SIZE > 1 real chargeOffset = 0; for (int k = 0; k < CHUNK_SIZE; k++) { chargeOffset += chunkCharges[k] * capacitance[(mm_long) (jj + k) * PADDED_PROBLEM_SIZE + ii]; } electrodeCharges[ii] -= chargeOffset; #else electrodeCharges[ii] -= electrodeCharges[jj] * capacitance[(mm_long) jj * PADDED_PROBLEM_SIZE + ii]; #endif } SYNC_THREADS; } for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { electrodeCharges[ii] *= capacitance[(mm_long) ii * PADDED_PROBLEM_SIZE + ii]; } SYNC_THREADS; #ifdef USE_CHARGE_CONSTRAINT LOCAL volatile real temp[TEMP_SIZE]; real chargeOffset = 0; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { chargeOffset -= electrodeCharges[ii]; } chargeOffset = chargeTarget + reduceValue(chargeOffset, temp); for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { electrodeCharges[ii] += chargeOffset * constraintVector[ii]; } #endif }