#define WARP_SIZE 32 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 #define WARP_SHUFFLE_DOWN(local, offset) __shfl_down_sync(0xffffffff, local, offset) #elif defined(USE_HIP) #define WARP_SHUFFLE_DOWN(local, offset) __shfl_down(local, offset) #endif #ifdef WARP_SHUFFLE_DOWN #define TEMP_SIZE WARP_SIZE #else #define TEMP_SIZE THREAD_BLOCK_SIZE #endif typedef struct { real gradStepSq, qStepGradStep, qStepGrad, q, qStep; } BlockSums1; typedef struct { real projGradSq, precGradStep, precGrad; } BlockSums2; // Sum value from each thread (using temp). Use real type variables (float on // single and mixed precision modes, double on double precision mode). DEVICE real reduceReal(real value, LOCAL_ARG volatile real* temp) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value = lane < warpCount ? temp[lane] : 0; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_THREADS; } #endif return temp[0]; } // Performs the equivalent of reduceReal() on 5 values simultaneously. DEVICE BlockSums1 reduceBlockSums1(BlockSums1 value, LOCAL_ARG BlockSums1* temp) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value.gradStepSq += WARP_SHUFFLE_DOWN(value.gradStepSq, step); value.qStepGradStep += WARP_SHUFFLE_DOWN(value.qStepGradStep, step); value.qStepGrad += WARP_SHUFFLE_DOWN(value.qStepGrad, step); value.q += WARP_SHUFFLE_DOWN(value.q, step); value.qStep += WARP_SHUFFLE_DOWN(value.qStep, step); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value.gradStepSq = value.qStepGradStep = value.qStepGrad = value.q = value.qStep = 0; if (lane < warpCount) { value = temp[lane]; } for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value.gradStepSq += WARP_SHUFFLE_DOWN(value.gradStepSq, step); value.qStepGradStep += WARP_SHUFFLE_DOWN(value.qStepGradStep, step); value.qStepGrad += WARP_SHUFFLE_DOWN(value.qStepGrad, step); value.q += WARP_SHUFFLE_DOWN(value.q, step); value.qStep += WARP_SHUFFLE_DOWN(value.qStep, step); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread].gradStepSq += temp[thread + step].gradStepSq; temp[thread].qStepGradStep += temp[thread + step].qStepGradStep; temp[thread].qStepGrad += temp[thread + step].qStepGrad; temp[thread].q += temp[thread + step].q; temp[thread].qStep += temp[thread + step].qStep; } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread].gradStepSq += temp[thread + step].gradStepSq; temp[thread].qStepGradStep += temp[thread + step].qStepGradStep; temp[thread].qStepGrad += temp[thread + step].qStepGrad; temp[thread].q += temp[thread + step].q; temp[thread].qStep += temp[thread + step].qStep; } SYNC_THREADS; } #endif return temp[0]; } // Performs the equivalent of reduceReal() on 3 values simultaneously. DEVICE BlockSums2 reduceBlockSums2(BlockSums2 value, LOCAL_ARG BlockSums2* temp) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value.projGradSq += WARP_SHUFFLE_DOWN(value.projGradSq, step); value.precGradStep += WARP_SHUFFLE_DOWN(value.precGradStep, step); value.precGrad += WARP_SHUFFLE_DOWN(value.precGrad, step); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value.projGradSq = value.precGradStep = value.precGrad = 0; if (lane < warpCount) { value = temp[lane]; } for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value.projGradSq += WARP_SHUFFLE_DOWN(value.projGradSq, step); value.precGradStep += WARP_SHUFFLE_DOWN(value.precGradStep, step); value.precGrad += WARP_SHUFFLE_DOWN(value.precGrad, step); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread].projGradSq += temp[thread + step].projGradSq; temp[thread].precGradStep += temp[thread + step].precGradStep; temp[thread].precGrad += temp[thread + step].precGrad; } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread].projGradSq += temp[thread + step].projGradSq; temp[thread].precGradStep += temp[thread + step].precGradStep; temp[thread].precGrad += temp[thread + step].precGrad; } SYNC_THREADS; } #endif return temp[0]; } // We need more than single precision for accumulation regardless of the mode // selected, so use double if double precision is supported, and otherwise use // double-float arithmetic. In the latter case, use float2 with .x storing the // "high" part and .y storing the "low" part of each double-float number. #ifdef SUPPORTS_DOUBLE_PRECISION #define ACCUM double #define ACCUM_ZERO 0.0 // Perform accum = accum + real. #define ACCUM_ADD(x, y) ((x) + (ACCUM) (y)) // Perform real = real + accum. #define ACCUM_APPLY(x, y) ((real) ((ACCUM) (x) + (y))) // Perform real = accum * real. #define ACCUM_MUL(x, y) ((real) ((x) * (ACCUM) (y))) // Perform accum = (accum * real) + accum. #define ACCUM_MUL_ADD(x, y, z) (((x) * (ACCUM) (y)) + (z)) // Perform real = (real + accum) * accum. #define ACCUM_ADD_MUL(x, y, z) ((real) (((ACCUM) (x) + (y)) * (z))) // Sum value from each thread (using temp) and return (sum + offset) * scale. DEVICE ACCUM reduceAccum(ACCUM value, LOCAL_ARG volatile ACCUM* temp, real offset, real scale) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value = lane < warpCount ? temp[lane] : 0; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_THREADS; } #endif return (temp[0] + offset) * scale; } #else #define ACCUM float2 #define ACCUM_ZERO make_float2(0.0f, 0.0f) #define ACCUM_ADD(x, y) compensatedAdd2(x, y) #define ACCUM_APPLY(x, y) compensatedAdd1(y, x) #define ACCUM_MUL(x, y) compensatedMultiply1(x, y) #define ACCUM_MUL_ADD(x, y, z) compensatedAdd3(compensatedMultiply2(x, y), z) #define ACCUM_ADD_MUL(x, y, z) compensatedMultiply3(compensatedAdd2(y, x), z) // For details of the compensated addition and multiplication implemented, see // Joldes et al., ACM Trans. Math. Softw. 2017, 44, 15res (DOI: 10.1145/3121432). // float + float -> float2, only valid if the floating-point exponent of a is // not less than that of b. DEVICE inline float2 compensatedAddKernel1(float a, float b) { float s = a + b; return make_float2(s, b - (s - a)); } // float + float -> float2, valid for any inputs. DEVICE inline float2 compensatedAddKernel2(float a, float b) { float s = a + b; float c = s - b; float d = s - c; return make_float2(s, (a - c) + (b - d)); } // float * float -> float2. DEVICE inline float2 compensatedMultiplyKernel(float a, float b) { float c = a * b; return make_float2(c, FMA(a, b, -c)); } // float2 + float -> float. Like compensatedAdd2, but only computes the high // part of the result. DEVICE inline float compensatedAdd1(float2 x, float y) { float2 s = compensatedAddKernel2(x.x, y); return s.x + (x.y + s.y); } // float2 + float -> float2, with a relative error of 2^-47. DEVICE inline float2 compensatedAdd2(float2 x, float y) { float2 s = compensatedAddKernel2(x.x, y); float v = x.y + s.y; return compensatedAddKernel1(s.x, v); } // float2 + float2 -> float2, with a relative error of 2^-46. DEVICE inline float2 compensatedAdd3(float2 x, float2 y) { float2 s = compensatedAddKernel2(x.x, y.x); float2 t = compensatedAddKernel2(x.y, y.y); float c = s.y + t.x; float2 v = compensatedAddKernel1(s.x, c); float w = t.y + v.y; return compensatedAddKernel1(v.x, w); } // float2 * float -> float. Like compensatedMultiply2, but only computes the // high part of the result. DEVICE inline float compensatedMultiply1(float2 x, float y) { float c = x.x * y; return c + (FMA(x.x, y, -c) + x.y * y); } // float2 * float -> float2, with a relative error of 2^-47. DEVICE inline float2 compensatedMultiply2(float2 x, float y) { float c = x.x * y; return compensatedAddKernel1(c, FMA(x.x, y, -c) + x.y * y); } // float2 * float2 -> float. DEVICE inline float compensatedMultiply3(float2 x, float2 y) { float2 c = compensatedMultiplyKernel(x.x, y.x); return c.x + (c.y + FMA(x.y, y.x, FMA(x.x, y.y, x.y * y.y))); } // Sum value from each thread (using temp) and return (sum + offset) * scale. DEVICE ACCUM reduceAccum(ACCUM value, LOCAL_ARG volatile ACCUM* temp, real offset, real scale) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value = compensatedAdd3(value, WARP_SHUFFLE_DOWN(value, step)); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value = lane < warpCount ? temp[lane] : ACCUM_ZERO; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value = compensatedAdd3(value, WARP_SHUFFLE_DOWN(value, step)); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] = compensatedAdd3(temp[thread], temp[thread + step]); } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if(thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] = compensatedAdd3(temp[thread], temp[thread + step]); } SYNC_THREADS; } #endif return compensatedMultiply2(compensatedAdd2(temp[0], offset), scale); } #endif KERNEL void solveInitializeStep1(GLOBAL real* RESTRICT electrodeCharges, GLOBAL real* RESTRICT qLast #ifdef USE_CHARGE_CONSTRAINT , real chargeTarget #endif ) { // This kernel expects to be executed in a single thread block. #ifdef USE_CHARGE_CONSTRAINT LOCAL volatile ACCUM tempAccum[TEMP_SIZE]; #endif // Set initial guess charges as linear extrapolations from the current and // previous charges fed through the solver, and save the current charges as // the previous charges. for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { const real qGuess = electrodeCharges[ii]; electrodeCharges[ii] = 2 * qGuess - qLast[ii]; qLast[ii] = qGuess; } #ifdef USE_CHARGE_CONSTRAINT // Ensure that initial guess charges satisfy the constraint. ACCUM offsetAccum = ACCUM_ZERO; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { offsetAccum = ACCUM_ADD(offsetAccum, -electrodeCharges[ii]); } const ACCUM offset = reduceAccum(offsetAccum, tempAccum, chargeTarget, 1 / (real) NUM_ELECTRODE_PARTICLES); for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { electrodeCharges[ii] = ACCUM_APPLY(electrodeCharges[ii], offset); } #endif } KERNEL void solveInitializeStep2(GLOBAL real* RESTRICT chargeDerivatives, GLOBAL real* RESTRICT grad, GLOBAL real* RESTRICT projGrad, GLOBAL int* RESTRICT convergedResult) { // This kernel expects to be executed in a single thread block. LOCAL volatile real temp[TEMP_SIZE]; #ifdef USE_CHARGE_CONSTRAINT LOCAL volatile ACCUM tempAccum[TEMP_SIZE]; #endif for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { grad[ii] = chargeDerivatives[ii]; } #ifdef USE_CHARGE_CONSTRAINT // Project the initial gradient without preconditioning. ACCUM offsetAccum = ACCUM_ZERO; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { offsetAccum = ACCUM_ADD(offsetAccum, grad[ii]); } const ACCUM offset = reduceAccum(offsetAccum, tempAccum, 0, -1 / (real) NUM_ELECTRODE_PARTICLES); for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { projGrad[ii] = ACCUM_APPLY(grad[ii], offset); } #else for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { projGrad[ii] = grad[ii]; } #endif // Check for convergence at the initial guess charges. real error = 0; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { error += projGrad[ii] * projGrad[ii]; } error = reduceReal(error, temp); if (LOCAL_ID == 0) { convergedResult[0] = (int) (error <= ERROR_TARGET); } } KERNEL void solveInitializeStep3(GLOBAL real* RESTRICT electrodeCharges, GLOBAL real* RESTRICT chargeDerivatives, GLOBAL real* RESTRICT grad, GLOBAL real* RESTRICT projGrad, GLOBAL real* RESTRICT precGrad, GLOBAL real* RESTRICT qStep, GLOBAL real* RESTRICT grad0 #ifdef PRECOND_REQUESTED , GLOBAL ACCUM* RESTRICT precondVector, int precondActivated #endif ) { // This kernel expects to be executed in a single thread block. #if defined(PRECOND_REQUESTED) && defined(USE_CHARGE_CONSTRAINT) LOCAL volatile ACCUM tempAccum[TEMP_SIZE]; #endif for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { grad0[ii] = chargeDerivatives[ii]; } #ifdef PRECOND_REQUESTED // Project the initial gradient with preconditioning. if (precondActivated) { #ifdef USE_CHARGE_CONSTRAINT ACCUM offsetAccum = ACCUM_ZERO; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { offsetAccum = ACCUM_MUL_ADD(precondVector[ii], grad[ii], offsetAccum); } const ACCUM offset = reduceAccum(offsetAccum, tempAccum, 0, -1); for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { precGrad[ii] = ACCUM_ADD_MUL(grad[ii], offset, precondVector[ii]); } #else for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { precGrad[ii] = ACCUM_MUL(precondVector[ii], grad[ii]); } #endif } else { #endif for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { precGrad[ii] = projGrad[ii]; } #ifdef PRECOND_REQUESTED } #endif // Initialize step vector for conjugate gradient iterations. for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { electrodeCharges[ii] = qStep[ii] = -precGrad[ii]; } } KERNEL void solveLoopStep1( GLOBAL real* RESTRICT chargeDerivatives, GLOBAL real* RESTRICT q, GLOBAL real* RESTRICT grad, GLOBAL real* RESTRICT qStep, GLOBAL real* RESTRICT gradStep, GLOBAL real* RESTRICT grad0, GLOBAL BlockSums1* RESTRICT blockSums1Buffer ) { // This kernel can be executed across multiple thread blocks. LOCAL BlockSums1 temp[TEMP_SIZE]; for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { gradStep[ii] = chargeDerivatives[ii] - grad0[ii]; } // Reduce values within each block and store results. BlockSums1 blockSums1 = {0, 0, 0, 0, 0}; for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { blockSums1.gradStepSq += gradStep[ii] * gradStep[ii]; blockSums1.qStepGradStep += qStep[ii] * gradStep[ii]; blockSums1.qStepGrad += qStep[ii] * grad[ii]; blockSums1.q += q[ii]; blockSums1.qStep += qStep[ii]; } blockSums1 = reduceBlockSums1(blockSums1, temp); if (LOCAL_ID == 0) { blockSums1Buffer[GROUP_ID] = blockSums1; } } KERNEL void solveLoopStep2( GLOBAL BlockSums1* RESTRICT blockSums1Buffer, GLOBAL int* RESTRICT convergedResult ) { // This kernel expects to be executed in a single thread block. LOCAL BlockSums1 tempSums[TEMP_SIZE]; // Reduce values from all blocks. BlockSums1 blockSums1 = {0, 0, 0, 0, 0}; for (int ii = LOCAL_ID; ii < THREAD_BLOCK_COUNT; ii += LOCAL_SIZE) { blockSums1.gradStepSq += blockSums1Buffer[ii].gradStepSq; blockSums1.qStepGradStep += blockSums1Buffer[ii].qStepGradStep; blockSums1.qStepGrad += blockSums1Buffer[ii].qStepGrad; blockSums1.q += blockSums1Buffer[ii].q; blockSums1.qStep += blockSums1Buffer[ii].qStep; } blockSums1 = reduceBlockSums1(blockSums1, tempSums); if (LOCAL_ID == 0) { blockSums1Buffer[0] = blockSums1; // If A qStep is small enough, stop to prevent, e.g., division by zero // in the calculation of alpha, or too large step sizes. convergedResult[0] = (int) (blockSums1.gradStepSq <= ERROR_TARGET); } } KERNEL void solveLoopStep3( GLOBAL real* RESTRICT q, GLOBAL real* RESTRICT grad, GLOBAL real* RESTRICT qStep, GLOBAL real* RESTRICT gradStep, GLOBAL BlockSums1* RESTRICT blockSums1Buffer, GLOBAL int* RESTRICT convergedResult #ifdef USE_CHARGE_CONSTRAINT , real chargeTarget #endif ) { // This kernel can be executed across multiple thread blocks. if (convergedResult[0] != 0) { return; } const BlockSums1 blockSums1 = blockSums1Buffer[0]; const real alpha = -blockSums1.qStepGrad / blockSums1.qStepGradStep; // Update the charge vector. for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { q[ii] += alpha * qStep[ii]; } #ifdef USE_CHARGE_CONSTRAINT // Remove any accumulated drift from the charge vector. This would be zero // in exact arithmetic, but error can accumulate over time in finite // precision. const real offset = (chargeTarget - (blockSums1.q + alpha * blockSums1.qStep)) / NUM_ELECTRODE_PARTICLES; for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { q[ii] += offset; } #endif // Update the gradient vector. If on this iteration, the gradient is to be // recomputed, the contents of grad will be overwritten. for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { grad[ii] += alpha * gradStep[ii]; } } KERNEL void solveLoopStep4( GLOBAL real* RESTRICT grad, GLOBAL real* RESTRICT projGrad, GLOBAL real* RESTRICT precGrad, GLOBAL real* RESTRICT gradStep, GLOBAL BlockSums2* RESTRICT blockSums2Buffer, GLOBAL int* RESTRICT convergedResult #ifdef PRECOND_REQUESTED , GLOBAL ACCUM* RESTRICT precondVector, int precondActivated #endif ) { // This kernel expects to be executed in a single thread block. if (convergedResult[0] != 0) { return; } LOCAL volatile real temp[TEMP_SIZE]; LOCAL BlockSums2 tempSums[TEMP_SIZE]; #ifdef USE_CHARGE_CONSTRAINT LOCAL volatile ACCUM tempAccum[TEMP_SIZE]; #endif // Project the current gradient without preconditioning. #ifdef USE_CHARGE_CONSTRAINT ACCUM offsetAccum = ACCUM_ZERO; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { offsetAccum = ACCUM_ADD(offsetAccum, grad[ii]); } ACCUM offset = reduceAccum(offsetAccum, tempAccum, 0, -1 / (real) NUM_ELECTRODE_PARTICLES); for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { projGrad[ii] = ACCUM_APPLY(grad[ii], offset); } #else for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { projGrad[ii] = grad[ii]; } #endif // Project the current gradient with preconditioning. #ifdef PRECOND_REQUESTED if (precondActivated) { #ifdef USE_CHARGE_CONSTRAINT offsetAccum = ACCUM_ZERO; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { offsetAccum = ACCUM_MUL_ADD(precondVector[ii], grad[ii], offsetAccum); } offset = reduceAccum(offsetAccum, tempAccum, 0, -1); for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { precGrad[ii] = ACCUM_ADD_MUL(grad[ii], offset, precondVector[ii]); } #else for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { precGrad[ii] = ACCUM_MUL(precondVector[ii], grad[ii]); } #endif } else { #endif for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { precGrad[ii] = projGrad[ii]; } #ifdef PRECOND_REQUESTED } #endif // Reduce values to be used by all blocks in the final kernel. BlockSums2 blockSums2 = {0, 0, 0}; for (int ii = LOCAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += LOCAL_SIZE) { blockSums2.projGradSq += projGrad[ii] * projGrad[ii]; blockSums2.precGradStep += precGrad[ii] * gradStep[ii]; blockSums2.precGrad += precGrad[ii]; } blockSums2 = reduceBlockSums2(blockSums2, tempSums); if (LOCAL_ID == 0) { blockSums2Buffer[0] = blockSums2; convergedResult[0] = (int) (blockSums2.projGradSq <= ERROR_TARGET); } } KERNEL void solveLoopStep5( GLOBAL real* RESTRICT electrodeCharges, GLOBAL real* RESTRICT precGrad, GLOBAL real* RESTRICT qStep, GLOBAL BlockSums1* RESTRICT blockSums1Buffer, GLOBAL BlockSums2* RESTRICT blockSums2Buffer, GLOBAL int* RESTRICT convergedResult ) { // This kernel can be executed across multiple thread blocks. if (convergedResult[0] != 0) { return; } const BlockSums1 blockSums1 = blockSums1Buffer[0]; const BlockSums2 blockSums2 = blockSums2Buffer[0]; // Evaluate the conjugate gradient parameter beta. const real beta = blockSums2.precGradStep / blockSums1.qStepGradStep; // Update the step vector. for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { qStep[ii] = beta * qStep[ii] - precGrad[ii]; } #ifdef USE_CHARGE_CONSTRAINT // Project out any deviation off of the constraint plane from the step // vector. This would be zero in exact arithmetic, but error can accumulate // over time in finite precision. const real offset = (beta * blockSums1.qStep - blockSums2.precGrad) / NUM_ELECTRODE_PARTICLES; for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { qStep[ii] -= offset; } #endif // Prepare for the next derivative calculation. for (int ii = GLOBAL_ID; ii < NUM_ELECTRODE_PARTICLES; ii += GLOBAL_SIZE) { electrodeCharges[ii] = qStep[ii]; } }