#define LS_DOT_START 0 #define LS_DOT 1 #define LS_STEP 2 #define LS_FAIL 0 #define LS_SUCCEED 1 #define LS_CONTINUE 2 #define WARP_SIZE 32 #ifdef USE_MIXED_PRECISION // When mixed is double precision, use double precision sqrt and fabs to // avoid overflow. #define SQRT_MIXED sqrt #define FABS_MIXED fabs #else #define SQRT_MIXED SQRT #define FABS_MIXED FABS #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 #define WARP_SHUFFLE_DOWN(local, offset) __shfl_down_sync(0xffffffff, local, offset) #elif defined(USE_HIP) #define WARP_SHUFFLE_DOWN(local, offset) __shfl_down(local, offset) #endif #ifdef WARP_SHUFFLE_DOWN #define TEMP_SIZE WARP_SIZE #else #define TEMP_SIZE THREAD_BLOCK_SIZE #endif DEVICE void resetLineSearchData(GLOBAL mixed* RESTRICT lineSearchData) { if (GLOBAL_ID == 0) { lineSearchData[LS_DOT_START] = 0; lineSearchData[LS_STEP] = 1; } } DEVICE mixed reduceAdd(mixed value, LOCAL_ARG volatile mixed* temp) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value = lane < warpCount ? temp[lane] : 0; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value += WARP_SHUFFLE_DOWN(value, step); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] += temp[thread + step]; } SYNC_THREADS; } #endif return temp[0]; } DEVICE mixed reduceMax(mixed value, LOCAL_ARG volatile mixed* temp) { const int thread = LOCAL_ID; SYNC_THREADS; #ifdef WARP_SHUFFLE_DOWN const int warpCount = LOCAL_SIZE / WARP_SIZE; const int warp = thread / WARP_SIZE; const int lane = thread % WARP_SIZE; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value = max(value, WARP_SHUFFLE_DOWN(value, step)); } if (!lane) { temp[warp] = value; } SYNC_THREADS; if (!warp) { value = lane < warpCount ? temp[lane] : 0; for (int step = WARP_SIZE / 2; step > 0; step >>= 1) { value = max(value, WARP_SHUFFLE_DOWN(value, step)); } if (!lane) { temp[0] = value; } } SYNC_THREADS; #else temp[thread] = value; SYNC_THREADS; for (int step = 1; step < WARP_SIZE / 2; step <<= 1) { if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] = max(temp[thread], temp[thread + step]); } SYNC_WARPS; } for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) { if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) { temp[thread] = max(temp[thread], temp[thread + step]); } SYNC_THREADS; } #endif return temp[0]; } DEVICE void atomicAddMixed(GLOBAL mixed* RESTRICT target, const mixed value) { #if defined(__CUDA_ARCH__) || defined(USE_HIP) // Do atomic addition of floating-point values directly atomicAdd(target, value); #elif defined(__OPENCL_VERSION__) #if defined(USE_MIXED_PRECISION) || defined(USE_DOUBLE_PRECISION) // Do 64-bit atomic compare/exchange (requires cl_khr_int64_base_atomics; // this will be checked on the host before trying to compile this kernel) unsigned long old = as_ulong(*target); unsigned long check; do { check = old; old = atom_cmpxchg((GLOBAL unsigned long*)target, check, as_ulong(value + as_double(check))); } while(check != old); #else // Do 32-bit atomic compare/exchange unsigned int old = as_uint(*target); unsigned int check; do { check = old; old = atomic_cmpxchg((GLOBAL unsigned int*)target, check, as_uint(value + as_float(check))); } while(check != old); #endif #else #error "Internal error: atomicAddMixed is missing an implementation for this platform" #endif } KERNEL void recordInitialPos( GLOBAL const real4* RESTRICT posq, GLOBAL const int* RESTRICT order, GLOBAL mixed* RESTRICT xInit, GLOBAL mixed* RESTRICT x, const int numParticles #ifdef USE_MIXED_PRECISION , GLOBAL const real4* RESTRICT posqCorrection #endif ) { for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) { int offset = 3 * order[i]; #ifdef USE_MIXED_PRECISION real4 pos1 = posq[i]; real4 pos2 = posqCorrection[i]; mixed3 pos = make_mixed3(pos1.x + (mixed) pos2.x, pos1.y + (mixed) pos2.y, pos1.z + (mixed) pos2.z); #else real4 pos = posq[i]; #endif xInit[offset] = x[offset] = pos.x; xInit[offset + 1] = x[offset + 1] = pos.y; xInit[offset + 2] = x[offset + 2] = pos.z; } } KERNEL void restorePos( GLOBAL real4* RESTRICT posq, GLOBAL const int* RESTRICT order, GLOBAL const mixed* RESTRICT x, GLOBAL mixed* RESTRICT returnValue, const int numParticles #ifdef USE_MIXED_PRECISION , GLOBAL real4* RESTRICT posqCorrection #endif ) { for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) { int offset = 3 * order[i]; posq[i].x = x[offset]; posq[i].y = x[offset + 1]; posq[i].z = x[offset + 2]; #ifdef USE_MIXED_PRECISION posqCorrection[i].x = x[offset] - (real) x[offset]; posqCorrection[i].y = x[offset + 1] - (real) x[offset + 1]; posqCorrection[i].z = x[offset + 2] - (real) x[offset + 2]; #endif } // Reset in case we will accumulate constraint energies in returnValue. if (GLOBAL_ID == 0) { *returnValue = 0; } } KERNEL void convertForces( GLOBAL const mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT forceBuffer, GLOBAL const int* RESTRICT order, GLOBAL mixed* RESTRICT grad, GLOBAL mixed* RESTRICT returnValue, const int numParticles, const int numPadded ) { const mm_long limit = 0x4000000000000000; const mixed scale = -1 / (mixed) 0x100000000; for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) { int offset = 3 * order[i]; if (velm[i].w == 0) { grad[offset] = 0; grad[offset + 1] = 0; grad[offset + 2] = 0; } else { mm_long fx = forceBuffer[i]; mm_long fy = forceBuffer[i + numPadded]; mm_long fz = forceBuffer[i + 2 * numPadded]; if (fx < -limit || fx > limit || fy < -limit || fy > limit || fz < -limit || fz > limit) { *returnValue = FLT_MAX; } grad[offset] = scale * fx; grad[offset + 1] = scale * fy; grad[offset + 2] = scale * fz; } } } KERNEL void getConstraintEnergyForces( GLOBAL mm_ulong* RESTRICT forceBuffer, GLOBAL const int* RESTRICT order, GLOBAL const int2* RESTRICT constraintIndices, GLOBAL const mixed* RESTRICT constraintDistances, GLOBAL const mixed* RESTRICT x, GLOBAL mixed* RESTRICT returnValue, const int numPadded, const int numConstraints, const mixed kRestraint ) { LOCAL volatile mixed temp[TEMP_SIZE]; const mixed scale = 0x100000000; mixed energy = 0; for (int i = GLOBAL_ID; i < numConstraints; i += GLOBAL_SIZE) { int2 indices = constraintIndices[i]; mixed distance = constraintDistances[i]; int offset1 = 3 * order[indices.x]; int offset2 = 3 * order[indices.y]; mixed3 delta = make_mixed3(x[offset2] - x[offset1], x[offset2 + 1] - x[offset1 + 1], x[offset2 + 2] - x[offset1 + 2]); mixed r2 = dot(delta, delta); mixed r = SQRT_MIXED(r2); delta *= 1 / r; mixed dr = r - distance; mixed kdr = kRestraint * dr; energy += (mixed) 0.5 * kdr * dr; mm_long fx = (mm_long) (kdr * scale * delta.x); mm_long fy = (mm_long) (kdr * scale * delta.y); mm_long fz = (mm_long) (kdr * scale * delta.z); ATOMIC_ADD(&forceBuffer[indices.x], (mm_ulong) fx); ATOMIC_ADD(&forceBuffer[indices.x + numPadded], (mm_ulong) fy); ATOMIC_ADD(&forceBuffer[indices.x + 2 * numPadded], (mm_ulong) fz); ATOMIC_ADD(&forceBuffer[indices.y], (mm_ulong) -fx); ATOMIC_ADD(&forceBuffer[indices.y + numPadded], (mm_ulong) -fy); ATOMIC_ADD(&forceBuffer[indices.y + 2 * numPadded], (mm_ulong) -fz); } energy = reduceAdd(energy, temp); if (LOCAL_ID == 0) { atomicAddMixed(returnValue, energy); } } KERNEL void getConstraintError( GLOBAL const int2* RESTRICT constraintIndices, GLOBAL const mixed* RESTRICT constraintDistances, GLOBAL const mixed* RESTRICT x, GLOBAL mixed* RESTRICT returnValue, const int numConstraints ) { // This kernel is expected to be executed in a single thread block. LOCAL volatile mixed temp[TEMP_SIZE]; mixed maxError = 0; for (int i = LOCAL_ID; i < numConstraints; i += LOCAL_SIZE) { int2 indices = constraintIndices[i]; mixed distance = constraintDistances[i]; mixed3 delta = make_mixed3(x[3 * indices.y] - x[3 * indices.x], x[3 * indices.y + 1] - x[3 * indices.x + 1], x[3 * indices.y + 2] - x[3 * indices.x + 2]); mixed r = SQRT_MIXED(dot(delta, delta)); maxError = max(maxError, FABS_MIXED(r - distance) / distance); } maxError = reduceMax(maxError, temp); if (LOCAL_ID == 0) { *returnValue = maxError; } } KERNEL void initializeDir( GLOBAL const mixed* RESTRICT grad, GLOBAL mixed* RESTRICT dir, GLOBAL const mixed* RESTRICT gradNorm, GLOBAL mixed* RESTRICT lineSearchData, const int numVariables ) { const real scale = -1 / *gradNorm; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { dir[i] = scale * grad[i]; } // Prepare for the first line search of the optimization. resetLineSearchData(lineSearchData); } KERNEL void gradNorm( GLOBAL const mixed* RESTRICT grad, GLOBAL mixed* RESTRICT gradNorm, const int numVariables ) { // This kernel is expected to be executed in a single thread block. // This is a slow path for when the gradient could overflow. LOCAL volatile mixed temp[TEMP_SIZE]; mixed gradScale = 1; for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) { gradScale = max(gradScale, FABS_MIXED(grad[i])); } gradScale = reduceMax(gradScale, temp); mixed gradScaleInv = 1 / gradScale; mixed scaledNorm = 0; for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) { mixed gradScaled = gradScaleInv * grad[i]; scaledNorm += gradScaled * gradScaled; } scaledNorm = reduceAdd(scaledNorm, temp); if (LOCAL_ID == 0) { *gradNorm = gradScale * SQRT_MIXED(scaledNorm); } } KERNEL void getDiff( GLOBAL const mixed* RESTRICT x, GLOBAL const mixed* RESTRICT xPrev, GLOBAL const mixed* RESTRICT grad, GLOBAL const mixed* RESTRICT gradPrev, GLOBAL mixed* RESTRICT scale, GLOBAL mixed* RESTRICT xDiff, GLOBAL mixed* RESTRICT gradDiff, GLOBAL int* RESTRICT returnFlag, GLOBAL mixed* RESTRICT returnValue, GLOBAL const mixed* RESTRICT gradNorm, const int numVariables, const mixed tolerance, const int end, const int largeGrad ) { LOCAL volatile mixed temp[TEMP_SIZE]; // If the convergence condition is satisfied, update returnFlag. After this // kernel has run, it is safe to start downloading returnFlag. bool converged; if (largeGrad) { // The slow path for overflow was run: gradNorm is the norm. converged = (*gradNorm <= tolerance); } else { // The fast path using atomics was run: gradNorm is the square norm. converged = (*gradNorm <= tolerance * tolerance); } if (GLOBAL_ID == 0) { *returnFlag = (int) converged; } if (converged) { return; } const int endOffset = numVariables * end; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { xDiff[endOffset + i] = x[i] - xPrev[i]; gradDiff[endOffset + i] = grad[i] - gradPrev[i]; } if (GLOBAL_ID == 0 && !largeGrad) { // The next kernel will use atomics to do its reduction, so clear the // destination variables first. scale[end] = 0; *returnValue = 0; } } KERNEL void getScale( GLOBAL mixed* RESTRICT alpha, GLOBAL mixed* RESTRICT scale, GLOBAL const mixed* RESTRICT xDiff, GLOBAL const mixed* RESTRICT gradDiff, GLOBAL const int* RESTRICT returnFlag, GLOBAL mixed* RESTRICT returnValue, const int numVariables, const int end, const int largeGrad ) { // This kernel is expected to be executed in a single thread block if // largeGrad is non-zero (if we are using the fallback for large values). LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag) { return; } const int endOffset = numVariables * end; if (largeGrad) { mixed gradScale = 1; for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) { gradScale = max(gradScale, FABS_MIXED(gradDiff[endOffset + i])); } gradScale = reduceMax(gradScale, temp); mixed gradScaleInv = 1 / gradScale; mixed xGradScaled = 0; mixed gradGradScaled = 0; for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) { mixed gradDiffScaled = gradScaleInv * gradDiff[endOffset + i]; xGradScaled += xDiff[endOffset + i] * gradDiffScaled; gradGradScaled += gradDiffScaled * gradDiffScaled; } xGradScaled = reduceAdd(xGradScaled, temp); gradGradScaled = reduceAdd(gradGradScaled, temp); if (LOCAL_ID == 0) { scale[end] = xGradScaled * gradScale; *returnValue = xGradScaled * gradScaleInv / gradGradScaled; } } else { mixed xGrad = 0; mixed gradGrad = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { xGrad += xDiff[endOffset + i] * gradDiff[endOffset + i]; gradGrad += gradDiff[endOffset + i] * gradDiff[endOffset + i]; } xGrad = reduceAdd(xGrad, temp); gradGrad = reduceAdd(gradGrad, temp); if (LOCAL_ID == 0) { atomicAddMixed(&scale[end], xGrad); atomicAddMixed(returnValue, gradGrad); } } // Clear all values of alpha (plus the extra slot at alpha[NUM_VECTORS]) // to prepare them for use in reductions by subsequent kernels. for (int i = LOCAL_ID; i <= NUM_VECTORS; i += LOCAL_SIZE) { alpha[i] = 0; } } KERNEL void reinitializeDir( GLOBAL const mixed* RESTRICT grad, GLOBAL mixed* RESTRICT dir, GLOBAL mixed* RESTRICT alpha, GLOBAL const mixed* RESTRICT scale, GLOBAL const mixed* RESTRICT xDiff, GLOBAL const int* RESTRICT returnFlag, GLOBAL mixed* RESTRICT returnValue, const int numVariables, const int vectorIndex, const int largeGrad ) { LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag) { return; } if (GLOBAL_ID == 0 && !largeGrad) { // The last kernel used atomics to do its reduction. returnValue has // gradGrad, so finish up by replacing it with xGrad / gradGrad. *returnValue = scale[vectorIndex] / *returnValue; } const int indexOffset = numVariables * vectorIndex; mixed vectorAlpha = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { dir[i] = -grad[i]; vectorAlpha -= grad[i] * xDiff[indexOffset + i]; } vectorAlpha = reduceAdd(vectorAlpha, temp); if (LOCAL_ID == 0) { atomicAddMixed(&alpha[vectorIndex], vectorAlpha / scale[vectorIndex]); } } KERNEL void updateDirAlpha( GLOBAL mixed* dir, GLOBAL mixed* RESTRICT alpha, GLOBAL const mixed* RESTRICT scale, GLOBAL const mixed* RESTRICT xDiff, GLOBAL const mixed* RESTRICT gradDiff, GLOBAL const int* RESTRICT returnFlag, const int numVariables, const int vectorIndex1 ) { LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag) { return; } const int vectorIndex2 = (vectorIndex1 ? vectorIndex1 : NUM_VECTORS) - 1; const int indexOffset1 = numVariables * vectorIndex1; const int indexOffset2 = numVariables * vectorIndex2; const mixed vectorScale = alpha[vectorIndex1]; mixed vectorAlpha = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { dir[i] -= vectorScale * gradDiff[indexOffset1 + i]; vectorAlpha += dir[i] * xDiff[indexOffset2 + i]; } vectorAlpha = reduceAdd(vectorAlpha, temp); if (LOCAL_ID == 0) { atomicAddMixed(&alpha[vectorIndex2], vectorAlpha / scale[vectorIndex2]); } } KERNEL void scaleDir( GLOBAL mixed* dir, GLOBAL mixed* RESTRICT alpha, GLOBAL const mixed* RESTRICT scale, GLOBAL const mixed* RESTRICT gradDiff, GLOBAL const int* RESTRICT returnFlag, GLOBAL const mixed* RESTRICT returnValue, const int numVariables, const int vectorIndex ) { LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag) { return; } const int indexOffset = numVariables * vectorIndex; const mixed innerScale = alpha[vectorIndex]; const mixed outerScale = *returnValue; mixed vectorBeta = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { dir[i] = (dir[i] - innerScale * gradDiff[indexOffset + i]) * outerScale; vectorBeta += dir[i] * gradDiff[indexOffset + i]; } vectorBeta = reduceAdd(vectorBeta, temp); if (LOCAL_ID == 0) { // Store the result in an extra slot alpha[NUM_VECTORS] instead of using // alpha[vectorIndex] since other blocks might still need to read it. atomicAddMixed(&alpha[NUM_VECTORS], (GLOBAL_ID == 0 ? innerScale : 0) - vectorBeta / scale[vectorIndex]); } } KERNEL void updateDirBeta( GLOBAL mixed* dir, GLOBAL mixed* RESTRICT alpha, GLOBAL const mixed* RESTRICT scale, GLOBAL const mixed* RESTRICT xDiff, GLOBAL const mixed* RESTRICT gradDiff, GLOBAL const int* RESTRICT returnFlag, const int numVariables, const int vectorIndex1, const int vectorIndexAlpha ) { LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag) { return; } const int vectorIndex2 = vectorIndex1 == NUM_VECTORS - 1 ? 0 : vectorIndex1 + 1; const int indexOffset1 = numVariables * vectorIndex1; const int indexOffset2 = numVariables * vectorIndex2; const mixed vectorScale = alpha[vectorIndexAlpha]; mixed vectorBeta = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { dir[i] += vectorScale * xDiff[indexOffset1 + i]; vectorBeta += dir[i] * gradDiff[indexOffset2 + i]; } vectorBeta = reduceAdd(vectorBeta, temp); if (LOCAL_ID == 0) { atomicAddMixed(&alpha[vectorIndex2], -vectorBeta / scale[vectorIndex2]); } } KERNEL void updateDirFinal( GLOBAL mixed* dir, GLOBAL const mixed* RESTRICT alpha, GLOBAL const mixed* RESTRICT xDiff, GLOBAL const int* RESTRICT returnFlag, GLOBAL mixed* RESTRICT lineSearchData, const int numVariables, const int vectorIndex, const int vectorIndexAlpha ) { LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag) { return; } const int indexOffset = numVariables * vectorIndex; const mixed vectorScale = alpha[vectorIndexAlpha]; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { dir[i] += vectorScale * xDiff[indexOffset + i]; } // Prepare for a line search in the next iteration. resetLineSearchData(lineSearchData); } KERNEL void lineSearchSetup( GLOBAL const mixed* RESTRICT x, GLOBAL mixed* RESTRICT xPrev, GLOBAL const mixed* RESTRICT grad, GLOBAL mixed* RESTRICT gradPrev, GLOBAL const mixed* RESTRICT dir, GLOBAL int* RESTRICT returnFlag, GLOBAL mixed* RESTRICT gradNorm, GLOBAL mixed* RESTRICT lineSearchData, const int numVariables ) { LOCAL volatile mixed temp[TEMP_SIZE]; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { xPrev[i] = x[i]; } for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { gradPrev[i] = grad[i]; } mixed result = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { result += grad[i] * dir[i]; } result = reduceAdd(result, temp); if (LOCAL_ID == 0) { atomicAddMixed(&lineSearchData[LS_DOT_START], result); } if (GLOBAL_ID == 0) { *returnFlag = LS_CONTINUE; *gradNorm = 0; } } KERNEL void lineSearchStep( GLOBAL mixed* RESTRICT x, GLOBAL const mixed* RESTRICT xPrev, GLOBAL mixed* RESTRICT grad, GLOBAL const mixed* RESTRICT gradPrev, GLOBAL const mixed* RESTRICT dir, GLOBAL int* RESTRICT returnFlag, GLOBAL mixed* RESTRICT gradNorm, GLOBAL mixed* RESTRICT lineSearchData, GLOBAL mixed* RESTRICT lineSearchDataBackup, const int numVariables ) { LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag == LS_FAIL) { return; } if (*returnFlag == LS_SUCCEED) { // The strong Wolfe condition was satisfied on the last iteration. // Instead of taking another step, start evaluating the gradient. If // we evaluated on the CPU, it might overflow, but we will throw out // this result and recompute it in that case. mixed norm = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { norm += grad[i] * grad[i]; } norm = reduceAdd(norm, temp); if (LOCAL_ID == 0) { atomicAddMixed(gradNorm, norm); } return; } // We are either getting started or going around for another normal line // search iteration, so check to see whether the initial search direction is // bad, or the updated step is bad. if (lineSearchData[LS_DOT_START] > 0 || lineSearchData[LS_STEP] < LBFGS_MIN_STEP || lineSearchData[LS_STEP] > LBFGS_MAX_STEP) { if (GLOBAL_ID == 0) { *returnFlag = LS_FAIL; } return; } const mixed step = lineSearchData[LS_STEP]; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { x[i] = xPrev[i] + step * dir[i]; } if (GLOBAL_ID == 0) { // Back up the line search data in case we need to restore it after // reevaluating energy and forces on the CPU, and reset LS_DOT. lineSearchDataBackup[LS_DOT_START] = lineSearchData[LS_DOT_START]; lineSearchDataBackup[LS_DOT] = lineSearchData[LS_DOT] = 0; lineSearchDataBackup[LS_STEP] = lineSearchData[LS_STEP]; } } KERNEL void lineSearchDot( GLOBAL const mixed* RESTRICT grad, GLOBAL const mixed* RESTRICT dir, GLOBAL mixed* RESTRICT lineSearchData, GLOBAL int* RESTRICT returnFlag, GLOBAL const mixed* RESTRICT returnValue, const int numVariables, mixed deltaEnergy ) { LOCAL volatile mixed temp[TEMP_SIZE]; if (*returnFlag == LS_FAIL) { return; } // Any restraint energy in returnValue hasn't been downloaded yet to be // passed back up in the energy parameter, so add it in here. deltaEnergy += *returnValue; // The energy may be such that we don't need to do a dot product and can // immediately decide to scale the step, so mark this case with LS_SUCCEED. // This will be checked in the following kernel. if (!(FABS_MIXED(deltaEnergy) < FLT_MAX) || deltaEnergy > lineSearchData[LS_STEP] * LBFGS_FTOL * lineSearchData[LS_DOT_START]) { if (GLOBAL_ID == 0) { *returnFlag = LS_SUCCEED; } return; } mixed result = 0; for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) { result += grad[i] * dir[i]; } result = reduceAdd(result, temp); if (LOCAL_ID == 0) { atomicAddMixed(&lineSearchData[LS_DOT], result); } } KERNEL void lineSearchContinue( GLOBAL int* RESTRICT returnFlag, GLOBAL mixed* RESTRICT gradNorm, GLOBAL mixed* RESTRICT lineSearchData ) { // This kernel should run a single thread and just lets us update the step // and decide how to continue the line search without returning to the CPU. if (GLOBAL_ID != 0 || *returnFlag == LS_FAIL) { return; } *gradNorm = 0; // If the last kernel set LS_SUCCEED, the energy was out of range and we // should scale the step back and go around again. if (*returnFlag == LS_SUCCEED) { lineSearchData[LS_STEP] *= LBFGS_SCALE_DOWN; *returnFlag = LS_CONTINUE; return; } // If we are here, we should be reading LS_CONTINUE that was set by // lineSearchSetup, so either scale the step or set LS_SUCCEED if the line // search is successful and we should exit. if (lineSearchData[LS_DOT] < LBFGS_WOLFE * lineSearchData[LS_DOT_START]) { lineSearchData[LS_STEP] *= LBFGS_SCALE_UP; } else if(lineSearchData[LS_DOT] > -LBFGS_WOLFE * lineSearchData[LS_DOT_START]) { lineSearchData[LS_STEP] *= LBFGS_SCALE_DOWN; } else { *returnFlag = LS_SUCCEED; } }