Implemented mixed precision mode

18fb6efc · Peter Eastman · 76716f1c · 18fb6efc · 18fb6efc · 18fb6efc
Commit 18fb6efc authored Jul 10, 2012 by Peter Eastman
6 changed files
--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -1977,7 +1977,20 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
        defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
-        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::gbsaObc1, defines);
+        map<string, string> replacements;
+        stringstream defineAccum;
+        if (cu.getAccumulateInDouble()) {
+            defineAccum << "typedef double accum;\n";
+            defineAccum << "typedef double4 accum4;\n";
+            defines["make_accum4"] = "make_double4";
+        }
+        else {
+            defineAccum << "typedef real accum;\n";
+            defineAccum << "typedef real4 accum4;\n";
+            defines["make_accum4"] = "make_real4";
+        }
+        replacements["DEFINE_ACCUM"] = defineAccum.str();
+        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::gbsaObc1, replacements), defines);
        computeBornSumKernel = cu.getKernel(module, "computeBornSum");
        computeSumArgs.push_back(&bornSum->getDevicePointer());
        computeSumArgs.push_back(&cu.getPosq().getDevicePointer());
@@ -2422,7 +2435,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        map<string, string> replacements;
        string n2EnergyStr = n2EnergySource.str();
        replacements["COMPUTE_INTERACTION"] = n2EnergyStr;
-        stringstream extraArgs, atomParams, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
+        stringstream extraArgs, atomParams, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2;
        if (force.getNumGlobalParameters() > 0)
            extraArgs << ", const float* globals";
        pairEnergyUsesParam.resize(params->getBuffers().size(), false);
@@ -2459,15 +2472,13 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        extraArgs << ", unsigned long long* __restrict__ derivBuffers";
        for (int i = 0; i < force.getNumComputedValues(); i++) {
            string index = cu.intToString(i+1);
-            atomParams << "real deriv" << index << ";\n";
+            atomParams << "accum deriv" << index << ";\n";
            clearLocal << "localData[localAtomIndex].deriv" << index << " = 0;\n";
-            declare1 << "real deriv" << index << "_1 = 0;\n";
+            declare1 << "accum deriv" << index << "_1 = 0;\n";
            load2 << "real deriv" << index << "_2 = 0;\n";
            recordDeriv << "localData[atom2].deriv" << index << " += deriv" << index << "_2;\n";
            storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
            storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
-            declareTemps << "__local real tempDerivBuffer" << index << "[64];\n";
-            setTemps << "tempDerivBuffer" << index << "[threadIdx.x] = deriv" << index << "_1;\n";
            atomParamSize++;
        }
        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
@@ -2481,9 +2492,19 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
        replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
-        replacements["DECLARE_TEMP_BUFFERS"] = declareTemps.str();
-        replacements["SET_TEMP_BUFFERS"] = setTemps.str();
        map<string, string> defines;
+        stringstream defineAccum;
+        if (cu.getAccumulateInDouble()) {
+            defineAccum << "typedef double accum;\n";
+            defineAccum << "typedef double3 accum3;\n";
+            defines["make_accum3"] = "make_double3";
+        }
+        else {
+            defineAccum << "typedef real accum;\n";
+            defineAccum << "typedef real3 accum3;\n";
+            defines["make_accum3"] = "make_real3";
+        }
+        replacements["DEFINE_ACCUM"] = defineAccum.str();
        if (useCutoff)
            defines["USE_CUTOFF"] = "1";
        if (usePeriodic)

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
@@ -445,10 +445,22 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
-    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
+    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision() && !context.getAccumulateInDouble())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
        defines["ENABLE_SHUFFLE"] = "1";
+    stringstream defineAccum;
+    if (context.getAccumulateInDouble()) {
+        defineAccum << "typedef double accum;\n";
+        defineAccum << "typedef double3 accum3;\n";
+        defines["make_accum3"] = "make_double3";
+    }
+    else {
+        defineAccum << "typedef real accum;\n";
+        defineAccum << "typedef real3 accum3;\n";
+        defines["make_accum3"] = "make_real3";
+    }
+    replacements["DEFINE_ACCUM"] = defineAccum.str();
    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");

--- a/platforms/cuda2/src/CudaPlatform.cpp
+++ b/platforms/cuda2/src/CudaPlatform.cpp
@@ -88,12 +88,12 @@ CudaPlatform::CudaPlatform() {
 #ifdef _MSC_VER
    char* bindir = getenv("CUDA_BIN_PATH");
 	string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
    int length = GetShortPathName(nvcc.c_str(), NULL, 0);
 	if (length > 0) {
 		vector<char> shortName(length);
 	    GetShortPathName(nvcc.c_str(), &shortName[0], length);
 		nvcc = string(&shortName[0]);
 	}
    setPropertyDefaultValue(CudaCompiler(), nvcc);
    setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TEMP")));
 #else

--- a/platforms/cuda2/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda2/src/kernels/customGBEnergyN2.cu
@@ -2,10 +2,11 @@
 #define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
 #define TILE_SIZE 32
+DEFINE_ACCUM
 typedef struct {
    real4 posq;
-    real3 force;
+    accum3 force;
    ATOM_PARAMETER_DATA
 #ifdef NEED_PADDING
    float padding;
@@ -46,7 +47,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
        const unsigned int tbx = threadIdx.x - tgx;
        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
        unsigned int x, y;
-        real3 force = make_real3(0);
+        accum3 force = make_accum3(0);
        DECLARE_ATOM1_DERIVATIVES
        if (pos < end) {
 #ifdef USE_CUTOFF
@@ -122,7 +123,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    }
                    energy += 0.5f*tempEnergy;
                    delta *= dEdR;
-                    force -= delta;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
 #ifdef USE_CUTOFF
                    }
 #endif
@@ -140,7 +143,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    localData[localAtomIndex].posq = posq[j];
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
-                localData[localAtomIndex].force = make_real3(0);
+                localData[localAtomIndex].force = make_accum3(0);
                CLEAR_LOCAL_DERIVATIVES
 #ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
@@ -185,9 +188,13 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                        }
                        energy += tempEnergy;
                        delta *= dEdR;
-                        force -= delta;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
                        atom2 = tbx+tj;
-                        localData[atom2].force += delta;
+                        localData[atom2].force.x += delta.x;
+                        localData[atom2].force.y += delta.y;
+                        localData[atom2].force.z += delta.z;
                        RECORD_DERIVATIVE_2
 #ifdef USE_CUTOFF
                        }

--- a/platforms/cuda2/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda2/src/kernels/gbsaObc1.cu
@@ -4,6 +4,8 @@
 #define TILE_SIZE 32
 #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
+DEFINE_ACCUM
 /**
 * Reduce the Born sums to compute the Born radii.
 */
@@ -331,7 +333,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
 typedef struct {
    real x, y, z;
    real q;
-    real fx, fy, fz, fw;
+    accum fx, fy, fz, fw;
    real bornRadius;
 } AtomData2;
@@ -372,7 +374,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
        const unsigned int tbx = threadIdx.x - tgx;
        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
        unsigned int x, y;
-        real4 force = make_real4(0);
+        accum4 force = make_accum4(0);
        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {

--- a/platforms/cuda2/src/kernels/nonbonded.cu
+++ b/platforms/cuda2/src/kernels/nonbonded.cu
 #define TILE_SIZE 32
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+DEFINE_ACCUM
 typedef struct {
    real x, y, z;
    real q;
-    real fx, fy, fz;
+    accum fx, fy, fz;
    ATOM_PARAMETER_DATA
 #ifndef PARAMETER_SIZE_IS_EVEN
    real padding;
@@ -47,7 +49,7 @@ extern "C" __global__ void computeNonbonded(
        const unsigned int tbx = threadIdx.x - tgx;
        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
        unsigned int x, y;
-        real3 force = make_real3(0);
+        accum3 force = make_accum3(0);
        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
@@ -124,9 +126,13 @@ extern "C" __global__ void computeNonbonded(
                    COMPUTE_INTERACTION
                    energy += 0.5f*tempEnergy;
 #ifdef USE_SYMMETRIC
-                    force -= delta*dEdR;
+                    force.x -= delta.x*dEdR;
+                    force.y -= delta.y*dEdR;
+                    force.z -= delta.z*dEdR;
 #else
-                    force -= dEdR1;
+                    force.x -= dEdR1.x;
+                    force.y -= dEdR1.y;
+                    force.z -= dEdR1.z;
 #endif
 #ifdef USE_EXCLUSIONS
                    excl >>= 1;
@@ -191,7 +197,9 @@ extern "C" __global__ void computeNonbonded(
 #ifdef ENABLE_SHUFFLE
    #ifdef USE_SYMMETRIC
                                delta *= dEdR;
-                                force -= delta;
+                                force.x -= delta.x;
+                                force.y -= delta.y;
+                                force.z -= delta.z;
                                for (int i = 16; i >= 1; i /= 2) {
                                    delta.x += __shfl_xor(delta.x, i, 32);
                                    delta.y += __shfl_xor(delta.y, i, 32);
@@ -203,7 +211,9 @@ extern "C" __global__ void computeNonbonded(
                                    localData[tbx+j].fz += delta.z;
                                }
    #else
-                                force -= dEdR1;
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
                                for (int i = 16; i >= 1; i /= 2) {
                                    dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
                                    dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
@@ -218,12 +228,16 @@ extern "C" __global__ void computeNonbonded(
 #else
    #ifdef USE_SYMMETRIC
                                delta *= dEdR;
-                                force -= delta;
+                                force.x -= delta.x;
+                                force.y -= delta.y;
+                                force.z -= delta.z;
                                tempBuffer[bufferIndex] = delta.x;
                                tempBuffer[bufferIndex+1] = delta.y;
                                tempBuffer[bufferIndex+2] = delta.z;
    #else
-                                force -= dEdR1;
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
                                tempBuffer[bufferIndex] = dEdR2.x;
                                tempBuffer[bufferIndex+1] = dEdR2.y;
                                tempBuffer[bufferIndex+2] = dEdR2.z;
@@ -287,12 +301,16 @@ extern "C" __global__ void computeNonbonded(
                            energy += tempEnergy;
 #ifdef USE_SYMMETRIC
                            delta *= dEdR;
-                            force -= delta;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
                            localData[tbx+tj].fx += delta.x;
                            localData[tbx+tj].fy += delta.y;
                            localData[tbx+tj].fz += delta.z;
 #else
-                            force -= dEdR1;
+                            force.x -= dEdR1.x;
+                            force.y -= dEdR1.y;
+                            force.z -= dEdR1.z;
                            localData[tbx+tj].fx += dEdR2.x;
                            localData[tbx+tj].fy += dEdR2.y;
                            localData[tbx+tj].fz += dEdR2.z;