Merge pull request #1906 from peastman/volta

Support for Volta

Merge pull request #1906 from peastman/volta
Support for Volta
f3e727df · peastman · GitHub · 74ad9a16 · e28c6947 · f3e727df
Commit f3e727df authored Sep 25, 2017 by peastman Committed by GitHub Sep 25, 2017
4 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -236,12 +236,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            minor = 3;
        }
    }
-    if (major == 7) {
-        // Don't generate Volta-specific code until we've made the changes needed
-        // to support it properly.
-        major = 6;
-        minor = 0;
-    }
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
@@ -263,6 +257,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
+    if (cudaDriverVersion >= 9000) {
+        compilationDefines["SYNC_WARPS"] = "__syncwarp();";
+        compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);";
+        compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);";
+    }
+    else {
+        compilationDefines["SYNC_WARPS"] = "";
+        compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
+        compilationDefines["BALLOT(var)"] = "__ballot(var);";
+    }
    if (useDoublePrecision) {
        posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -225,7 +225,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
            // Loop over any blocks we identified as potentially containing neighbors.
-            int includeBlockFlags = __ballot(includeBlock2);
+            int includeBlockFlags = BALLOT(includeBlock2);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;

--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
@@ -115,7 +115,7 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
        int atom = atoms[i];
        int flag = flags[i];
        bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
-        int includeFlags = __ballot(include);
+        int includeFlags = BALLOT(include);
        if (include) {
            int index = numCompacted+__popc(includeFlags&warpMask);
            atoms[index] = atom;
@@ -271,7 +271,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
            // Loop over any blocks we identified as potentially containing neighbors.
-            int includeBlockFlags = __ballot(includeBlock2);
+            int includeBlockFlags = BALLOT(includeBlock2);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
@@ -291,7 +291,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(atomDelta)
 #endif
-                int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
+                int atomFlags = BALLOT(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
                int interacts = 0;
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
                    int first = __ffs(atomFlags)-1;
@@ -317,7 +317,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                // Add any interacting atoms to the buffer.
-                int includeAtomFlags = __ballot(interacts);
+                int includeAtomFlags = BALLOT(interacts);
                if (interacts) {
                    int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
                    buffer[index] = atom2;

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -15,22 +15,22 @@ typedef struct {
 #ifdef ENABLE_SHUFFLE
 //support for 64 bit shuffles
 static __inline__ __device__ float real_shfl(float var, int srcLane) {
-    return __shfl(var, srcLane);
+    return SHFL(var, srcLane);
 }
 static __inline__ __device__ double real_shfl(double var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
-    hi = __shfl(hi, srcLane);
+    hi = SHFL(hi, srcLane);
-    lo = __shfl(lo, srcLane);
+    lo = SHFL(lo, srcLane);
    return __hiloint2double( hi, lo );
 }
 static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
-    hi = __shfl(hi, srcLane);
+    hi = SHFL(hi, srcLane);
-    lo = __shfl(lo, srcLane);
+    lo = SHFL(lo, srcLane);
    // unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast
    int2 fuse; fuse.x = lo; fuse.y = hi;
    return *reinterpret_cast<long long*>(&fuse);