"wrappers/python/src/vscode:/vscode.git/clone" did not exist on "16e2b1e32fbfdbe343171f1adaa44bcb8df41a0e"
Commit f3e727df authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #1906 from peastman/volta

Support for Volta
parents 74ad9a16 e28c6947
...@@ -236,12 +236,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -236,12 +236,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
minor = 3; minor = 3;
} }
} }
if (major == 7) {
// Don't generate Volta-specific code until we've made the changes needed
// to support it properly.
major = 6;
minor = 0;
}
gpuArchitecture = intToString(major)+intToString(minor); gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor; computeCapability = major+0.1*minor;
...@@ -263,6 +257,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -263,6 +257,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int multiprocessors; int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
if (cudaDriverVersion >= 9000) {
compilationDefines["SYNC_WARPS"] = "__syncwarp();";
compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);";
compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);";
}
else {
compilationDefines["SYNC_WARPS"] = "";
compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
compilationDefines["BALLOT(var)"] = "__ballot(var);";
}
if (useDoublePrecision) { if (useDoublePrecision) {
posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq"); posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm"); velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
......
...@@ -225,7 +225,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi ...@@ -225,7 +225,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
// Loop over any blocks we identified as potentially containing neighbors. // Loop over any blocks we identified as potentially containing neighbors.
int includeBlockFlags = __ballot(includeBlock2); int includeBlockFlags = BALLOT(includeBlock2);
while (includeBlockFlags != 0) { while (includeBlockFlags != 0) {
int i = __ffs(includeBlockFlags)-1; int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1; includeBlockFlags &= includeBlockFlags-1;
......
...@@ -115,7 +115,7 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign ...@@ -115,7 +115,7 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
int atom = atoms[i]; int atom = atoms[i];
int flag = flags[i]; int flag = flags[i];
bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS); bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
int includeFlags = __ballot(include); int includeFlags = BALLOT(include);
if (include) { if (include) {
int index = numCompacted+__popc(includeFlags&warpMask); int index = numCompacted+__popc(includeFlags&warpMask);
atoms[index] = atom; atoms[index] = atom;
...@@ -271,7 +271,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -271,7 +271,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Loop over any blocks we identified as potentially containing neighbors. // Loop over any blocks we identified as potentially containing neighbors.
int includeBlockFlags = __ballot(includeBlock2); int includeBlockFlags = BALLOT(includeBlock2);
while (includeBlockFlags != 0) { while (includeBlockFlags != 0) {
int i = __ffs(includeBlockFlags)-1; int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1; includeBlockFlags &= includeBlockFlags-1;
...@@ -291,7 +291,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -291,7 +291,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(atomDelta) APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif #endif
int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w)); int atomFlags = BALLOT(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
int interacts = 0; int interacts = 0;
if (atom2 < NUM_ATOMS && atomFlags != 0) { if (atom2 < NUM_ATOMS && atomFlags != 0) {
int first = __ffs(atomFlags)-1; int first = __ffs(atomFlags)-1;
...@@ -317,7 +317,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea ...@@ -317,7 +317,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer. // Add any interacting atoms to the buffer.
int includeAtomFlags = __ballot(interacts); int includeAtomFlags = BALLOT(interacts);
if (interacts) { if (interacts) {
int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask); int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
buffer[index] = atom2; buffer[index] = atom2;
......
...@@ -15,22 +15,22 @@ typedef struct { ...@@ -15,22 +15,22 @@ typedef struct {
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
//support for 64 bit shuffles //support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) { static __inline__ __device__ float real_shfl(float var, int srcLane) {
return __shfl(var, srcLane); return SHFL(var, srcLane);
} }
static __inline__ __device__ double real_shfl(double var, int srcLane) { static __inline__ __device__ double real_shfl(double var, int srcLane) {
int hi, lo; int hi, lo;
asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
hi = __shfl(hi, srcLane); hi = SHFL(hi, srcLane);
lo = __shfl(lo, srcLane); lo = SHFL(lo, srcLane);
return __hiloint2double( hi, lo ); return __hiloint2double( hi, lo );
} }
static __inline__ __device__ long long real_shfl(long long var, int srcLane) { static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
int hi, lo; int hi, lo;
asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
hi = __shfl(hi, srcLane); hi = SHFL(hi, srcLane);
lo = __shfl(lo, srcLane); lo = SHFL(lo, srcLane);
// unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast // unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast
int2 fuse; fuse.x = lo; fuse.y = hi; int2 fuse; fuse.x = lo; fuse.y = hi;
return *reinterpret_cast<long long*>(&fuse); return *reinterpret_cast<long long*>(&fuse);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment