Fixed errors when running on compute capability 1.1 devices

0aca702a · Peter Eastman · c9b1338b
Commit 0aca702a authored Aug 04, 2010 by Peter Eastman
5 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -1190,7 +1190,14 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
    kClearFields_3( amoebaGpu, 6 );
    if( threadsPerBlock == 0 ){
-        threadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle));
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 192;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 96;
+        else
+            maxThreads = 32;
+        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)), maxThreads);
    }   
    if( amoebaGpu->log && timestep == 1 ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -574,7 +574,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
    // set threads/block first time through
    if( threadsPerBlock == 0 ){
-        threadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle));
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 256;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
    }
    if (gpu->bOutputBufferPerWarp){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -176,7 +176,7 @@ __device__ void calculateVdw14_7PairIxn_kernel( float4 atomCoordinatesI, float4
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
+#elif (__CUDA_ARCH__ >= 120)
 __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
@@ -270,7 +270,14 @@ void kCalculateAmoebaVdw14_7Reduction_kernel( float* inputForce, float4* outputF
 static void kCalculateAmoebaVdw14_7Reduction(amoebaGpuContext amoebaGpu, CUDAStream<float>* vdwOutputArray, CUDAStream<float4>* forceOutputArray )
 {
-    kCalculateAmoebaVdw14_7Reduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, 384>>>(
+    unsigned int threadsPerBlock;
+    if (amoebaGpu->gpuContext->sm_version >= SM_20)
+        threadsPerBlock = GF1XX_NONBOND_THREADS_PER_BLOCK;
+    else if (amoebaGpu->gpuContext->sm_version >= SM_12)
+        threadsPerBlock = GT2XX_NONBOND_THREADS_PER_BLOCK;
+    else
+        threadsPerBlock = G8X_NONBOND_THREADS_PER_BLOCK;
+    kCalculateAmoebaVdw14_7Reduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, threadsPerBlock>>>(
                               vdwOutputArray->_pDevStream[0], forceOutputArray->_pDevStream[0] );
    LAUNCHERROR("kCalculateAmoebaVdw14_7Reduction");
 }
@@ -282,7 +289,7 @@ static void kCalculateAmoebaVdw14_7Reduction(amoebaGpuContext amoebaGpu, CUDAStr
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
+#elif (__CUDA_ARCH__ >= 120)
 __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
@@ -361,9 +368,16 @@ static void kCalculateAmoebaVdw14_7CoordinateReduction(amoebaGpuContext amoebaGp
                                                       CUDAStream<float4>* coordinateArray,
                                                       CUDAStream<float4>* reducedCoordinateArray)
 {
-    kCalculateAmoebaVdw14_7CoordinateReduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, 384>>>(
+    unsigned int threadsPerBlock;
+    if (amoebaGpu->gpuContext->sm_version >= SM_20)
+        threadsPerBlock = GF1XX_THREADS_PER_BLOCK;
+    else if (amoebaGpu->gpuContext->sm_version >= SM_12)
+        threadsPerBlock = GT2XX_THREADS_PER_BLOCK;
+    else
+        threadsPerBlock = G8X_THREADS_PER_BLOCK;
+    kCalculateAmoebaVdw14_7CoordinateReduction_kernel<<<amoebaGpu->gpuContext->sim.blocks, threadsPerBlock>>>(
                               coordinateArray->_pDevStream[0], reducedCoordinateArray->_pDevStream[0] );
-    LAUNCHERROR("kCalculateAmoebaVdw14_7Reduction");
+    LAUNCHERROR("kCalculateAmoebaVdw14_7CoordinateReduction");
 }
 // perform reduction of force on H's and add to heavy atom partner

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
@@ -408,8 +408,14 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
    // set threads/block first time through
    if( threadsPerBlock == 0 ){
-        threadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle));
+        unsigned int maxThreads;
-threadsPerBlock = 128;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 384;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 192;
+        else
+            maxThreads = 64;
+       threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle)), maxThreads);
    }
    if (gpu->bOutputBufferPerWarp){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
@@ -27,7 +27,7 @@
 __global__
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(384, 1)
-#elif (__CUDA_ARCH__ >= 130)
+#elif (__CUDA_ARCH__ >= 120)
 __launch_bounds__(192, 1)
 #else
 __launch_bounds__(64, 1)