No commit message
2f1c9531 · Scott Le Grand · c85128ac · 2f1c9531 · 2f1c9531 · 2f1c9531
Commit 2f1c9531 authored Mar 22, 2010 by Scott Le Grand
4 changed files
--- a/platforms/cuda/src/kernels/kCCMA.cu
+++ b/platforms/cuda/src/kernels/kCCMA.cu
@@ -66,7 +66,15 @@ __device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
    __syncthreads();
 }
-__global__ void kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition)
+__global__ void 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition)
 {
    // Initialize counters used for monitoring convergence and doing global thread synchronization.

--- a/platforms/cuda/src/kernels/kCalculateCustomBondForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCustomBondForces.cu
@@ -78,7 +78,15 @@ void SetCustomBondGlobalParams(const vector<float>& paramValues)
 }
-__global__ void kCalculateCustomBondForces_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kCalculateCustomBondForces_kernel()
 {
    extern __shared__ float stack[];
    float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];

--- a/platforms/cuda/src/kernels/kCalculateCustomExternalForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCustomExternalForces.cu
@@ -82,7 +82,15 @@ void SetCustomExternalGlobalParams(const vector<float>& paramValues)
 }
-__global__ void kCalculateCustomExternalForces_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kCalculateCustomExternalForces_kernel()
 {
    extern __shared__ float stack[];
    float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];

--- a/platforms/cuda/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda/src/kernels/kCalculatePME.cu
@@ -92,7 +92,15 @@ inline __host__ __device__ float4 make_float4(int3 a)
    return make_float4((float) a.x, (float) a.y, (float) a.z, 0);
 }
-__global__ void kUpdateGridIndexAndFraction_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kUpdateGridIndexAndFraction_kernel()
 {
    unsigned int tnb = blockDim.x * gridDim.x;
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -114,7 +122,15 @@ __global__ void kUpdateGridIndexAndFraction_kernel()
 * For each grid point, find the range of sorted atoms associated with that point.
 */
-__global__ void kFindAtomRangeForGrid_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kFindAtomRangeForGrid_kernel()
 {
    int thread = blockIdx.x*blockDim.x+threadIdx.x;
    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
@@ -147,7 +163,15 @@ __global__ void kFindAtomRangeForGrid_kernel()
    }
 }
-__global__ void kUpdateBsplines_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kUpdateBsplines_kernel()
 {
    unsigned int    tnb = blockDim.x * gridDim.x;
    unsigned int    tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -217,7 +241,15 @@ __global__ void kUpdateBsplines_kernel()
    }
 }
-__global__ void kGridSpreadCharge_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kGridSpreadCharge_kernel()
 {
    unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
    unsigned int numThreads = gridDim.x*blockDim.x;
@@ -254,7 +286,15 @@ __global__ void kGridSpreadCharge_kernel()
    }
 }
-__global__ void kReciprocalConvolution_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kReciprocalConvolution_kernel()
 {
    const unsigned int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
    float expFactor = LOCAL_HACK_PI*LOCAL_HACK_PI/(cSim.alphaEwald*cSim.alphaEwald);
@@ -287,7 +327,15 @@ __global__ void kReciprocalConvolution_kernel()
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
 }
-__global__ void kGridInterpolateForce_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kGridInterpolateForce_kernel()
 {
    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < cSim.atoms; atom += blockDim.x*gridDim.x)
    {