Commit ad2d5c0a authored by Peter Eastman's avatar Peter Eastman
Browse files

Added volatile keywords to work correctly with CUDA 4.1

parent 0abe528f
......@@ -29,7 +29,7 @@ typedef unsigned int T;
// Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk
__device__ unsigned int sumReduce128(unsigned int* arr) {
__device__ unsigned int sumReduce128(volatile unsigned int* arr) {
// Parallel reduce element counts
// Assumes 128 thd/block
if (threadIdx.x < 64) arr[threadIdx.x] += arr[threadIdx.x+64];
......@@ -47,7 +47,7 @@ __device__ unsigned int sumReduce128(unsigned int* arr) {
}
__global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) {
__shared__ unsigned int dsCount[128];
__shared__ volatile unsigned int dsCount[128];
dsCount[threadIdx.x] = 0;
size_t ub;
ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
......
......@@ -43,7 +43,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
......@@ -52,7 +52,7 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
float CDLJ_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
volatile float3* tempBuffer = (volatile float3*) &sA[cSim.nonbond_threads_per_block];
#endif
#ifdef USE_EWALD
......@@ -83,7 +83,7 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
volatile Atom* psA = &sA[tbx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
......
......@@ -40,7 +40,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
{
extern __shared__ Atom sA[];
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
......@@ -49,7 +49,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
float CDLJObcGbsa_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif
unsigned int lasty = -0xFFFFFFFF;
......@@ -68,7 +68,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
volatile Atom* psA = &sA[tbx];
float4 af;
af.x = 0.0f;
af.y = 0.0f;
......
......@@ -47,7 +47,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
......@@ -58,7 +58,7 @@ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
// int end = workUnits / gridDim.x;
// int pos = end - (threadIdx.x >> GRIDBITS) - 1;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif
while ( pos < end )
......@@ -79,7 +79,7 @@ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
volatile Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
......
......@@ -47,14 +47,14 @@ __launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit )
{
extern __shared__ Atom sA[];
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
volatile float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
#endif
unsigned int lasty = -0xFFFFFFFF;
......@@ -72,7 +72,7 @@ METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit )
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
volatile Atom* psA = &sA[tbx];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
......
......@@ -40,7 +40,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
......@@ -48,7 +48,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif
while (pos < end)
......@@ -67,7 +67,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
volatile Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
......
......@@ -40,14 +40,14 @@ __launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
volatile float3* tempBuffer = (volatile float3*) &sA[cSim.bornForce2_threads_per_block];
#endif
unsigned int lasty = -0xFFFFFFFF;
......@@ -65,7 +65,7 @@ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
volatile Atom* psA = &sA[tbx];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
......
......@@ -115,7 +115,7 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
*/
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit)
{
extern __shared__ unsigned int flags[];
extern __shared__ volatile unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
......
......@@ -221,7 +221,7 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
* flags for which ones are interacting.
*/
__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global const ushort2* restrict tiles, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local unsigned int* restrict flags, unsigned int maxTiles) {
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int numTiles = interactionCount[0];
......
......@@ -2,7 +2,7 @@
* Calculate the center of mass momentum.
*/
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local float4* restrict temp) {
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
int index = get_global_id(0);
float4 cm = 0.0f;
while (index < numAtoms) {
......@@ -53,7 +53,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
* Remove center of mass motion.
*/
__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local float4* restrict temp) {
__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups.
int index = get_local_id(0);
......
......@@ -101,7 +101,7 @@ __device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* for
}
__device__ static void add3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
__device__ static void add3dArrayToFloat4( unsigned int offset, volatile float* forceSum, float4* outputForce )
{
float4 of;
......@@ -125,7 +125,7 @@ __device__ static void load3dArrayToFloat4( unsigned int offset, float* forceSum
}
__device__ static void load3dArray( unsigned int offset, float* forceSum, float* outputForce )
__device__ static void load3dArray( unsigned int offset, volatile float* forceSum, float* outputForce )
{
outputForce[offset] = forceSum[0];
......@@ -134,7 +134,7 @@ __device__ static void load3dArray( unsigned int offset, float* forceSum, float*
}
__device__ static void add3dArray( unsigned int offset, float* forceSum, float* outputForce )
__device__ static void add3dArray( unsigned int offset, volatile float* forceSum, float* outputForce )
{
outputForce[offset] += forceSum[0];
......
......@@ -468,7 +468,7 @@ __device__ void calculateElectrostaticPairIxnOrig_kernel( ElectrostaticParticle&
}
#endif
static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle* sA, unsigned int atomI ){
static __device__ void loadElectrostaticParticle( volatile struct ElectrostaticParticle* sA, unsigned int atomI ){
// coordinates & charge
......@@ -512,7 +512,7 @@ static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle*
}
static __device__ void zeroElectrostaticParticle( struct ElectrostaticParticle* sA ){
static __device__ void zeroElectrostaticParticle( volatile struct ElectrostaticParticle* sA ){
sA->force[0] = 0.0f;
sA->force[1] = 0.0f;
sA->force[2] = 0.0f;
......
......@@ -42,7 +42,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
#endif
){
extern __shared__ ElectrostaticParticle sA[];
extern __shared__ volatile ElectrostaticParticle sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
......@@ -69,7 +69,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
ElectrostaticParticle* psA = &sA[tbx];
volatile ElectrostaticParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx;
ElectrostaticParticle localParticle;
loadElectrostaticParticle( &localParticle, atomI );
......
__device__ void SUB_METHOD_NAME( calculateElectrostaticPairIxn, _kernel )( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ,
__device__ void SUB_METHOD_NAME( calculateElectrostaticPairIxn, _kernel )( ElectrostaticParticle& atomI, volatile ElectrostaticParticle& atomJ,
float* scalingFactors,
#ifdef F1
float* energy,
......
......@@ -117,7 +117,7 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractionsVdw, _kernel)()
*/
__global__ void METHOD_NAME(kFindInteractionsWithinBlocksVdw, _kernel)(unsigned int* workUnit)
{
extern __shared__ unsigned int flags[];
extern __shared__ volatile unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment