Commit ad2d5c0a authored by Peter Eastman's avatar Peter Eastman
Browse files

Added volatile keywords to work correctly with CUDA 4.1

parent 0abe528f
...@@ -29,7 +29,7 @@ typedef unsigned int T; ...@@ -29,7 +29,7 @@ typedef unsigned int T;
// Phase 1: Count valid elements per thread block // Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk // Hard-code 128 thd/blk
__device__ unsigned int sumReduce128(unsigned int* arr) { __device__ unsigned int sumReduce128(volatile unsigned int* arr) {
// Parallel reduce element counts // Parallel reduce element counts
// Assumes 128 thd/block // Assumes 128 thd/block
if (threadIdx.x < 64) arr[threadIdx.x] += arr[threadIdx.x+64]; if (threadIdx.x < 64) arr[threadIdx.x] += arr[threadIdx.x+64];
...@@ -47,7 +47,7 @@ __device__ unsigned int sumReduce128(unsigned int* arr) { ...@@ -47,7 +47,7 @@ __device__ unsigned int sumReduce128(unsigned int* arr) {
} }
__global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) { __global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) {
__shared__ unsigned int dsCount[128]; __shared__ volatile unsigned int dsCount[128];
dsCount[threadIdx.x] = 0; dsCount[threadIdx.x] = 0;
size_t ub; size_t ub;
ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock); ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
......
...@@ -43,7 +43,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -43,7 +43,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit) void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ volatile Atom sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID; unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
...@@ -52,7 +52,7 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit) ...@@ -52,7 +52,7 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
float CDLJ_energy; float CDLJ_energy;
float energy = 0.0f; float energy = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block]; volatile float3* tempBuffer = (volatile float3*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
#ifdef USE_EWALD #ifdef USE_EWALD
...@@ -83,7 +83,7 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit) ...@@ -83,7 +83,7 @@ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; volatile Atom* psA = &sA[tbx];
unsigned int i = x + tgx; unsigned int i = x + tgx;
apos = cSim.pPosq[i]; apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i]; float2 a = cSim.pAttr[i];
......
...@@ -40,7 +40,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -40,7 +40,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit ) void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
{ {
extern __shared__ Atom sA[]; extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
...@@ -49,7 +49,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit ) ...@@ -49,7 +49,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
float CDLJObcGbsa_energy; float CDLJObcGbsa_energy;
float energy = 0.0f; float energy = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block]; volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
unsigned int lasty = -0xFFFFFFFF; unsigned int lasty = -0xFFFFFFFF;
...@@ -68,7 +68,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit ) ...@@ -68,7 +68,7 @@ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
float br = cSim.pBornRadii[i]; float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; volatile Atom* psA = &sA[tbx];
float4 af; float4 af;
af.x = 0.0f; af.x = 0.0f;
af.y = 0.0f; af.y = 0.0f;
......
...@@ -47,7 +47,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -47,7 +47,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit) void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
...@@ -58,7 +58,7 @@ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit) ...@@ -58,7 +58,7 @@ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
// int end = workUnits / gridDim.x; // int end = workUnits / gridDim.x;
// int pos = end - (threadIdx.x >> GRIDBITS) - 1; // int pos = end - (threadIdx.x >> GRIDBITS) - 1;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block]; volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
while ( pos < end ) while ( pos < end )
...@@ -79,7 +79,7 @@ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit) ...@@ -79,7 +79,7 @@ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; volatile Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y) // Handle diagonals uniquely at 50% efficiency
{ {
......
...@@ -47,14 +47,14 @@ __launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1) ...@@ -47,14 +47,14 @@ __launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif #endif
METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit ) METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit )
{ {
extern __shared__ Atom sA[]; extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block]; volatile float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
#endif #endif
unsigned int lasty = -0xFFFFFFFF; unsigned int lasty = -0xFFFFFFFF;
...@@ -72,7 +72,7 @@ METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit ) ...@@ -72,7 +72,7 @@ METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit )
float fb = cSim.pBornForce[i]; float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; volatile Atom* psA = &sA[tbx];
float3 af; float3 af;
sA[threadIdx.x].fx = af.x = 0.0f; sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f; sA[threadIdx.x].fy = af.y = 0.0f;
......
...@@ -40,7 +40,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -40,7 +40,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
...@@ -48,7 +48,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) ...@@ -48,7 +48,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block]; volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
while (pos < end) while (pos < end)
...@@ -67,7 +67,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) ...@@ -67,7 +67,7 @@ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; volatile Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y) // Handle diagonals uniquely at 50% efficiency
{ {
......
...@@ -40,14 +40,14 @@ __launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1) ...@@ -40,14 +40,14 @@ __launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit) void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ volatile Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block]; volatile float3* tempBuffer = (volatile float3*) &sA[cSim.bornForce2_threads_per_block];
#endif #endif
unsigned int lasty = -0xFFFFFFFF; unsigned int lasty = -0xFFFFFFFF;
...@@ -65,7 +65,7 @@ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit) ...@@ -65,7 +65,7 @@ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
float fb = cSim.pBornForce[i]; float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
Atom* psA = &sA[tbx]; volatile Atom* psA = &sA[tbx];
float3 af; float3 af;
sA[threadIdx.x].fx = af.x = 0.0f; sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f; sA[threadIdx.x].fy = af.y = 0.0f;
......
...@@ -115,7 +115,7 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)() ...@@ -115,7 +115,7 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
*/ */
__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit) __global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit)
{ {
extern __shared__ unsigned int flags[]; extern __shared__ volatile unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
......
...@@ -221,7 +221,7 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox ...@@ -221,7 +221,7 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
* flags for which ones are interacting. * flags for which ones are interacting.
*/ */
__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global const ushort2* restrict tiles, __global const float4* restrict blockCenter, __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global const ushort2* restrict tiles, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local unsigned int* restrict flags, unsigned int maxTiles) { __global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
unsigned int totalWarps = get_global_size(0)/TILE_SIZE; unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Calculate the center of mass momentum. * Calculate the center of mass momentum.
*/ */
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local float4* restrict temp) { __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
int index = get_global_id(0); int index = get_global_id(0);
float4 cm = 0.0f; float4 cm = 0.0f;
while (index < numAtoms) { while (index < numAtoms) {
...@@ -53,7 +53,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest ...@@ -53,7 +53,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
* Remove center of mass motion. * Remove center of mass motion.
*/ */
__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local float4* restrict temp) { __kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups. // First sum all of the momenta that were calculated by individual groups.
int index = get_local_id(0); int index = get_local_id(0);
......
...@@ -101,7 +101,7 @@ __device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* for ...@@ -101,7 +101,7 @@ __device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* for
} }
__device__ static void add3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce ) __device__ static void add3dArrayToFloat4( unsigned int offset, volatile float* forceSum, float4* outputForce )
{ {
float4 of; float4 of;
...@@ -125,7 +125,7 @@ __device__ static void load3dArrayToFloat4( unsigned int offset, float* forceSum ...@@ -125,7 +125,7 @@ __device__ static void load3dArrayToFloat4( unsigned int offset, float* forceSum
} }
__device__ static void load3dArray( unsigned int offset, float* forceSum, float* outputForce ) __device__ static void load3dArray( unsigned int offset, volatile float* forceSum, float* outputForce )
{ {
outputForce[offset] = forceSum[0]; outputForce[offset] = forceSum[0];
...@@ -134,7 +134,7 @@ __device__ static void load3dArray( unsigned int offset, float* forceSum, float* ...@@ -134,7 +134,7 @@ __device__ static void load3dArray( unsigned int offset, float* forceSum, float*
} }
__device__ static void add3dArray( unsigned int offset, float* forceSum, float* outputForce ) __device__ static void add3dArray( unsigned int offset, volatile float* forceSum, float* outputForce )
{ {
outputForce[offset] += forceSum[0]; outputForce[offset] += forceSum[0];
......
...@@ -468,7 +468,7 @@ __device__ void calculateElectrostaticPairIxnOrig_kernel( ElectrostaticParticle& ...@@ -468,7 +468,7 @@ __device__ void calculateElectrostaticPairIxnOrig_kernel( ElectrostaticParticle&
} }
#endif #endif
static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle* sA, unsigned int atomI ){ static __device__ void loadElectrostaticParticle( volatile struct ElectrostaticParticle* sA, unsigned int atomI ){
// coordinates & charge // coordinates & charge
...@@ -512,7 +512,7 @@ static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle* ...@@ -512,7 +512,7 @@ static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle*
} }
static __device__ void zeroElectrostaticParticle( struct ElectrostaticParticle* sA ){ static __device__ void zeroElectrostaticParticle( volatile struct ElectrostaticParticle* sA ){
sA->force[0] = 0.0f; sA->force[0] = 0.0f;
sA->force[1] = 0.0f; sA->force[1] = 0.0f;
sA->force[2] = 0.0f; sA->force[2] = 0.0f;
......
...@@ -42,7 +42,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)( ...@@ -42,7 +42,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
#endif #endif
){ ){
extern __shared__ ElectrostaticParticle sA[]; extern __shared__ volatile ElectrostaticParticle sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID; unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
...@@ -69,7 +69,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)( ...@@ -69,7 +69,7 @@ void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
unsigned int tbx = threadIdx.x - tgx; unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx; unsigned int tj = tgx;
ElectrostaticParticle* psA = &sA[tbx]; volatile ElectrostaticParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx; unsigned int atomI = x + tgx;
ElectrostaticParticle localParticle; ElectrostaticParticle localParticle;
loadElectrostaticParticle( &localParticle, atomI ); loadElectrostaticParticle( &localParticle, atomI );
......
__device__ void SUB_METHOD_NAME( calculateElectrostaticPairIxn, _kernel )( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ, __device__ void SUB_METHOD_NAME( calculateElectrostaticPairIxn, _kernel )( ElectrostaticParticle& atomI, volatile ElectrostaticParticle& atomJ,
float* scalingFactors, float* scalingFactors,
#ifdef F1 #ifdef F1
float* energy, float* energy,
......
...@@ -117,7 +117,7 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractionsVdw, _kernel)() ...@@ -117,7 +117,7 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractionsVdw, _kernel)()
*/ */
__global__ void METHOD_NAME(kFindInteractionsWithinBlocksVdw, _kernel)(unsigned int* workUnit) __global__ void METHOD_NAME(kFindInteractionsWithinBlocksVdw, _kernel)(unsigned int* workUnit)
{ {
extern __shared__ unsigned int flags[]; extern __shared__ volatile unsigned int flags[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment