Commit 8a331fb9 authored by Mark Friedrichs
Browse files

Direct space optimizations

parent af4d503a
......@@ -3451,15 +3451,16 @@ tgx = 0;
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerBlock shared memory/block
@return threadsPerBlock
--------------------------------------------------------------------------------------- */
unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread )
unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
{
unsigned int grid = amoebaGpu->gpuContext->grid;
unsigned int threadsPerBlock = (amoebaGpu->gpuContext->sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
unsigned int threadsPerBlock = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
threadsPerBlock = threadsPerBlock < 1 ? 1 : threadsPerBlock;
threadsPerBlock *= grid;
......
......@@ -160,7 +160,7 @@ extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAS
extern void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear );
extern void kClearFields_1( amoebaGpuContext amoebaGpu );
extern void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear );
extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread );
extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );
//extern int isNanOrInfinity( double number );
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
......
......@@ -759,7 +759,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle), gpu->sharedMemoryPerBlock), maxThreads);
}
kClearFields_3( amoebaGpu, 1 );
......
......@@ -362,7 +362,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
kClearFields_3( amoebaGpu, 3 );
......
......@@ -108,7 +108,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -1813,7 +1813,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle), gpu->sharedMemoryPerBlock ), maxThreads);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
......
......@@ -978,7 +978,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
maxThreads = 96;
else
maxThreads = 32;
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -490,7 +490,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -276,7 +276,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -4,7 +4,6 @@ struct MutualInducedParticle {
float x;
float y;
float z;
float q;
float inducedDipole[3];
float inducedDipolePolar[3];
......@@ -41,7 +40,6 @@ __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsig
sA->x = posq.x;
sA->y = posq.y;
sA->z = posq.z;
sA->q = posq.w;
// dipole
......
......@@ -239,6 +239,9 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// No interactions in this block.
} else {
#ifdef CALCULATE_FULL_TILE
flags = 0xFFFFFFFF;
#endif
sA[threadIdx.x].force[0] = 0.0f;
sA[threadIdx.x].force[1] = 0.0f;
sA[threadIdx.x].force[2] = 0.0f;
......@@ -312,6 +315,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA[jIdx].torque[1] += forceTorqueEnergy[2].y;
psA[jIdx].torque[2] += forceTorqueEnergy[2].z;
#ifndef CALCULATE_FULL_TILE
} else {
sA[threadIdx.x].tempForce[0] = forceTorqueEnergy[0].x;
......@@ -345,6 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
}
#endif
}
} // end of atoms out-of-bounds
} // end of flags&(1<<j block
......
......@@ -437,7 +437,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
maxThreads = 192;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
if (gpu->bOutputBufferPerWarp){
......@@ -469,7 +469,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3), gpu->sharedMemoryPerBlock),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
......
......@@ -37,10 +37,9 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
//#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
#ifdef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
......@@ -51,6 +50,93 @@ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedPartic
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
#endif
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
/**---------------------------------------------------------------------------------------

   Set up the pair-dependent quantities shared by the mutual-induced-field
   accumulations for one (atomI, atomJ) pair.

   The separation vector is written to delta->x/y/z after being shifted by whole
   periodic-box lengths. If the squared separation is within
   cSim.nonbondedCutoffSqr, delta->w receives (rr3 - bn1) and *preFactor2 receives
   (bn2 - rr5); otherwise both are set to zero. delta->x/y/z are always written.

   No guard is applied for a zero separation: r and r2 are used as divisors.

   @param atomI       first particle  (x, y, z, damp and thole are read)
   @param atomJ       second particle (x, y, z, damp and thole are read)
   @param uscale      factor multiplying the scale3/scale5 damping terms
   @param delta       output: separation vector in x/y/z, first prefactor in w
   @param preFactor2  output: second prefactor

   --------------------------------------------------------------------------------------- */
__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
                                                       const float uscale, float4* delta, float* preFactor2 ) {

    // compute the real space portion of the Ewald summation

    delta->x = atomJ.x - atomI.x;
    delta->y = atomJ.y - atomI.y;
    delta->z = atomJ.z - atomI.z;

    // periodic boundary conditions
    // NOTE(review): floor() is kept as the overloaded call because the types of the
    // cSim box fields are declared outside this block -- confirm before switching to floorf().

    delta->x -= floor(delta->x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
    delta->y -= floor(delta->y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    delta->z -= floor(delta->z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

    float r2 = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z);
    if( r2 <= cSim.nonbondedCutoffSqr ){

        float r = sqrtf(r2);

        // calculate the error function damping terms
        // (explicit single-precision erfcf/expf, matching the sqrtf above)

        float ralpha = cSim.alphaEwald*r;
        float bn0    = erfcf(ralpha)/r;
        float alsq2  = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
        float exp2a  = expf(-(ralpha*ralpha));

        alsq2n      *= alsq2;
        float bn1    = (bn0+alsq2n*exp2a)/r2;

        alsq2n      *= alsq2;
        float bn2    = (3.0f*bn1+alsq2n*exp2a)/r2;

        // compute the error function scaled and unscaled terms

        float scale3 = 1.0f;
        float scale5 = 1.0f;
        float damp   = atomI.damp*atomJ.damp;

        // a zero damp product leaves scale3 = scale5 = 1

        if( damp != 0.0f ){

            float ratio  = (r/damp);
            ratio        = ratio*ratio*ratio;

            // use the smaller of the two thole values

            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
            damp         = -pgamma*ratio;

            // for exponents at or below -50 the scale factors stay at 1

            if( damp > -50.0f) {
                float expdamp = expf(damp);
                scale3        = 1.0f - expdamp;
                scale5        = 1.0f - expdamp*(1.0f-damp);
            }
        }

        float dsc3 = uscale*scale3;
        float dsc5 = uscale*scale5;

        float r3   = (r*r2);
        float r5   = (r3*r2);
        float rr3  = (1.0f-dsc3)/r3;
        float rr5  = 3.0f*(1.0f-dsc5)/r5;

        delta->w    = rr3 - bn1;
        *preFactor2 = bn2 - rr5;

    } else {

        // outside the cutoff: both prefactors vanish, delta->x/y/z remain valid

        delta->w = *preFactor2 = 0.0f;
    }
}
/**---------------------------------------------------------------------------------------

   Accumulate into fieldSum the contribution of one induced dipole:

       fieldSum[k] += preFactor2*(inducedDipole . delta.xyz)*delta[k] + delta.w*inducedDipole[k]

   delta.x/y/z are always read, so they must hold finite values even when delta.w
   and preFactor2 are both zero (a NaN or Inf component would propagate through
   the multiplication by zero).

   @param inducedDipole  dipole components (3 floats, read only)
   @param delta          x/y/z: separation vector, w: prefactor applied to the dipole itself
   @param preFactor2     prefactor applied to the dipole projected onto delta
   @param fieldSum       3-component accumulator, updated in place

   --------------------------------------------------------------------------------------- */
__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {

    const float dipoleDotDelta = inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z;
    const float projectedScale = preFactor2*dipoleDotDelta;
    const float separation[3]  = { delta.x, delta.y, delta.z };

    // fixed trip count of 3; unrolled so the small local array is resolved at compile time
    #pragma unroll
    for( int k = 0; k < 3; k++ ){
        fieldSum[k] += projectedScale*separation[k] + delta.w*inducedDipole[k];
    }
}
/**---------------------------------------------------------------------------------------

   Overwrite fieldSum with the contribution of one induced dipole (same expression
   as the accumulating variant, but assigned rather than added):

       fieldSum[k] = preFactor2*(inducedDipole . delta.xyz)*delta[k] + delta.w*inducedDipole[k]

   The previous contents of fieldSum are not read.

   @param inducedDipole  dipole components (3 floats, read only)
   @param delta          x/y/z: separation vector, w: prefactor applied to the dipole itself
   @param preFactor2     prefactor applied to the dipole projected onto delta
   @param fieldSum       3-component output, overwritten

   --------------------------------------------------------------------------------------- */
__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {

    const float dipoleDotDelta = inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z;
    const float projectedScale = preFactor2*dipoleDotDelta;
    const float separation[3]  = { delta.x, delta.y, delta.z };

    // fixed trip count of 3; unrolled so the small local array is resolved at compile time
    #pragma unroll
    for( int k = 0; k < 3; k++ ){
        fieldSum[k] = projectedScale*separation[k] + delta.w*inducedDipole[k];
    }
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
......@@ -385,7 +471,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
......@@ -100,99 +100,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
for (unsigned int j = 0; j < GRID; j++)
{
float4 ijField[3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
#ifdef AMOEBA_DEBUG
/*
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 6.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
}
*/
#endif
float4 delta;
float prefactor2;
if( ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ){
delta.w = prefactor2 = 0.0f;
} else {
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 );
}
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
}
......@@ -226,6 +144,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// No interactions in this block.
} else {
#ifndef INCLUDE_MI_FIELD_BUFFERS
flags = 0xFFFFFFFF;
#endif
// zero shared fields
zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) );
......@@ -235,53 +157,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
if ((flags&(1<<j)) != 0)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float4 ijField[3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0].x : 0.0f;
fieldSum[1] += mask ? ijField[1].x : 0.0f;
fieldSum[2] += mask ? ijField[2].x : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[0].z : 0.0f;
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
// add to field at atomJ the field due atomI's dipole
float4 delta;
float prefactor2;
if( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ){
delta.w = prefactor2 = 0.0f;
} else {
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, &delta, &prefactor2 );
}
calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
#ifndef INCLUDE_MI_FIELD_BUFFERS
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field );
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
#else
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[0].y : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1].y : 0.0f;
psA[jIdx].field[2] += mask ? ijField[2].y : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[0].w : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[1].w : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[2].w : 0.0f;
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field );
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
} else {
sA[threadIdx.x].tempBuffer[0] = mask ? ijField[0].y : 0.0;
sA[threadIdx.x].tempBuffer[1] = mask ? ijField[1].y : 0.0;
sA[threadIdx.x].tempBuffer[2] = mask ? ijField[2].y : 0.0;
sA[threadIdx.x].tempBufferP[0] = mask ? ijField[0].w : 0.0;
sA[threadIdx.x].tempBufferP[1] = mask ? ijField[1].w : 0.0;
sA[threadIdx.x].tempBufferP[2] = mask ? ijField[2].w : 0.0;
calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipole, delta, prefactor2, sA[threadIdx.x].tempBuffer );
calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipolePolar, delta, prefactor2, sA[threadIdx.x].tempBufferP );
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
......@@ -308,61 +202,8 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
}
}
/*
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 7.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cSim.paddedNumberOfAtoms;
float flag = 7.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
}
#endif
*/
}
tj = (tj + 1) & (GRID - 1);
......
......@@ -531,7 +531,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
maxThreads = 192;
else
maxThreads = 128;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads);
}
if( 0 ){
......
......@@ -382,7 +382,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
maxThreads = 192;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment