"wrappers/vscode:/vscode.git/clone" did not exist on "00e3b767689bb96429a41dde35260c66dc2a02c1"
Commit a9054686 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Mods for direct PME

parent 01260070
......@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
log = NULL;
contextImpl = NULL;
gpuInitialized = false;
applyCutoff = 0;
multipoleForceCount = 0;
}
AmoebaCudaData::~AmoebaCudaData() {
......@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) {
return;
}
void AmoebaCudaData::incrementMultipoleForceCount( void ) {
multipoleForceCount++;
}
int AmoebaCudaData::getMultipoleForceCount( void ) const {
return multipoleForceCount;
}
void AmoebaCudaData::setApplyCutoff( int inputApplyCutoff ) {
applyCutoff = inputApplyCutoff;
}
int AmoebaCudaData::getApplyCutoff( void ) const {
return applyCutoff;
}
}
......@@ -139,11 +139,41 @@ public:
*/
void setContextImpl( void* contextImpl );
/**
* Get multipole force count
*
* @return multipole force count
*/
int getMultipoleForceCount( void ) const;
/**
* Get multipole force count
*
* @return multipole force count
*/
void incrementMultipoleForceCount( void );
/**
* Get multipole force count
*
* @return multipole force count
*/
int getApplyCutoff( ) const;
/**
* Get multipole force count
*
* @return multipole force count
*/
void setApplyCutoff( int applyCutoff );
private:
CudaPlatform::PlatformData& cudaPlatformData;
amoebaGpuContext amoebaGpu;
bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole;
int multipoleForceCount;
int applyCutoff;
KernelImpl* localForceKernel;
unsigned int kernelCount;
void* contextImpl;
......
......@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
amoebaGpuContext gpu = data.getAmoebaGpu();
if( data.getMultipoleForceCount() == 0 ){
gpuCopyInteractingWorkUnit( gpu );
}
if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
gpuReorderAtoms(gpu->gpuContext);
}
data.incrementMultipoleForceCount();
data.initializeGpu();
if( 0 && data.getLog() ){
......@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
zsize = pmeGridDimension[2];
}
gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
data.setApplyCutoff( 1 );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
gpuContext gpu = amoebaGpu->gpuContext;
gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
gpu->sim.nonbondedMethod = PARTICLE_MESH_EWALD;
}
data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
}
......
......@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " sqrtPi %15.7e\n", amoebaGpu->amoebaSim.sqrtPi );
(void) fprintf( log, " alpha Ewald %15.7e\n", gpu->sim.alphaEwald );
(void) fprintf( log, " PME grid dimensions %6d %6d %6d\n", gpu->sim.pmeGridSize.x, gpu->sim.pmeGridSize.y, gpu->sim.pmeGridSize.z);
(void) fprintf( log, " cutoffDistance2 %15.7e\n", amoebaGpu->amoebaSim.cutoffDistance2 );
(void) fprintf( log, " nonbondedCutoffSqr %15.7e\n", gpu->sim.nonbondedCutoffSqr);
(void) fprintf( log, " electric %15.7e\n", amoebaGpu->amoebaSim.electric );
(void) fprintf( log, " box %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
(void) fprintf( log, " gkc %15.7e\n", amoebaGpu->amoebaSim.gkc );
......@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
(void) fflush( amoebaGpu->log );
}
amoebaGpu->amoebaSim.cutoffDistance2 = cutoffDistance*cutoffDistance;
amoebaGpu->amoebaSim.sqrtPi = std::sqrt( 3.14159265358f );
amoebaGpu->amoebaSim.electric = electricConstant;
amoebaGpu->gpuContext->sim.alphaEwald = alphaEwald;
......@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
}
}
/**---------------------------------------------------------------------------------------
Track iterations for MI dipoles
@param amoebaGpu amoebaGpuContext reference
@param iteration MI iteration
--------------------------------------------------------------------------------------- */
void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
gpu->psInteractingWorkUnit->Download();
gpu->psWorkUnit->Download();
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
gpu->psWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
}
gpu->psInteractingWorkUnit->Upload();
gpu->psWorkUnit->Upload();
// ---------------------------------------------------------------------------------------
}
#undef AMOEBA_DEBUG
......@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation {
unsigned int numberOfAtoms; // number of atoms
unsigned int paddedNumberOfAtoms; // padded number of atoms
float cutoffDistance2; // cutoff distance squared for PME
//float cutoffDistance2; // cutoff distance squared for PME
float sqrtPi; // sqrt(PI)
float scalingDistanceCutoff; // scaling cutoff
float2* pDampingFactorAndThole; // Thole & damping factors
......
......@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu);
extern "C"
void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
extern "C"
void gpuCopyInteractingWorkUnit(amoebaGpuContext gpu);
/*
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
......
......@@ -44,6 +44,11 @@ struct FixedFieldParticle {
float gkField[3];
#endif
#ifdef INCLUDE_FIXED_FIELD_BUFFERS
float tempBuffer[3];
float tempBufferP[3];
#endif
};
__device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI
......
......@@ -24,6 +24,11 @@ struct MutualInducedParticle {
float fieldS[3];
float fieldPolarS[3];
#endif
#ifdef INCLUDE_MI_FIELD_BUFFERS
float tempBuffer[3];
float tempBufferP[3];
#endif
};
__device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI )
......
......@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phi = &cAmoebaSim.pPhi[20*i];
cAmoebaSim.pTorque[3*i] = -cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
cAmoebaSim.pTorque[3*i] = cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phi[9]
+ multipole[8]*yscale*yscale*phi[7] + multipole[9]*xscale*yscale*phi[5]
- multipole[7]*yscale*zscale*phi[8] - multipole[9]*xscale*zscale*phi[6]);
cAmoebaSim.pTorque[3*i+1] = -cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
cAmoebaSim.pTorque[3*i+1] = cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phi[8]
+ multipole[7]*zscale*zscale*phi[9] + multipole[8]*xscale*zscale*phi[6]
- multipole[8]*xscale*xscale*phi[4] - multipole[9]*yscale*yscale*phi[7]);
cAmoebaSim.pTorque[3*i+2] = -cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
cAmoebaSim.pTorque[3*i+2] = cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phi[7]
+ multipole[7]*xscale*xscale*phi[4] + multipole[9]*yscale*zscale*phi[8]
- multipole[7]*xscale*yscale*phi[5] - multipole[8]*zscale*zscale*phi[9]);
......@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
f.y *= cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
float4 force = cSim.pForce4[i];
force.x += f.x;
force.y += f.y;
force.z += f.z;
force.x -= f.x;
force.y -= f.y;
force.z -= f.z;
cSim.pForce4[i] = force;
......@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phidp = &cAmoebaSim.pPhidp[20*i];
cAmoebaSim.pTorque[3*i] = -0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
+ multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
- multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
cAmoebaSim.pTorque[3*i+1] = -0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
+ multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
- multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
cAmoebaSim.pTorque[3*i+2] = -0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
+ multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
- multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
......@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
float4 force = cSim.pForce4[i];
force.x += f.x;
force.y += f.y;
force.z += f.z;
force.x -= f.x;
force.y -= f.y;
force.z -= f.z;
cSim.pForce4[i] = force;
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*cAmoebaSim.electric*energy;
......
......@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle {
float torque[3];
float padding;
float tempForce[3];
float tempTorque[3];
};
__device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ ){
atomI.tempForce[0] += atomJ.tempForce[0];
atomI.tempForce[1] += atomJ.tempForce[1];
atomI.tempForce[2] += atomJ.tempForce[2];
atomI.tempTorque[0] += atomJ.tempTorque[0];
atomI.tempTorque[1] += atomJ.tempTorque[1];
atomI.tempTorque[2] += atomJ.tempTorque[2];
}
/*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
......@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
float uiy = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
float uiz = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
atomI.torque[0] -= term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
atomI.torque[1] -= term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
atomI.torque[2] -= term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
}
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
......@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float gfr[8],gfri[7];
float gti[7],gtri[7];
float conversionFactor = (cAmoebaSim.electric/cAmoebaSim.dielec);
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values;
......@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cAmoebaSim.cutoffDistance2 ){
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrt(r2);
float ck = atomJ.q;
......@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
ei = ei - erli;
*energy = conversionFactor*(e + ei);
*energy = -conversionFactor*(e + ei);
// increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell;
......@@ -1161,15 +1173,27 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)), maxThreads);
}
kClearFields_3( amoebaGpu, 2 );
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
(sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle)+sizeof(float3), (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits,
gpu->sim.nonbond_threads_per_block );
(void) fflush( amoebaGpu->log );
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
......@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else {
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
// gpu->sim.pInteractingWorkUnit,
// amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
......@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400;
int maxPrint = 5;
float conversion = 1.0f/41.84f;
float forceSum[3] = { 0.0f, 0.0f, 0.0f};
for( int ii = 0; ii < gpu->natoms; ii++ ){
......@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
(void) fprintf( amoebaGpu->log,"\n" );
if( 1 ){
if( 0 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
......
......@@ -65,6 +65,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
unsigned int x;
unsigned int y;
bool bExclusionFlag;
int dScaleMask;
int2 pScaleMask;
int2 mScaleMask;
// Extract cell coordinates
......@@ -99,49 +102,16 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI );
if (!bExclusionFlag)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f;
}
}
else // bExclusion
if (bExclusionFlag)
{
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++)
{
......@@ -153,9 +123,12 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
// set scale factors
if (bExclusionFlag)
{
getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
}
// force
......@@ -229,8 +202,7 @@ if( atomI == targetAtom ){
}
#endif
}
}
} // end of j-loop
// include self energy and self torque
......@@ -281,11 +253,15 @@ if( atomI == targetAtom ){
outputTorque[offset+2] = localParticle.torque[2];
#endif
}
else
{
if (lasty != y)
{
} else {
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// No interactions in this block.
} else {
if (lasty != y) {
// load shared data
loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
......@@ -300,133 +276,43 @@ if( atomI == targetAtom ){
sA[threadIdx.x].torque[1] = 0.0f;
sA[threadIdx.x].torque[2] = 0.0f;
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
unsigned int atomJ = y + tj;
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[tj],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
// add force and torque to atom J due atom I
psA[tj].force[0] -= mask ? force[0] : 0.0f;
psA[tj].force[1] -= mask ? force[1] : 0.0f;
psA[tj].force[2] -= mask ? force[2] : 0.0f;
psA[tj].torque[0] += mask ? torque[1][0] : 0.0f;
psA[tj].torque[1] += mask ? torque[1][1] : 0.0f;
psA[tj].torque[2] += mask ? torque[1][2] : 0.0f;
#ifdef AMOEBA_DEBUG
/*
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
float forceSign = (atomI == targetAtom) ? 1.0f : -1.0f;
float blockId = 2.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) y;
debugArray[index].w = blockId;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceSign*force[0] : 0.0f;
debugArray[index].y = mask ? forceSign*force[1] : 0.0f;
debugArray[index].z = mask ? forceSign*force[2] : 0.0f;
debugArray[index].w = blockId;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexI][0] : 0.0f;
debugArray[index].y = mask ? torque[indexI][1] : 0.0f;
debugArray[index].z = mask ? torque[indexI][2] : 0.0f;
debugArray[index].w = mask ? energy : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexJ][0] : 0.0f;
debugArray[index].y = mask ? torque[indexJ][1] : 0.0f;
debugArray[index].z = mask ? torque[indexJ][2] : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
else // bExclusion
if( bExclusionFlag )
{
// Read fixed atom data into registers and GRF
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
float force[3];
float torque[2][3];
unsigned int atomJ = y + tj;
// set scale factors
getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
if( bExclusionFlag )
{
getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
}
// force
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[tj],
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
, pullBack
#endif
);
......@@ -448,101 +334,58 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// add force and torque to atom J due atom I
psA[tj].force[0] -= mask ? force[0] : 0.0f;
psA[tj].force[1] -= mask ? force[1] : 0.0f;
psA[tj].force[2] -= mask ? force[2] : 0.0f;
if( flags == 0xFFFFFFFF ){
psA[tj].torque[0] += mask ? torque[1][0] : 0.0f;
psA[tj].torque[1] += mask ? torque[1][1] : 0.0f;
psA[tj].torque[2] += mask ? torque[1][2] : 0.0f;
psA[jIdx].force[0] -= mask ? force[0] : 0.0f;
psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
#ifdef AMOEBA_DEBUG
/*
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
float forceSign = (atomI == targetAtom) ? 1.0f : -1.0f;
float blockId = 3.0f;
} else {
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) y;
debugArray[index].w = blockId;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceSign*force[0] : 0.0f;
debugArray[index].y = mask ? forceSign*force[1] : 0.0f;
debugArray[index].z = mask ? forceSign*force[2] : 0.0f;
debugArray[index].w = blockId;
psA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
psA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
psA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexI][0] : 0.0f;
debugArray[index].y = mask ? torque[indexI][1] : 0.0f;
debugArray[index].z = mask ? torque[indexI][2] : 0.0f;
debugArray[index].w = energy;
psA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
psA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
psA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexJ][0] : 0.0f;
debugArray[index].y = mask ? torque[indexJ][1] : 0.0f;
debugArray[index].z = mask ? torque[indexJ][2] : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
}
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[DScaleIndex];
debugArray[index].y = dScaleVal;
debugArray[index].z = scalingFactors[PScaleIndex];
debugArray[index].w = pScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[MScaleIndex];
debugArray[index].y = mScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomI];
debugArray[index].y = labFrameDipole[3*atomI+1];
debugArray[index].z = labFrameDipole[3*atomI+2];
debugArray[index].w = 25.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomJ];
debugArray[index].y = labFrameDipole[3*atomJ+1];
debugArray[index].z = labFrameDipole[3*atomJ+2];
debugArray[index].w = 26.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = jDipole[0];
debugArray[index].y = jDipole[1];
debugArray[index].z = jDipole[2];
debugArray[index].w = 27.0f;
#endif
}
#endif
if (tgx == 0)
{
psA[jIdx].force[0] -= psA[threadIdx.x].tempForce[0] + psA[threadIdx.x+16].tempForce[0];
psA[jIdx].force[1] -= psA[threadIdx.x].tempForce[1] + psA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= psA[threadIdx.x].tempForce[2] + psA[threadIdx.x+16].tempForce[2];
tj = (tj + 1) & (GRID - 1);
psA[jIdx].torque[0] += psA[threadIdx.x].tempTorque[0] + psA[threadIdx.x+16].tempTorque[0];
psA[jIdx].torque[1] += psA[threadIdx.x].tempTorque[1] + psA[threadIdx.x+16].tempTorque[1];
psA[jIdx].torque[2] += psA[threadIdx.x].tempTorque[2] + psA[threadIdx.x+16].tempTorque[2];
}
}
tj = (tj + 1) & (GRID - 1);
} // end of j-loop
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
......@@ -596,7 +439,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
of += sA[threadIdx.x].torque[2];
outputTorque[offset+2] = of;
#else
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0];
......@@ -617,13 +460,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
outputTorque[offset+1] = sA[threadIdx.x].torque[1];
outputTorque[offset+2] = sA[threadIdx.x].torque[2];
#endif
#endif
lasty = y;
}
} // end of pInteractionFlag block
}
pos++;
}
//printf( "Hello thread: %d %d %d %d\n", blockIdx.x * blockDim.x + threadIdx.x, blockIdx.x, blockDim.x, threadIdx.x );
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}
......@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
......@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
// Reduce field
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
//const float term = 0.0f;
while (pos < fieldComponents)
{
......@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
#undef GK
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
#include "kCalculateAmoebaCudaFixedFieldParticle.h"
#undef INCLUDE_FIXED_FIELD_BUFFERS
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
atomI.tempBuffer[1] += atomJ.tempBuffer[1];
atomI.tempBuffer[2] += atomJ.tempBuffer[2];
atomI.tempBufferP[0] += atomJ.tempBufferP[0];
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float dscale, float pscale, float fields[4][3]
#ifdef AMOEBA_DEBUG
......@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr;
float r2 = xr*xr + yr*yr + zr*zr;
float r = sqrtf(r2);
// calculate the error function damping terms
......@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// increment the field at each site due to this interaction
if( r2 <= cAmoebaSim.cutoffDistance2 ){
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fim[0] - fid[0];
fields[0][1] = fim[1] - fid[1];
......@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
fields[2][2] = 0.0f;
fields[3][2] = 0.0f;
}
#ifdef AMOEBA_DEBUG
pullBack[0].x = xr;
pullBack[0].y = yr;
......@@ -399,6 +411,7 @@ static int isNanOrInfinity( double number ){
static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
{
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
......@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
// print intermediate results for the targetAtom
unsigned int targetAtom = 0;
int maxPrint = 3002;
amoebaGpu->psE_Field->Download();
(void) fprintf( amoebaGpu->log, "Recip EFields In\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// E_Field
int isNan = isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset] );
isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1] );
isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] %s\n",
amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2], (isNan ? "XXX" :"") );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "Recip EFields End\n" );
unsigned int targetAtom = 354;
#endif
kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
}
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
......@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
#endif
} else {
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
//amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
......@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReducePmeDirectE_Fields( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
(void) fflush( amoebaGpu->log );
......@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
*/
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
(void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
int maxPrint = 3002;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
......@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
debugArray->Download();
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
amoebaGpu->gpuContext->psPosq4->Download();
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
(void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
for( int kk = 0; kk < 10; kk++ ){
for( int kk = 0; kk < 6; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p\n",
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z)/5.50f);
(void) fprintf( amoebaGpu->log,"\n" );
}
}
......@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector);
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
}
delete debugArray;
}
#endif
if( 0 ){
if( 1 ){
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
......
......@@ -44,10 +44,8 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
){
#ifdef AMOEBA_DEBUG
int pullIndexMax = 12;
int maxPullIndex = 1;
float4 pullBack[12];
float dScaleVal;
float pScaleVal;
#endif
extern __shared__ FixedFieldParticle sA[];
......@@ -65,6 +63,10 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned int x;
unsigned int y;
bool bExclusionFlag;
float dScaleValue;
float pScaleValue;
int dScaleMask;
int2 pScaleMask;
// extract cell coordinates
......@@ -97,75 +99,31 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
if (!bExclusionFlag)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for (unsigned int j = 0; j < GRID; j++)
{
float ijField[4][3];
// load coords, charge, ...
#ifdef AMOEBA_DEBUG
dScaleVal = 1.0f;
pScaleVal = 1.0f;
#endif
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], 1.0f, 1.0f, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int match = (atomI == (y + j)) ? 1 : 0;
match = 1;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += match ? 0.0f : ijField[0][0];
fieldSum[1] += match ? 0.0f : ijField[0][1];
fieldSum[2] += match ? 0.0f : ijField[0][2];
fieldPolarSum[0] += match ? 0.0f : ijField[2][0];
fieldPolarSum[1] += match ? 0.0f : ijField[2][1];
fieldPolarSum[2] += match ? 0.0f : ijField[2][2];
}
}
else // bExclusion
{
if( bExclusionFlag ){
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
dScaleValue = pScaleValue = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++)
{
// load coords, charge, ...
float ijField[4][3];
float dScaleValue;
float pScaleValue;
if( bExclusionFlag ){
getMaskedDScaleFactor( j, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
}
#ifdef AMOEBA_DEBUG
dScaleVal = dScaleValue;
pScaleVal = pScaleValue;
#endif
float ijField[4][3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
......@@ -193,13 +151,6 @@ if( atomI == targetAtom ){
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int off = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
debugArray[index].x = (float) x;
debugArray[index].y = (float) tgx;
debugArray[index].z = -2;
debugArray[index].w = (float) off;
float flag = 7.0f;
for( int ii = 0; ii < 4; ii++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
......@@ -208,8 +159,7 @@ if( atomI == targetAtom ){
debugArray[index].z = match ? 0.0f : ijField[indices[ii]][2];
debugArray[index].w = flag;
}
for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
......@@ -218,28 +168,9 @@ if( atomI == targetAtom ){
}
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
}
}
// Write results
......@@ -253,12 +184,14 @@ if( atomI == targetAtom ){
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
#endif
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
} else {
unsigned int flags = cSim.pInteractionFlag[pos];
// flags = 0xFFFFFFFF;
if (flags == 0) {
// No interactions in this block.
} else {
if (lasty != y ) {
// load coordinates, charge, ...
......@@ -270,26 +203,32 @@ if( atomI == targetAtom ){
zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
if( bExclusionFlag ) {
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
dScaleValue = pScaleValue = 1.0f;
}
float ijField[4][3];
for (unsigned int j = 0; j < GRID; j++){
// load coords, charge, ...
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
if( bExclusionFlag ){
getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
}
#ifdef AMOEBA_DEBUG
dScaleVal = 1.0f;
pScaleVal = 1.0f;
#endif
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], 1.0f, 1.0f, ijField
float ijField[4][3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
......@@ -301,205 +240,104 @@ pScaleVal = 1.0f;
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[2][1];
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2][2];
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[tj].eField[0] += outOfBounds ? 0.0f : ijField[1][0];
psA[tj].eField[1] += outOfBounds ? 0.0f : ijField[1][1];
psA[tj].eField[2] += outOfBounds ? 0.0f : ijField[1][2];
psA[tj].eFieldP[0] += outOfBounds ? 0.0f : ijField[3][0];
psA[tj].eFieldP[1] += outOfBounds ? 0.0f : ijField[3][1];
psA[tj].eFieldP[2] += outOfBounds ? 0.0f : ijField[3][2];
#ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
unsigned int pullBackIndex = 0;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
if( flags == 0xFFFFFFFF ){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
float flag = 8.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[1][0];
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1][1];
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[1][2];
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[3][0];
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[3][1];
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[3][2];
#if 0
} else {
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleValue + 10.0f;
#endif
}
#endif
psA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[1][0];
psA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1][1];
psA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[1][2];
tj = (tj + 1) & (GRID - 1);
psA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
psA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
psA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
}
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
if (tgx == 0)
{
// load coords, charge, ...
float ijField[4][3];
float dScaleValue;
float pScaleValue;
getMaskedDScaleFactor( tj, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( tj, pScaleMask, &pScaleValue );
#ifdef AMOEBA_DEBUG
dScaleVal = dScaleValue;
pScaleVal = pScaleValue;
#endif
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0][0];
fieldSum[1] += outOfBounds ? 0.0f : ijField[0][1];
fieldSum[2] += outOfBounds ? 0.0f : ijField[0][2];
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[2][0];
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[2][1];
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2][2];
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[tj].eField[0] += outOfBounds ? 0.0f : ijField[1][0];
psA[tj].eField[1] += outOfBounds ? 0.0f : ijField[1][1];
psA[tj].eField[2] += outOfBounds ? 0.0f : ijField[1][2];
psA[tj].eFieldP[0] += outOfBounds ? 0.0f : ijField[3][0];
psA[tj].eFieldP[1] += outOfBounds ? 0.0f : ijField[3][1];
psA[tj].eFieldP[2] += outOfBounds ? 0.0f : ijField[3][2];
psA[jIdx].eField[0] += psA[threadIdx.x].tempBuffer[0] + psA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += psA[threadIdx.x].tempBuffer[1] + psA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += psA[threadIdx.x].tempBuffer[2] + psA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
}
}
#ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int index = (atomI == targetAtom) ? (y + jIdx) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
unsigned int pullBackIndex = 0;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
float flag = 9.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI+1][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI+1][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI+1][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ+1][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ+1][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ+1][2];
debugArray[index].w = flag;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
} // j-loop block
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
......@@ -520,8 +358,9 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#endif
} // end of pInteractionFlag block
lasty = y;
}
} // x == y block
pos++;
}
......
......@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
atomI.tempBuffer[1] += atomJ.tempBuffer[1];
atomI.tempBuffer[2] += atomJ.tempBuffer[2];
atomI.tempBufferP[0] += atomJ.tempBufferP[0];
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
......@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// increment the field at each site due to this interaction
if( r2 <= cAmoebaSim.cutoffDistance2 ){
if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fimd[0] - fid[0];
fields[1][0] = fkmd[0] - fkd[0];
......@@ -370,6 +384,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
......@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
}
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
//gpu->sim.pInteractingWorkUnit,
//amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
......@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock,
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock,
sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
......@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
}
}
(void) fflush( amoebaGpu->log );
if( 1 ){
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
}
#endif
iteration++;
......@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
if( 0 ){
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
......
......@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = cAmoebaSim.cutoffDistance2;
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 6.0f;
......@@ -209,10 +209,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#endif
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
} else {
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// No interactions in this block.
} else {
if (lasty != y)
{
unsigned int atomJ = y + tgx;
......@@ -229,17 +232,18 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
for (unsigned int j = 0; j < GRID; j++)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float ijField[4][3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], uscale, ijField
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
......@@ -255,26 +259,64 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// add to field at atomJ the field due atomI's dipole
psA[tj].field[0] += mask ? ijField[1][0] : 0.0f;
psA[tj].field[1] += mask ? ijField[1][1] : 0.0f;
psA[tj].field[2] += mask ? ijField[1][2] : 0.0f;
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[1][0] : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1][1] : 0.0f;
psA[jIdx].field[2] += mask ? ijField[1][2] : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[tj].fieldPolar[0] += mask ? ijField[3][0] : 0.0f;
psA[tj].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[tj].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
psA[jIdx].fieldPolar[0] += mask ? ijField[3][0] : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
} else {
psA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[1][0];
psA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1][1];
psA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[1][2];
psA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
psA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
psA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].field[0] += psA[threadIdx.x].tempBuffer[0] + psA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += psA[threadIdx.x].tempBuffer[1] + psA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += psA[threadIdx.x].tempBuffer[2] + psA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
}
}
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+tj) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+tj) : atomI;
if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = cAmoebaSim.cutoffDistance2;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 7.0f;
......@@ -315,31 +357,12 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
tj = (tj + 1) & (GRID - 1);
}
} // end of j-loop
// Write results
......@@ -364,8 +387,10 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
#endif
lasty = y;
}
} // end of pInteractionFlag block
} // end of x == y block
pos++;
}
}
......@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
(void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log );
amoebaGpu->psForce->Download();
psCudaForce4->Download();
amoebaGpu->torqueMapForce->Download();
amoebaGpu->psTorque->Download();
int maxPrint = 10;
(void) fprintf( amoebaGpu->log,"Post torqueMap\n" );
......@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"fT[%16.9e %16.9e %16.9e] ",
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
......@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0],
psCudaForce4->_pDevStream[0] );
LAUNCHERROR("amoebaMapTorqueReduce_kernel2");
LAUNCHERROR("amoebaMapTorqueReduce_kernel3");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
......
......@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
}
#undef USE_PERIODIC
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kFindInteractingBlocks.h"
#undef USE_PERIODIC
#undef METHOD_NAME
void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood )
{
std::string methodName = "kCalculateAmoebaMultipoleForces";
......@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaFixedEField( amoebaGpu );
cudaComputeAmoebaMutualInducedField( amoebaGpu );
} else {
gpuContext gpu = amoebaGpu->gpuContext;
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
if( 0 ){
gpu->psInteractionCount->Download();
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, "Cell %8u %8u [%5u %5u %1u] ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions );
x = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " %8u [%5u %5u %1u] %10u\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
}
} else {
}
cudaComputeAmoebaPmeFixedEField( amoebaGpu );
cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
}
......
......@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
MapStringDouble tinkerEnergies;
MapStringVectorOfVectors supplementary;
MapStringIntI isPresent = forceMap.find( AMOEBA_GK_FORCE );
bool gkIsActive;
if( isPresent != forceMap.end() && isPresent->second != 0 ){
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment