Commit a9054686 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Mods for direct PME

parent 01260070
...@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor ...@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
log = NULL; log = NULL;
contextImpl = NULL; contextImpl = NULL;
gpuInitialized = false; gpuInitialized = false;
applyCutoff = 0;
multipoleForceCount = 0;
} }
AmoebaCudaData::~AmoebaCudaData() { AmoebaCudaData::~AmoebaCudaData() {
...@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) { ...@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) {
return; return;
} }
void AmoebaCudaData::incrementMultipoleForceCount( void ) {
multipoleForceCount++;
}
int AmoebaCudaData::getMultipoleForceCount( void ) const {
return multipoleForceCount;
}
void AmoebaCudaData::setApplyCutoff( int inputApplyCutoff ) {
applyCutoff = inputApplyCutoff;
}
int AmoebaCudaData::getApplyCutoff( void ) const {
return applyCutoff;
}
} }
...@@ -139,11 +139,41 @@ public: ...@@ -139,11 +139,41 @@ public:
*/ */
void setContextImpl( void* contextImpl ); void setContextImpl( void* contextImpl );
/**
* Get multipole force count
*
* @return multipole force count
*/
int getMultipoleForceCount( void ) const;
/**
* Get multipole force count
*
* @return multipole force count
*/
void incrementMultipoleForceCount( void );
/**
* Get multipole force count
*
* @return multipole force count
*/
int getApplyCutoff( ) const;
/**
* Get multipole force count
*
* @return multipole force count
*/
void setApplyCutoff( int applyCutoff );
private: private:
CudaPlatform::PlatformData& cudaPlatformData; CudaPlatform::PlatformData& cudaPlatformData;
amoebaGpuContext amoebaGpu; amoebaGpuContext amoebaGpu;
bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole; bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole;
int multipoleForceCount;
int applyCutoff;
KernelImpl* localForceKernel; KernelImpl* localForceKernel;
unsigned int kernelCount; unsigned int kernelCount;
void* contextImpl; void* contextImpl;
......
...@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo ...@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) { static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
amoebaGpuContext gpu = data.getAmoebaGpu(); amoebaGpuContext gpu = data.getAmoebaGpu();
if( data.getMultipoleForceCount() == 0 ){
gpuCopyInteractingWorkUnit( gpu );
}
if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
gpuReorderAtoms(gpu->gpuContext);
}
data.incrementMultipoleForceCount();
data.initializeGpu(); data.initializeGpu();
if( 0 && data.getLog() ){ if( 0 && data.getLog() ){
...@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
zsize = pmeGridDimension[2]; zsize = pmeGridDimension[2];
} }
gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize); gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
data.setApplyCutoff( 1 );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
gpuContext gpu = amoebaGpu->gpuContext;
gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
gpu->sim.nonbondedMethod = PARTICLE_MESH_EWALD;
} }
data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force)); data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
} }
......
...@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " sqrtPi %15.7e\n", amoebaGpu->amoebaSim.sqrtPi ); (void) fprintf( log, " sqrtPi %15.7e\n", amoebaGpu->amoebaSim.sqrtPi );
(void) fprintf( log, " alpha Ewald %15.7e\n", gpu->sim.alphaEwald ); (void) fprintf( log, " alpha Ewald %15.7e\n", gpu->sim.alphaEwald );
(void) fprintf( log, " PME grid dimensions %6d %6d %6d\n", gpu->sim.pmeGridSize.x, gpu->sim.pmeGridSize.y, gpu->sim.pmeGridSize.z); (void) fprintf( log, " PME grid dimensions %6d %6d %6d\n", gpu->sim.pmeGridSize.x, gpu->sim.pmeGridSize.y, gpu->sim.pmeGridSize.z);
(void) fprintf( log, " cutoffDistance2 %15.7e\n", amoebaGpu->amoebaSim.cutoffDistance2 ); (void) fprintf( log, " nonbondedCutoffSqr %15.7e\n", gpu->sim.nonbondedCutoffSqr);
(void) fprintf( log, " electric %15.7e\n", amoebaGpu->amoebaSim.electric ); (void) fprintf( log, " electric %15.7e\n", amoebaGpu->amoebaSim.electric );
(void) fprintf( log, " box %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ); (void) fprintf( log, " box %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
(void) fprintf( log, " gkc %15.7e\n", amoebaGpu->amoebaSim.gkc ); (void) fprintf( log, " gkc %15.7e\n", amoebaGpu->amoebaSim.gkc );
...@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD ); AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
amoebaGpu->amoebaSim.cutoffDistance2 = cutoffDistance*cutoffDistance;
amoebaGpu->amoebaSim.sqrtPi = std::sqrt( 3.14159265358f ); amoebaGpu->amoebaSim.sqrtPi = std::sqrt( 3.14159265358f );
amoebaGpu->amoebaSim.electric = electricConstant; amoebaGpu->amoebaSim.electric = electricConstant;
amoebaGpu->gpuContext->sim.alphaEwald = alphaEwald; amoebaGpu->gpuContext->sim.alphaEwald = alphaEwald;
...@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){ ...@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
} }
} }
/**---------------------------------------------------------------------------------------
Track iterations for MI dipoles
@param amoebaGpu amoebaGpuContext reference
@param iteration MI iteration
--------------------------------------------------------------------------------------- */
void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
gpu->psInteractingWorkUnit->Download();
gpu->psWorkUnit->Download();
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
gpu->psWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
}
gpu->psInteractingWorkUnit->Upload();
gpu->psWorkUnit->Upload();
// ---------------------------------------------------------------------------------------
}
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
...@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation { ...@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation {
unsigned int numberOfAtoms; // number of atoms unsigned int numberOfAtoms; // number of atoms
unsigned int paddedNumberOfAtoms; // padded number of atoms unsigned int paddedNumberOfAtoms; // padded number of atoms
float cutoffDistance2; // cutoff distance squared for PME //float cutoffDistance2; // cutoff distance squared for PME
float sqrtPi; // sqrt(PI) float sqrtPi; // sqrt(PI)
float scalingDistanceCutoff; // scaling cutoff float scalingDistanceCutoff; // scaling cutoff
float2* pDampingFactorAndThole; // Thole & damping factors float2* pDampingFactorAndThole; // Thole & damping factors
......
...@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu); ...@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu);
extern "C" extern "C"
void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu); void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
extern "C"
void gpuCopyInteractingWorkUnit(amoebaGpuContext gpu);
/* /*
extern "C" extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4, void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
......
...@@ -44,6 +44,11 @@ struct FixedFieldParticle { ...@@ -44,6 +44,11 @@ struct FixedFieldParticle {
float gkField[3]; float gkField[3];
#endif #endif
#ifdef INCLUDE_FIXED_FIELD_BUFFERS
float tempBuffer[3];
float tempBufferP[3];
#endif
}; };
__device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI __device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI
......
...@@ -24,6 +24,11 @@ struct MutualInducedParticle { ...@@ -24,6 +24,11 @@ struct MutualInducedParticle {
float fieldS[3]; float fieldS[3];
float fieldPolarS[3]; float fieldPolarS[3];
#endif #endif
#ifdef INCLUDE_MI_FIELD_BUFFERS
float tempBuffer[3];
float tempBufferP[3];
#endif
}; };
__device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI ) __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI )
......
...@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel() ...@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2]; multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5]; multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phi = &cAmoebaSim.pPhi[20*i]; float* phi = &cAmoebaSim.pPhi[20*i];
cAmoebaSim.pTorque[3*i] = -cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3] cAmoebaSim.pTorque[3*i] = cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phi[9] + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phi[9]
+ multipole[8]*yscale*yscale*phi[7] + multipole[9]*xscale*yscale*phi[5] + multipole[8]*yscale*yscale*phi[7] + multipole[9]*xscale*yscale*phi[5]
- multipole[7]*yscale*zscale*phi[8] - multipole[9]*xscale*zscale*phi[6]); - multipole[7]*yscale*zscale*phi[8] - multipole[9]*xscale*zscale*phi[6]);
cAmoebaSim.pTorque[3*i+1] = -cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1] cAmoebaSim.pTorque[3*i+1] = cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phi[8] + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phi[8]
+ multipole[7]*zscale*zscale*phi[9] + multipole[8]*xscale*zscale*phi[6] + multipole[7]*zscale*zscale*phi[9] + multipole[8]*xscale*zscale*phi[6]
- multipole[8]*xscale*xscale*phi[4] - multipole[9]*yscale*yscale*phi[7]); - multipole[8]*xscale*xscale*phi[4] - multipole[9]*yscale*yscale*phi[7]);
cAmoebaSim.pTorque[3*i+2] = -cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2] cAmoebaSim.pTorque[3*i+2] = cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phi[7] + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phi[7]
+ multipole[7]*xscale*xscale*phi[4] + multipole[9]*yscale*zscale*phi[8] + multipole[7]*xscale*xscale*phi[4] + multipole[9]*yscale*zscale*phi[8]
- multipole[7]*xscale*yscale*phi[5] - multipole[8]*zscale*zscale*phi[9]); - multipole[7]*xscale*yscale*phi[5] - multipole[8]*zscale*zscale*phi[9]);
...@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel() ...@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
f.y *= cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY; f.y *= cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ; f.z *= cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
float4 force = cSim.pForce4[i]; float4 force = cSim.pForce4[i];
force.x += f.x; force.x -= f.x;
force.y += f.y; force.y -= f.y;
force.z += f.z; force.z -= f.z;
cSim.pForce4[i] = force; cSim.pForce4[i] = force;
...@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel() ...@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2]; multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5]; multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phidp = &cAmoebaSim.pPhidp[20*i]; float* phidp = &cAmoebaSim.pPhidp[20*i];
cAmoebaSim.pTorque[3*i] = -0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3] cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9] + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
+ multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5] + multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
- multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]); - multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
cAmoebaSim.pTorque[3*i+1] = -0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1] cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8] + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
+ multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6] + multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
- multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]); - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
cAmoebaSim.pTorque[3*i+2] = -0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2] cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7] + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
+ multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8] + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
- multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]); - multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
...@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel() ...@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY; f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ; f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
float4 force = cSim.pForce4[i]; float4 force = cSim.pForce4[i];
force.x += f.x; force.x -= f.x;
force.y += f.y; force.y -= f.y;
force.z += f.z; force.z -= f.z;
cSim.pForce4[i] = force; cSim.pForce4[i] = force;
} }
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*cAmoebaSim.electric*energy; cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*cAmoebaSim.electric*energy;
......
...@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle { ...@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle {
float torque[3]; float torque[3];
float padding; float padding;
float tempForce[3];
float tempTorque[3];
}; };
__device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ ){
atomI.tempForce[0] += atomJ.tempForce[0];
atomI.tempForce[1] += atomJ.tempForce[1];
atomI.tempForce[2] += atomJ.tempForce[2];
atomI.tempTorque[0] += atomJ.tempTorque[0];
atomI.tempTorque[1] += atomJ.tempTorque[1];
atomI.tempTorque[2] += atomJ.tempTorque[2];
}
/* /*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ, __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
...@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir ...@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
float uiy = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]); float uiy = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
float uiz = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]); float uiz = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
atomI.torque[0] -= term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy); atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
atomI.torque[1] -= term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz); atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
atomI.torque[2] -= term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix); atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
} }
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ, __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
...@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float gfr[8],gfri[7]; float gfr[8],gfri[7];
float gti[7],gtri[7]; float gti[7],gtri[7];
float conversionFactor = (cAmoebaSim.electric/cAmoebaSim.dielec); float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values; // set the permanent multipole and induced dipole values;
...@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr; float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cAmoebaSim.cutoffDistance2 ){ if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrt(r2); float r = sqrt(r2);
float ck = atomJ.q; float ck = atomJ.q;
...@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
e = e - (1.0f-scalingFactors[MScaleIndex])*erl; e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
ei = ei - erli; ei = ei - erli;
*energy = conversionFactor*(e + ei); *energy = -conversionFactor*(e + ei);
// increment the total intramolecular energy; assumes; // increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell; // intramolecular distances are less than half of cell;
...@@ -1154,22 +1166,34 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1154,22 +1166,34 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block // on first pass, set threads/block
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
if (gpu->sm_version >= SM_20) if (gpu->sm_version >= SM_20)
maxThreads = 384; maxThreads = 384;
else if (gpu->sm_version >= SM_12) else if (gpu->sm_version >= SM_12)
maxThreads = 128; maxThreads = 128;
else else
maxThreads = 64; maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)), maxThreads);
} }
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
(sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle)+sizeof(float3), (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits,
gpu->sim.nonbond_threads_per_block );
(void) fflush( amoebaGpu->log );
#endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else { } else {
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>( // gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkUnit->_pDevStream[0], // amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400; int maxPrint = 5;
float conversion = 1.0f/41.84f; float conversion = 1.0f/41.84f;
float forceSum[3] = { 0.0f, 0.0f, 0.0f}; float forceSum[3] = { 0.0f, 0.0f, 0.0f};
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
...@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} }
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
if( 1 ){ if( 0 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" ); (void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){ for( int jj = 0; jj < gpu->natoms; jj++ ){
......
...@@ -65,6 +65,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -65,6 +65,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
unsigned int x; unsigned int x;
unsigned int y; unsigned int y;
bool bExclusionFlag; bool bExclusionFlag;
int dScaleMask;
int2 pScaleMask;
int2 mScaleMask;
// Extract cell coordinates // Extract cell coordinates
...@@ -99,90 +102,60 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -99,90 +102,60 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI ); loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI );
if (!bExclusionFlag) if (bExclusionFlag)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f;
}
}
else // bExclusion
{ {
unsigned int xi = x >> GRIDBITS; unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
float force[3]; float force[3];
float torque[2][3]; float torque[2][3];
unsigned int atomJ = y + j; unsigned int atomJ = y + j;
// set scale factors // set scale factors
if (bExclusionFlag)
{
getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex ); getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex ); getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex ); getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
}
// force // force
float energy; float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j], calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j],
scalingFactors, force, torque, &energy scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
); );
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag // by setting match flag
unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole // add to field at atomI the field due atomJ's charge/dipole/quadrupole
localParticle.force[0] += mask ? force[0] : 0.0f; localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f; localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f; localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f; localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f; totalEnergy += mask ? 0.5*energy : 0.0f;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
/* /*
...@@ -229,8 +202,7 @@ if( atomI == targetAtom ){ ...@@ -229,8 +202,7 @@ if( atomI == targetAtom ){
} }
#endif #endif
} } // end of j-loop
}
// include self energy and self torque // include self energy and self torque
...@@ -281,165 +253,79 @@ if( atomI == targetAtom ){ ...@@ -281,165 +253,79 @@ if( atomI == targetAtom ){
outputTorque[offset+2] = localParticle.torque[2]; outputTorque[offset+2] = localParticle.torque[2];
#endif #endif
} } else {
else
{
if (lasty != y)
{
// load shared data
loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) ); unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
} // No interactions in this block.
} else {
sA[threadIdx.x].force[0] = 0.0f; if (lasty != y) {
sA[threadIdx.x].force[1] = 0.0f;
sA[threadIdx.x].force[2] = 0.0f; // load shared data
sA[threadIdx.x].torque[0] = 0.0f; loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
sA[threadIdx.x].torque[1] = 0.0f;
sA[threadIdx.x].torque[2] = 0.0f; }
if (!bExclusionFlag) sA[threadIdx.x].force[0] = 0.0f;
{ sA[threadIdx.x].force[1] = 0.0f;
sA[threadIdx.x].force[2] = 0.0f;
sA[threadIdx.x].torque[0] = 0.0f;
sA[threadIdx.x].torque[1] = 0.0f;
sA[threadIdx.x].torque[2] = 0.0f;
if( bExclusionFlag )
{
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
float force[3]; float force[3];
float torque[2][3]; float torque[2][3];
unsigned int atomJ = y + tj;
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[tj],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
// add force and torque to atom J due atom I // set scale factors
psA[tj].force[0] -= mask ? force[0] : 0.0f; if( bExclusionFlag )
psA[tj].force[1] -= mask ? force[1] : 0.0f; {
psA[tj].force[2] -= mask ? force[2] : 0.0f; getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
psA[tj].torque[0] += mask ? torque[1][0] : 0.0f; getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
psA[tj].torque[1] += mask ? torque[1][1] : 0.0f; }
psA[tj].torque[2] += mask ? torque[1][2] : 0.0f;
// force
#ifdef AMOEBA_DEBUG
/*
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
float forceSign = (atomI == targetAtom) ? 1.0f : -1.0f;
float blockId = 2.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) y;
debugArray[index].w = blockId;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceSign*force[0] : 0.0f;
debugArray[index].y = mask ? forceSign*force[1] : 0.0f;
debugArray[index].z = mask ? forceSign*force[2] : 0.0f;
debugArray[index].w = blockId;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexI][0] : 0.0f;
debugArray[index].y = mask ? torque[indexI][1] : 0.0f;
debugArray[index].z = mask ? torque[indexI][2] : 0.0f;
debugArray[index].w = mask ? energy : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexJ][0] : 0.0f;
debugArray[index].y = mask ? torque[indexJ][1] : 0.0f;
debugArray[index].z = mask ? torque[indexJ][2] : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
unsigned int atomJ = y + tj;
// set scale factors
getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
// force
float energy; float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[tj], calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx],
scalingFactors, force, torque, &energy scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
); );
// check if atoms out-of-bounds // check if atoms out-of-bounds
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add force and torque to atom I due atom J // add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f; localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f; localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f; localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f; localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
...@@ -448,182 +334,138 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -448,182 +334,138 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// add force and torque to atom J due atom I // add force and torque to atom J due atom I
psA[tj].force[0] -= mask ? force[0] : 0.0f; if( flags == 0xFFFFFFFF ){
psA[tj].force[1] -= mask ? force[1] : 0.0f;
psA[tj].force[2] -= mask ? force[2] : 0.0f;
psA[tj].torque[0] += mask ? torque[1][0] : 0.0f;
psA[tj].torque[1] += mask ? torque[1][1] : 0.0f;
psA[tj].torque[2] += mask ? torque[1][2] : 0.0f;
#ifdef AMOEBA_DEBUG psA[jIdx].force[0] -= mask ? force[0] : 0.0f;
/* psA[jIdx].force[1] -= mask ? force[1] : 0.0f;
energy = mask ? 0.5*energy : 0.0f; psA[jIdx].force[2] -= mask ? force[2] : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack ); psA[jIdx].torque[0] += mask ? torque[1][0] : 0.0f;
} */ psA[jIdx].torque[1] += mask ? torque[1][1] : 0.0f;
if( atomI == targetAtom || atomJ == targetAtom ){ psA[jIdx].torque[2] += mask ? torque[1][2] : 0.0f;
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1; } else {
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
float forceSign = (atomI == targetAtom) ? 1.0f : -1.0f; psA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0];
float blockId = 3.0f; psA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1];
psA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2];
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ; psA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1][0];
debugArray[index].z = (float) y; psA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1][1];
debugArray[index].w = blockId; psA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1][2];
index += cAmoebaSim.paddedNumberOfAtoms; if( tgx % 2 == 0 ){
debugArray[index].x = mask ? forceSign*force[0] : 0.0f; sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
debugArray[index].y = mask ? forceSign*force[1] : 0.0f; }
debugArray[index].z = mask ? forceSign*force[2] : 0.0f; if( tgx % 4 == 0 ){
debugArray[index].w = blockId; sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
}
index += cAmoebaSim.paddedNumberOfAtoms; if( tgx % 8 == 0 ){
debugArray[index].x = mask ? torque[indexI][0] : 0.0f; sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
debugArray[index].y = mask ? torque[indexI][1] : 0.0f; }
debugArray[index].z = mask ? torque[indexI][2] : 0.0f; if( tgx % 16 == 0 ){
debugArray[index].w = energy; sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
}
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexJ][0] : 0.0f; if (tgx == 0)
debugArray[index].y = mask ? torque[indexJ][1] : 0.0f; {
debugArray[index].z = mask ? torque[indexJ][2] : 0.0f; psA[jIdx].force[0] -= psA[threadIdx.x].tempForce[0] + psA[threadIdx.x+16].tempForce[0];
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x); psA[jIdx].force[1] -= psA[threadIdx.x].tempForce[1] + psA[threadIdx.x+16].tempForce[1];
psA[jIdx].force[2] -= psA[threadIdx.x].tempForce[2] + psA[threadIdx.x+16].tempForce[2];
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms; psA[jIdx].torque[0] += psA[threadIdx.x].tempTorque[0] + psA[threadIdx.x+16].tempTorque[0];
debugArray[index].x = pullBack[pullIndex].x; psA[jIdx].torque[1] += psA[threadIdx.x].tempTorque[1] + psA[threadIdx.x+16].tempTorque[1];
debugArray[index].y = pullBack[pullIndex].y; psA[jIdx].torque[2] += psA[threadIdx.x].tempTorque[2] + psA[threadIdx.x+16].tempTorque[2];
debugArray[index].z = pullBack[pullIndex].z; }
debugArray[index].w = pullBack[pullIndex].w; }
}
tj = (tj + 1) & (GRID - 1);
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[DScaleIndex];
debugArray[index].y = dScaleVal;
debugArray[index].z = scalingFactors[PScaleIndex];
debugArray[index].w = pScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[MScaleIndex];
debugArray[index].y = mScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomI];
debugArray[index].y = labFrameDipole[3*atomI+1];
debugArray[index].z = labFrameDipole[3*atomI+2];
debugArray[index].w = 25.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomJ];
debugArray[index].y = labFrameDipole[3*atomJ+1];
debugArray[index].z = labFrameDipole[3*atomJ+2];
debugArray[index].w = 26.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = jDipole[0];
debugArray[index].y = jDipole[1];
debugArray[index].z = jDipole[2];
debugArray[index].w = 27.0f;
#endif
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += localParticle.force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += localParticle.force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += localParticle.force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += localParticle.torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += localParticle.torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += localParticle.torque[2];
outputTorque[offset+2] = of;
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += sA[threadIdx.x].force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += sA[threadIdx.x].force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += sA[threadIdx.x].force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += sA[threadIdx.x].torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += sA[threadIdx.x].torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += sA[threadIdx.x].torque[2];
outputTorque[offset+2] = of;
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0];
outputForce[offset+1] = localParticle.force[1];
outputForce[offset+2] = localParticle.force[2];
outputTorque[offset] = localParticle.torque[0];
outputTorque[offset+1] = localParticle.torque[1];
outputTorque[offset+2] = localParticle.torque[2];
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = sA[threadIdx.x].force[0];
outputForce[offset+1] = sA[threadIdx.x].force[1];
outputForce[offset+2] = sA[threadIdx.x].force[2];
outputTorque[offset] = sA[threadIdx.x].torque[0]; } // end of j-loop
outputTorque[offset+1] = sA[threadIdx.x].torque[1];
outputTorque[offset+2] = sA[threadIdx.x].torque[2];
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += localParticle.force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += localParticle.force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += localParticle.force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += localParticle.torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += localParticle.torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += localParticle.torque[2];
outputTorque[offset+2] = of;
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += sA[threadIdx.x].force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += sA[threadIdx.x].force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += sA[threadIdx.x].force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += sA[threadIdx.x].torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += sA[threadIdx.x].torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += sA[threadIdx.x].torque[2];
outputTorque[offset+2] = of;
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0];
outputForce[offset+1] = localParticle.force[1];
outputForce[offset+2] = localParticle.force[2];
outputTorque[offset] = localParticle.torque[0];
outputTorque[offset+1] = localParticle.torque[1];
outputTorque[offset+2] = localParticle.torque[2];
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = sA[threadIdx.x].force[0];
outputForce[offset+1] = sA[threadIdx.x].force[1];
outputForce[offset+2] = sA[threadIdx.x].force[2];
outputTorque[offset] = sA[threadIdx.x].torque[0];
outputTorque[offset+1] = sA[threadIdx.x].torque[1];
outputTorque[offset+2] = sA[threadIdx.x].torque[2];
#endif
lasty = y;
#endif } // end of pInteractionFlag block
lasty = y;
} }
pos++; pos++;
} }
//printf( "Hello thread: %d %d %d %d\n", blockIdx.x * blockDim.x + threadIdx.x, blockIdx.x, blockDim.x, threadIdx.x );
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy; cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
} }
...@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned ...@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned
} }
} }
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
...@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int ...@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
// Reduce field // Reduce field
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
//const float term = 0.0f;
while (pos < fieldComponents) while (pos < fieldComponents)
{ {
...@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu ) ...@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
#undef GK #undef GK
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
#include "kCalculateAmoebaCudaFixedFieldParticle.h" #include "kCalculateAmoebaCudaFixedFieldParticle.h"
#undef INCLUDE_FIXED_FIELD_BUFFERS
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
atomI.tempBuffer[1] += atomJ.tempBuffer[1];
atomI.tempBuffer[2] += atomJ.tempBuffer[2];
atomI.tempBufferP[0] += atomJ.tempBufferP[0];
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ, __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float dscale, float pscale, float fields[4][3] float dscale, float pscale, float fields[4][3]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr; float r2 = xr*xr + yr*yr + zr*zr;
float r = sqrtf(r2); float r = sqrtf(r2);
// calculate the error function damping terms // calculate the error function damping terms
...@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// increment the field at each site due to this interaction // increment the field at each site due to this interaction
if( r2 <= cAmoebaSim.cutoffDistance2 ){ if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fim[0] - fid[0]; fields[0][0] = fim[0] - fid[0];
fields[0][1] = fim[1] - fid[1]; fields[0][1] = fim[1] - fid[1];
...@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
fields[2][2] = 0.0f; fields[2][2] = 0.0f;
fields[3][2] = 0.0f; fields[3][2] = 0.0f;
} }
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
pullBack[0].x = xr; pullBack[0].x = xr;
pullBack[0].y = yr; pullBack[0].y = yr;
...@@ -399,7 +411,8 @@ static int isNanOrInfinity( double number ){ ...@@ -399,7 +411,8 @@ static int isNanOrInfinity( double number ){
static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
{ {
gpuContext gpu = amoebaGpu->gpuContext; static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
static const char* methodName = "computeCudaAmoebaPmeFixedEField"; static const char* methodName = "computeCudaAmoebaPmeFixedEField";
...@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
// print intermediate results for the targetAtom // print intermediate results for the targetAtom
unsigned int targetAtom = 0; unsigned int targetAtom = 354;
int maxPrint = 3002;
amoebaGpu->psE_Field->Download();
(void) fprintf( amoebaGpu->log, "Recip EFields In\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// E_Field
int isNan = isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset] );
isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1] );
isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] %s\n",
amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2], (isNan ? "XXX" :"") );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "Recip EFields End\n" );
#endif #endif
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
}
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
#endif #endif
} else { } else {
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( //amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevStream[0], kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
} }
LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel"); LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReducePmeDirectE_Fields( amoebaGpu ); kReducePmeDirectE_Fields( amoebaGpu );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
gpu->psInteractionCount->Download(); gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
*/ */
amoebaGpu->psE_Field->Download(); amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download(); amoebaGpu->psE_FieldPolar->Download();
(void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
int maxPrint = 3002;
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii); (void) fprintf( amoebaGpu->log, "%5d ", ii);
...@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
debugArray->Download(); debugArray->Download();
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
amoebaGpu->gpuContext->psPosq4->Download();
for( int jj = 0; jj < gpu->natoms; jj++ ){ for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj; int debugIndex = jj;
if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
(void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj ); (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
for( int kk = 0; kk < 10; kk++ ){ for( int kk = 0; kk < 6; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n", (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w ); debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms; debugIndex += paddedNumberOfAtoms;
} }
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p\n",
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z)/5.50f);
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
}
} }
...@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector);
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
} }
delete debugArray; delete debugArray;
} }
#endif #endif
if( 0 ){ if( 1 ){
std::vector<int> fileId; std::vector<int> fileId;
fileId.push_back( 0 ); fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
......
...@@ -44,10 +44,8 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)( ...@@ -44,10 +44,8 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
){ ){
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
int pullIndexMax = 12; int maxPullIndex = 1;
float4 pullBack[12]; float4 pullBack[12];
float dScaleVal;
float pScaleVal;
#endif #endif
extern __shared__ FixedFieldParticle sA[]; extern __shared__ FixedFieldParticle sA[];
...@@ -65,6 +63,10 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)( ...@@ -65,6 +63,10 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned int x; unsigned int x;
unsigned int y; unsigned int y;
bool bExclusionFlag; bool bExclusionFlag;
float dScaleValue;
float pScaleValue;
int dScaleMask;
int2 pScaleMask;
// extract cell coordinates // extract cell coordinates
...@@ -97,148 +99,77 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)( ...@@ -97,148 +99,77 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
loadFixedFieldShared( &(sA[threadIdx.x]), atomI ); loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
if (!bExclusionFlag) if( bExclusionFlag ){
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for (unsigned int j = 0; j < GRID; j++)
{
float ijField[4][3];
// load coords, charge, ...
#ifdef AMOEBA_DEBUG
dScaleVal = 1.0f;
pScaleVal = 1.0f;
#endif
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], 1.0f, 1.0f, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int match = (atomI == (y + j)) ? 1 : 0;
match = 1;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += match ? 0.0f : ijField[0][0];
fieldSum[1] += match ? 0.0f : ijField[0][1];
fieldSum[2] += match ? 0.0f : ijField[0][2];
fieldPolarSum[0] += match ? 0.0f : ijField[2][0];
fieldPolarSum[1] += match ? 0.0f : ijField[2][1];
fieldPolarSum[2] += match ? 0.0f : ijField[2][2];
}
}
else // bExclusion
{
unsigned int xi = x >> GRIDBITS; unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
dScaleValue = pScaleValue = 1.0f;
for (unsigned int j = 0; j < GRID; j++) }
{
// load coords, charge, ...
float ijField[4][3]; for (unsigned int j = 0; j < GRID; j++)
float dScaleValue; {
float pScaleValue;
if( bExclusionFlag ){
getMaskedDScaleFactor( j, dScaleMask, &dScaleValue ); getMaskedDScaleFactor( j, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( j, pScaleMask, &pScaleValue ); getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
}
float ijField[4][3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
dScaleVal = dScaleValue; , pullBack
pScaleVal = pScaleValue;
#endif #endif
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField );
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution unsigned int match = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// by setting match flag
unsigned int match = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0; // add to field at atomI the field due atomJ's charge/dipole/quadrupole
// add to field at atomI the field due atomJ's charge/dipole/quadrupole fieldSum[0] += match ? 0.0f : ijField[0][0];
fieldSum[1] += match ? 0.0f : ijField[0][1];
fieldSum[2] += match ? 0.0f : ijField[0][2];
fieldSum[0] += match ? 0.0f : ijField[0][0]; fieldPolarSum[0] += match ? 0.0f : ijField[2][0];
fieldSum[1] += match ? 0.0f : ijField[0][1]; fieldPolarSum[1] += match ? 0.0f : ijField[2][1];
fieldSum[2] += match ? 0.0f : ijField[0][2]; fieldPolarSum[2] += match ? 0.0f : ijField[2][2];
fieldPolarSum[0] += match ? 0.0f : ijField[2][0];
fieldPolarSum[1] += match ? 0.0f : ijField[2][1];
fieldPolarSum[2] += match ? 0.0f : ijField[2][2];
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( atomI == targetAtom ){ if( atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI; unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int indexI = 0; unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2; unsigned int indexJ = indexI ? 0 : 2;
unsigned int indices[4] = { indexI, indexJ, indexI+1, indexJ+1 }; unsigned int indices[4] = { indexI, indexJ, indexI+1, indexJ+1 };
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j); debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleValue; debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue; debugArray[index].w = pScaleValue;
float flag = 7.0f;
for( int ii = 0; ii < 4; ii++ ){
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int off = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); debugArray[index].x = match ? 0.0f : ijField[indices[ii]][0];
debugArray[index].x = (float) x; debugArray[index].y = match ? 0.0f : ijField[indices[ii]][1];
debugArray[index].y = (float) tgx; debugArray[index].z = match ? 0.0f : ijField[indices[ii]][2];
debugArray[index].z = -2; debugArray[index].w = flag;
debugArray[index].w = (float) off; }
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
float flag = 7.0f; index += cAmoebaSim.paddedNumberOfAtoms;
for( int ii = 0; ii < 4; ii++ ){ debugArray[index].x = pullBack[pullIndex].x;
index += cAmoebaSim.paddedNumberOfAtoms; debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].x = match ? 0.0f : ijField[indices[ii]][0]; debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].y = match ? 0.0f : ijField[indices[ii]][1]; debugArray[index].w = pullBack[pullIndex].w;
debugArray[index].z = match ? 0.0f : ijField[indices[ii]][2]; }
debugArray[index].w = flag;
}
for( int pullIndex = 0; pullIndex < pullIndexMax; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
} }
#endif #endif
}
} }
// Write results // Write results
...@@ -253,170 +184,54 @@ if( atomI == targetAtom ){ ...@@ -253,170 +184,54 @@ if( atomI == targetAtom ){
load3dArray( offset, fieldPolarSum, outputEFieldPolar ); load3dArray( offset, fieldPolarSum, outputEFieldPolar );
#endif #endif
} } else {
else // 100% utilization
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
// load coordinates, charge, ...
loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
}
// zero shared fields
zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
float ijField[4][3]; unsigned int flags = cSim.pInteractionFlag[pos];
// flags = 0xFFFFFFFF;
if (flags == 0) {
// No interactions in this block.
} else {
if (lasty != y ) {
// load coords, charge, ... // load coordinates, charge, ...
#ifdef AMOEBA_DEBUG loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
dScaleVal = 1.0f;
pScaleVal = 1.0f;
#endif
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], 1.0f, 1.0f, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0; }
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0][0];
fieldSum[1] += outOfBounds ? 0.0f : ijField[0][1];
fieldSum[2] += outOfBounds ? 0.0f : ijField[0][2];
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[2][0];
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[2][1];
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2][2];
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[tj].eField[0] += outOfBounds ? 0.0f : ijField[1][0];
psA[tj].eField[1] += outOfBounds ? 0.0f : ijField[1][1];
psA[tj].eField[2] += outOfBounds ? 0.0f : ijField[1][2];
psA[tj].eFieldP[0] += outOfBounds ? 0.0f : ijField[3][0];
psA[tj].eFieldP[1] += outOfBounds ? 0.0f : ijField[3][1];
psA[tj].eFieldP[2] += outOfBounds ? 0.0f : ijField[3][2];
#ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
unsigned int pullBackIndex = 0;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
float flag = 8.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
#if 0 // zero shared fields
index += cAmoebaSim.paddedNumberOfAtoms; zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleValue + 10.0f;
#endif
}
#endif
tj = (tj + 1) & (GRID - 1);
if( bExclusionFlag ) {
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
dScaleValue = pScaleValue = 1.0f;
} }
}
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int xi = x >> GRIDBITS; for (unsigned int j = 0; j < GRID; j++){
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++) unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
{ if( bExclusionFlag ){
// load coords, charge, ... getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
}
float ijField[4][3]; float ijField[4][3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
float dScaleValue;
float pScaleValue;
getMaskedDScaleFactor( tj, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( tj, pScaleMask, &pScaleValue );
#ifdef AMOEBA_DEBUG
dScaleVal = dScaleValue;
pScaleVal = pScaleValue;
#endif
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[tj], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
); );
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0; unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole // add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0][0]; fieldSum[0] += outOfBounds ? 0.0f : ijField[0][0];
fieldSum[1] += outOfBounds ? 0.0f : ijField[0][1]; fieldSum[1] += outOfBounds ? 0.0f : ijField[0][1];
fieldSum[2] += outOfBounds ? 0.0f : ijField[0][2]; fieldSum[2] += outOfBounds ? 0.0f : ijField[0][2];
...@@ -424,104 +239,128 @@ pScaleVal = pScaleValue; ...@@ -424,104 +239,128 @@ pScaleVal = pScaleValue;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[2][0]; fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[2][0];
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[2][1]; fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[2][1];
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2][2]; fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2][2];
if( flags == 0xFFFFFFFF ){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole // add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[tj].eField[0] += outOfBounds ? 0.0f : ijField[1][0]; psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[1][0];
psA[tj].eField[1] += outOfBounds ? 0.0f : ijField[1][1]; psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1][1];
psA[tj].eField[2] += outOfBounds ? 0.0f : ijField[1][2]; psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[1][2];
psA[tj].eFieldP[0] += outOfBounds ? 0.0f : ijField[3][0]; psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[3][0];
psA[tj].eFieldP[1] += outOfBounds ? 0.0f : ijField[3][1]; psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[3][1];
psA[tj].eFieldP[2] += outOfBounds ? 0.0f : ijField[3][2]; psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[3][2];
} else {
psA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[1][0];
psA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1][1];
psA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[1][2];
psA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[3][0];
psA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[3][1];
psA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[3][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].eField[0] += psA[threadIdx.x].tempBuffer[0] + psA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += psA[threadIdx.x].tempBuffer[1] + psA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += psA[threadIdx.x].tempBuffer[2] + psA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
}
}
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + tj) == targetAtom) ){ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI; unsigned int index = (atomI == targetAtom) ? (y + jIdx) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2; unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int indexJ = (atomI == targetAtom) ? 2 : 0; unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj); debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = dScaleVal; debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleVal; debugArray[index].w = pScaleValue;
unsigned int pullBackIndex = 0;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;;
float flag = 9.0f; float flag = 9.0f;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0]; debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI][0];
debugArray[index].y = ijField[indexI][1]; debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI][1];
debugArray[index].z = ijField[indexI][2]; debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI][2];
debugArray[index].w = flag; debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0]; debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1]; debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2]; debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ][2];
debugArray[index].w = flag; debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0]; debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1]; debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2]; debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI+1][2];
debugArray[index].w = flag; debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0]; debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1]; debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2]; debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ+1][2];
debugArray[index].w = flag; debugArray[index].w = flag;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
} index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif #endif
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
}
}
// Write results
} // j-loop block
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField ); load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar ); load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField ); load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar ); load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#else #else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField ); load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar ); load3dArray( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].eField, outputEField ); load3dArray( offset, sA[threadIdx.x].eField, outputEField );
load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar ); load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#endif #endif
} // end of pInteractionFlag block
lasty = y; lasty = y;
} } // x == y block
pos++; pos++;
} }
......
...@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu) ...@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
//#define AMOEBA_DEBUG //#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h" #include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
atomI.tempBuffer[1] += atomJ.tempBuffer[1];
atomI.tempBuffer[2] += atomJ.tempBuffer[2];
atomI.tempBufferP[0] += atomJ.tempBufferP[0];
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
...@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// increment the field at each site due to this interaction // increment the field at each site due to this interaction
if( r2 <= cAmoebaSim.cutoffDistance2 ){ if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fimd[0] - fid[0]; fields[0][0] = fimd[0] - fid[0];
fields[1][0] = fkmd[0] - fkd[0]; fields[1][0] = fkmd[0] - fkd[0];
...@@ -370,7 +384,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -370,7 +384,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray ) CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{ {
gpuContext gpu = amoebaGpu->gpuContext; static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
int targetAtom = 0; int targetAtom = 0;
...@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
}
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( //gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkUnit->_pDevStream[0], //amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
} }
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
if( 1 ){
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
} }
#endif #endif
iteration++; iteration++;
...@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->mutualInducedDone = done; amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1; amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
if( 0 ){ if( 1 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
......
...@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j); debugArray[index].y = (float) (y + j);
debugArray[index].z = cAmoebaSim.cutoffDistance2; debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 6.0f; debugArray[index].w = 6.0f;
...@@ -209,72 +209,114 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -209,72 +209,114 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#endif #endif
} } else {
else // 100% utilization
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int atomJ = y + tgx;
// load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
}
// zero shared fields
zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) );
for (unsigned int j = 0; j < GRID; j++)
{
float ijField[4][3]; unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// load coords, charge, ... // No interactions in this block.
} else {
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], uscale, ijField if (lasty != y)
{
unsigned int atomJ = y + tgx;
// load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
}
// zero shared fields
zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) );
for (unsigned int j = 0; j < GRID; j++)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float ijField[4][3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
); );
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole // add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0][0] : 0.0f; fieldSum[0] += mask ? ijField[0][0] : 0.0f;
fieldSum[1] += mask ? ijField[0][1] : 0.0f; fieldSum[1] += mask ? ijField[0][1] : 0.0f;
fieldSum[2] += mask ? ijField[0][2] : 0.0f; fieldSum[2] += mask ? ijField[0][2] : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[2][0] : 0.0f;
fieldPolarSum[1] += mask ? ijField[2][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[2][2] : 0.0f;
// add to field at atomJ the field due atomI's dipole
if( flags == 0xFFFFFFFF ){
psA[jIdx].field[0] += mask ? ijField[1][0] : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1][1] : 0.0f;
psA[jIdx].field[2] += mask ? ijField[1][2] : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[jIdx].fieldPolar[0] += mask ? ijField[3][0] : 0.0f;
psA[jIdx].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[jIdx].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
} else {
psA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[1][0];
psA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1][1];
psA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[1][2];
psA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
psA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
psA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].field[0] += psA[threadIdx.x].tempBuffer[0] + psA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += psA[threadIdx.x].tempBuffer[1] + psA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += psA[threadIdx.x].tempBuffer[2] + psA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
}
}
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[2][0] : 0.0f;
fieldPolarSum[1] += mask ? ijField[2][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[2][2] : 0.0f;
// add to field at atomJ the field due atomI's dipole
psA[tj].field[0] += mask ? ijField[1][0] : 0.0f;
psA[tj].field[1] += mask ? ijField[1][1] : 0.0f;
psA[tj].field[2] += mask ? ijField[1][2] : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[tj].fieldPolar[0] += mask ? ijField[3][0] : 0.0f;
psA[tj].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[tj].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+tj) == targetAtom ){ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+tj) : atomI; unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
unsigned int pullBackIndex = 0; unsigned int pullBackIndex = 0;
unsigned int indexI = 0; unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2; unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj); debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = cAmoebaSim.cutoffDistance2; debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 7.0f; debugArray[index].w = 7.0f;
...@@ -315,57 +357,40 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){ ...@@ -315,57 +357,40 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
debugArray[index].y = ijField[indexJ+1][1]; debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2]; debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag; debugArray[index].w = flag;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
} }
#endif #endif
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} } // end of j-loop
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField ); load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar); load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field, outputField ); load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field, outputField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar); load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
#else #else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField ); load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar); load3dArray( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].field, outputField ); load3dArray( offset, sA[threadIdx.x].field, outputField );
load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar); load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
#endif #endif
lasty = y; lasty = y;
}
} // end of pInteractionFlag block
} // end of x == y block
pos++; pos++;
} }
} }
...@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu, ...@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
(void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log );
amoebaGpu->psForce->Download(); amoebaGpu->psForce->Download();
psCudaForce4->Download(); psCudaForce4->Download();
amoebaGpu->torqueMapForce->Download();
amoebaGpu->psTorque->Download(); amoebaGpu->psTorque->Download();
int maxPrint = 10; int maxPrint = 10;
(void) fprintf( amoebaGpu->log,"Post torqueMap\n" ); (void) fprintf( amoebaGpu->log,"Post torqueMap\n" );
...@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu, ...@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
amoebaGpu->psForce->_pSysStream[0][indexOffset], amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1], amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] ); amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"fT[%16.9e %16.9e %16.9e] ",
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n", (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psTorque->_pSysStream[0][indexOffset], amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1], amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
...@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu, ...@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
amoebaGpu->maxMapTorqueDifference, amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0], amoebaGpu->torqueMapForce->_pDevStream[0],
psCudaForce4->_pDevStream[0] ); psCudaForce4->_pDevStream[0] );
LAUNCHERROR("amoebaMapTorqueReduce_kernel2"); LAUNCHERROR("amoebaMapTorqueReduce_kernel3");
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
......
...@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
} }
#undef USE_PERIODIC
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kFindInteractingBlocks.h"
#undef USE_PERIODIC
#undef METHOD_NAME
void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood )
{ {
std::string methodName = "kCalculateAmoebaMultipoleForces"; std::string methodName = "kCalculateAmoebaMultipoleForces";
...@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaFixedEField( amoebaGpu ); cudaComputeAmoebaFixedEField( amoebaGpu );
cudaComputeAmoebaMutualInducedField( amoebaGpu ); cudaComputeAmoebaMutualInducedField( amoebaGpu );
} else { } else {
gpuContext gpu = amoebaGpu->gpuContext;
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
if( 0 ){
gpu->psInteractionCount->Download();
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, "Cell %8u %8u [%5u %5u %1u] ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions );
x = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " %8u [%5u %5u %1u] %10u\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
}
} else {
}
cudaComputeAmoebaPmeFixedEField( amoebaGpu ); cudaComputeAmoebaPmeFixedEField( amoebaGpu );
cudaComputeAmoebaPmeMutualInducedField( amoebaGpu ); cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
} }
......
...@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete ...@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
MapStringDouble tinkerEnergies; MapStringDouble tinkerEnergies;
MapStringVectorOfVectors supplementary; MapStringVectorOfVectors supplementary;
MapStringIntI isPresent = forceMap.find( AMOEBA_GK_FORCE ); MapStringIntI isPresent = forceMap.find( AMOEBA_GK_FORCE );
bool gkIsActive; bool gkIsActive;
if( isPresent != forceMap.end() && isPresent->second != 0 ){ if( isPresent != forceMap.end() && isPresent->second != 0 ){
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment