Commit a9054686 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Mods for direct PME

parent 01260070
...@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor ...@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
log = NULL; log = NULL;
contextImpl = NULL; contextImpl = NULL;
gpuInitialized = false; gpuInitialized = false;
applyCutoff = 0;
multipoleForceCount = 0;
} }
AmoebaCudaData::~AmoebaCudaData() { AmoebaCudaData::~AmoebaCudaData() {
...@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) { ...@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) {
return; return;
} }
void AmoebaCudaData::incrementMultipoleForceCount( void ) {
multipoleForceCount++;
}
int AmoebaCudaData::getMultipoleForceCount( void ) const {
return multipoleForceCount;
}
void AmoebaCudaData::setApplyCutoff( int inputApplyCutoff ) {
applyCutoff = inputApplyCutoff;
}
int AmoebaCudaData::getApplyCutoff( void ) const {
return applyCutoff;
}
} }
...@@ -139,11 +139,41 @@ public: ...@@ -139,11 +139,41 @@ public:
*/ */
void setContextImpl( void* contextImpl ); void setContextImpl( void* contextImpl );
/**
* Get multipole force count
*
* @return multipole force count
*/
int getMultipoleForceCount( void ) const;
/**
* Get multipole force count
*
* @return multipole force count
*/
void incrementMultipoleForceCount( void );
/**
* Get multipole force count
*
* @return multipole force count
*/
int getApplyCutoff( ) const;
/**
* Get multipole force count
*
* @return multipole force count
*/
void setApplyCutoff( int applyCutoff );
private: private:
CudaPlatform::PlatformData& cudaPlatformData; CudaPlatform::PlatformData& cudaPlatformData;
amoebaGpuContext amoebaGpu; amoebaGpuContext amoebaGpu;
bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole; bool hasAmoebaBonds, hasAmoebaGeneralizedKirkwood, hasAmoebaMultipole;
int multipoleForceCount;
int applyCutoff;
KernelImpl* localForceKernel; KernelImpl* localForceKernel;
unsigned int kernelCount; unsigned int kernelCount;
void* contextImpl; void* contextImpl;
......
...@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo ...@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) { static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
amoebaGpuContext gpu = data.getAmoebaGpu(); amoebaGpuContext gpu = data.getAmoebaGpu();
if( data.getMultipoleForceCount() == 0 ){
gpuCopyInteractingWorkUnit( gpu );
}
if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
gpuReorderAtoms(gpu->gpuContext);
}
data.incrementMultipoleForceCount();
data.initializeGpu(); data.initializeGpu();
if( 0 && data.getLog() ){ if( 0 && data.getLog() ){
...@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
zsize = pmeGridDimension[2]; zsize = pmeGridDimension[2];
} }
gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize); gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
data.setApplyCutoff( 1 );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
gpuContext gpu = amoebaGpu->gpuContext;
gpu->sim.nonbondedCutoffSqr = force.getCutoffDistance()*force.getCutoffDistance();
gpu->sim.nonbondedMethod = PARTICLE_MESH_EWALD;
} }
data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force)); data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
} }
......
...@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " sqrtPi %15.7e\n", amoebaGpu->amoebaSim.sqrtPi ); (void) fprintf( log, " sqrtPi %15.7e\n", amoebaGpu->amoebaSim.sqrtPi );
(void) fprintf( log, " alpha Ewald %15.7e\n", gpu->sim.alphaEwald ); (void) fprintf( log, " alpha Ewald %15.7e\n", gpu->sim.alphaEwald );
(void) fprintf( log, " PME grid dimensions %6d %6d %6d\n", gpu->sim.pmeGridSize.x, gpu->sim.pmeGridSize.y, gpu->sim.pmeGridSize.z); (void) fprintf( log, " PME grid dimensions %6d %6d %6d\n", gpu->sim.pmeGridSize.x, gpu->sim.pmeGridSize.y, gpu->sim.pmeGridSize.z);
(void) fprintf( log, " cutoffDistance2 %15.7e\n", amoebaGpu->amoebaSim.cutoffDistance2 ); (void) fprintf( log, " nonbondedCutoffSqr %15.7e\n", gpu->sim.nonbondedCutoffSqr);
(void) fprintf( log, " electric %15.7e\n", amoebaGpu->amoebaSim.electric ); (void) fprintf( log, " electric %15.7e\n", amoebaGpu->amoebaSim.electric );
(void) fprintf( log, " box %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ); (void) fprintf( log, " box %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
(void) fprintf( log, " gkc %15.7e\n", amoebaGpu->amoebaSim.gkc ); (void) fprintf( log, " gkc %15.7e\n", amoebaGpu->amoebaSim.gkc );
...@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD ); AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
amoebaGpu->amoebaSim.cutoffDistance2 = cutoffDistance*cutoffDistance;
amoebaGpu->amoebaSim.sqrtPi = std::sqrt( 3.14159265358f ); amoebaGpu->amoebaSim.sqrtPi = std::sqrt( 3.14159265358f );
amoebaGpu->amoebaSim.electric = electricConstant; amoebaGpu->amoebaSim.electric = electricConstant;
amoebaGpu->gpuContext->sim.alphaEwald = alphaEwald; amoebaGpu->gpuContext->sim.alphaEwald = alphaEwald;
...@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){ ...@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
} }
} }
/**---------------------------------------------------------------------------------------
Track iterations for MI dipoles
@param amoebaGpu amoebaGpuContext reference
@param iteration MI iteration
--------------------------------------------------------------------------------------- */
void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
gpu->psInteractingWorkUnit->Download();
gpu->psWorkUnit->Download();
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
gpu->psWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
}
gpu->psInteractingWorkUnit->Upload();
gpu->psWorkUnit->Upload();
// ---------------------------------------------------------------------------------------
}
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
...@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation { ...@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation {
unsigned int numberOfAtoms; // number of atoms unsigned int numberOfAtoms; // number of atoms
unsigned int paddedNumberOfAtoms; // padded number of atoms unsigned int paddedNumberOfAtoms; // padded number of atoms
float cutoffDistance2; // cutoff distance squared for PME //float cutoffDistance2; // cutoff distance squared for PME
float sqrtPi; // sqrt(PI) float sqrtPi; // sqrt(PI)
float scalingDistanceCutoff; // scaling cutoff float scalingDistanceCutoff; // scaling cutoff
float2* pDampingFactorAndThole; // Thole & damping factors float2* pDampingFactorAndThole; // Thole & damping factors
......
...@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu); ...@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu);
extern "C" extern "C"
void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu); void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
extern "C"
void gpuCopyInteractingWorkUnit(amoebaGpuContext gpu);
/* /*
extern "C" extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4, void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
......
...@@ -44,6 +44,11 @@ struct FixedFieldParticle { ...@@ -44,6 +44,11 @@ struct FixedFieldParticle {
float gkField[3]; float gkField[3];
#endif #endif
#ifdef INCLUDE_FIXED_FIELD_BUFFERS
float tempBuffer[3];
float tempBufferP[3];
#endif
}; };
__device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI __device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI
......
...@@ -24,6 +24,11 @@ struct MutualInducedParticle { ...@@ -24,6 +24,11 @@ struct MutualInducedParticle {
float fieldS[3]; float fieldS[3];
float fieldPolarS[3]; float fieldPolarS[3];
#endif #endif
#ifdef INCLUDE_MI_FIELD_BUFFERS
float tempBuffer[3];
float tempBufferP[3];
#endif
}; };
__device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI ) __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI )
......
...@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel() ...@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2]; multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5]; multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phi = &cAmoebaSim.pPhi[20*i]; float* phi = &cAmoebaSim.pPhi[20*i];
cAmoebaSim.pTorque[3*i] = -cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3] cAmoebaSim.pTorque[3*i] = cAmoebaSim.electric*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phi[9] + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phi[9]
+ multipole[8]*yscale*yscale*phi[7] + multipole[9]*xscale*yscale*phi[5] + multipole[8]*yscale*yscale*phi[7] + multipole[9]*xscale*yscale*phi[5]
- multipole[7]*yscale*zscale*phi[8] - multipole[9]*xscale*zscale*phi[6]); - multipole[7]*yscale*zscale*phi[8] - multipole[9]*xscale*zscale*phi[6]);
cAmoebaSim.pTorque[3*i+1] = -cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1] cAmoebaSim.pTorque[3*i+1] = cAmoebaSim.electric*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phi[8] + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phi[8]
+ multipole[7]*zscale*zscale*phi[9] + multipole[8]*xscale*zscale*phi[6] + multipole[7]*zscale*zscale*phi[9] + multipole[8]*xscale*zscale*phi[6]
- multipole[8]*xscale*xscale*phi[4] - multipole[9]*yscale*yscale*phi[7]); - multipole[8]*xscale*xscale*phi[4] - multipole[9]*yscale*yscale*phi[7]);
cAmoebaSim.pTorque[3*i+2] = -cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2] cAmoebaSim.pTorque[3*i+2] = cAmoebaSim.electric*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phi[7] + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phi[7]
+ multipole[7]*xscale*xscale*phi[4] + multipole[9]*yscale*zscale*phi[8] + multipole[7]*xscale*xscale*phi[4] + multipole[9]*yscale*zscale*phi[8]
- multipole[7]*xscale*yscale*phi[5] - multipole[8]*zscale*zscale*phi[9]); - multipole[7]*xscale*yscale*phi[5] - multipole[8]*zscale*zscale*phi[9]);
...@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel() ...@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
f.y *= cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY; f.y *= cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ; f.z *= cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
float4 force = cSim.pForce4[i]; float4 force = cSim.pForce4[i];
force.x += f.x; force.x -= f.x;
force.y += f.y; force.y -= f.y;
force.z += f.z; force.z -= f.z;
cSim.pForce4[i] = force; cSim.pForce4[i] = force;
...@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel() ...@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2]; multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5]; multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phidp = &cAmoebaSim.pPhidp[20*i]; float* phidp = &cAmoebaSim.pPhidp[20*i];
cAmoebaSim.pTorque[3*i] = -0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3] cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9] + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
+ multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5] + multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
- multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]); - multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
cAmoebaSim.pTorque[3*i+1] = -0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1] cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8] + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
+ multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6] + multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
- multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]); - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
cAmoebaSim.pTorque[3*i+2] = -0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2] cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7] + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
+ multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8] + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
- multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]); - multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
...@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel() ...@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY; f.y *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ; f.z *= 0.5f*cAmoebaSim.electric*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
float4 force = cSim.pForce4[i]; float4 force = cSim.pForce4[i];
force.x += f.x; force.x -= f.x;
force.y += f.y; force.y -= f.y;
force.z += f.z; force.z -= f.z;
cSim.pForce4[i] = force; cSim.pForce4[i] = force;
} }
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*cAmoebaSim.electric*energy; cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*cAmoebaSim.electric*energy;
......
...@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle { ...@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle {
float torque[3]; float torque[3];
float padding; float padding;
float tempForce[3];
float tempTorque[3];
}; };
__device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ ){
atomI.tempForce[0] += atomJ.tempForce[0];
atomI.tempForce[1] += atomJ.tempForce[1];
atomI.tempForce[2] += atomJ.tempForce[2];
atomI.tempTorque[0] += atomJ.tempTorque[0];
atomI.tempTorque[1] += atomJ.tempTorque[1];
atomI.tempTorque[2] += atomJ.tempTorque[2];
}
/* /*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ, __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
...@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir ...@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
float uiy = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]); float uiy = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
float uiz = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]); float uiz = 0.5f*(atomI.inducedDipole[2] + atomI.inducedDipoleP[2]);
atomI.torque[0] -= term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy); atomI.torque[0] += term*(atomI.labFrameDipole[1]*uiz - atomI.labFrameDipole[2]*uiy);
atomI.torque[1] -= term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz); atomI.torque[1] += term*(atomI.labFrameDipole[2]*uix - atomI.labFrameDipole[0]*uiz);
atomI.torque[2] -= term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix); atomI.torque[2] += term*(atomI.labFrameDipole[0]*uiy - atomI.labFrameDipole[1]*uix);
} }
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ, __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
...@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float gfr[8],gfri[7]; float gfr[8],gfri[7];
float gti[7],gtri[7]; float gti[7],gtri[7];
float conversionFactor = (cAmoebaSim.electric/cAmoebaSim.dielec); float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values; // set the permanent multipole and induced dipole values;
...@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr; float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cAmoebaSim.cutoffDistance2 ){ if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrt(r2); float r = sqrt(r2);
float ck = atomJ.q; float ck = atomJ.q;
...@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
e = e - (1.0f-scalingFactors[MScaleIndex])*erl; e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
ei = ei - erli; ei = ei - erli;
*energy = conversionFactor*(e + ei); *energy = -conversionFactor*(e + ei);
// increment the total intramolecular energy; assumes; // increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell; // intramolecular distances are less than half of cell;
...@@ -1161,15 +1173,27 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1161,15 +1173,27 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
maxThreads = 128; maxThreads = 128;
else else
maxThreads = 64; maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)), maxThreads);
} }
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
(sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle)+sizeof(float3), (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits,
gpu->sim.nonbond_threads_per_block );
(void) fflush( amoebaGpu->log );
#endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else { } else {
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>( // gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkUnit->_pDevStream[0], // amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400; int maxPrint = 5;
float conversion = 1.0f/41.84f; float conversion = 1.0f/41.84f;
float forceSum[3] = { 0.0f, 0.0f, 0.0f}; float forceSum[3] = { 0.0f, 0.0f, 0.0f};
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
...@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} }
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
if( 1 ){ if( 0 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" ); (void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){ for( int jj = 0; jj < gpu->natoms; jj++ ){
......
...@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned ...@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned
} }
} }
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
...@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int ...@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
// Reduce field // Reduce field
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi; const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
//const float term = 0.0f;
while (pos < fieldComponents) while (pos < fieldComponents)
{ {
...@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu ) ...@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
#undef GK #undef GK
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
#include "kCalculateAmoebaCudaFixedFieldParticle.h" #include "kCalculateAmoebaCudaFixedFieldParticle.h"
#undef INCLUDE_FIXED_FIELD_BUFFERS
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
atomI.tempBuffer[1] += atomJ.tempBuffer[1];
atomI.tempBuffer[2] += atomJ.tempBuffer[2];
atomI.tempBufferP[0] += atomJ.tempBufferP[0];
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ, __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float dscale, float pscale, float fields[4][3] float dscale, float pscale, float fields[4][3]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY; yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ; zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr; float r2 = xr*xr + yr*yr + zr*zr;
float r = sqrtf(r2); float r = sqrtf(r2);
// calculate the error function damping terms // calculate the error function damping terms
...@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// increment the field at each site due to this interaction // increment the field at each site due to this interaction
if( r2 <= cAmoebaSim.cutoffDistance2 ){ if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fim[0] - fid[0]; fields[0][0] = fim[0] - fid[0];
fields[0][1] = fim[1] - fid[1]; fields[0][1] = fim[1] - fid[1];
...@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
fields[2][2] = 0.0f; fields[2][2] = 0.0f;
fields[3][2] = 0.0f; fields[3][2] = 0.0f;
} }
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
pullBack[0].x = xr; pullBack[0].x = xr;
pullBack[0].y = yr; pullBack[0].y = yr;
...@@ -399,6 +411,7 @@ static int isNanOrInfinity( double number ){ ...@@ -399,6 +411,7 @@ static int isNanOrInfinity( double number ){
static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
{ {
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
// print intermediate results for the targetAtom // print intermediate results for the targetAtom
unsigned int targetAtom = 0; unsigned int targetAtom = 354;
int maxPrint = 3002;
amoebaGpu->psE_Field->Download();
(void) fprintf( amoebaGpu->log, "Recip EFields In\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// E_Field
int isNan = isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset] );
isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1] );
isNan += isNanOrInfinity( amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] %s\n",
amoebaGpu->psE_Field->_pSysStream[0][indexOffset],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+1],
amoebaGpu->psE_Field->_pSysStream[0][indexOffset+2], (isNan ? "XXX" :"") );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "Recip EFields End\n" );
#endif #endif
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
}
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
#endif #endif
} else { } else {
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( //amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevStream[0], kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
} }
LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel"); LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReducePmeDirectE_Fields( amoebaGpu ); kReducePmeDirectE_Fields( amoebaGpu );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
gpu->psInteractionCount->Download(); gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
*/ */
amoebaGpu->psE_Field->Download(); amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download(); amoebaGpu->psE_FieldPolar->Download();
(void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
int maxPrint = 3002;
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii); (void) fprintf( amoebaGpu->log, "%5d ", ii);
...@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
debugArray->Download(); debugArray->Download();
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
amoebaGpu->gpuContext->psPosq4->Download();
for( int jj = 0; jj < gpu->natoms; jj++ ){ for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj; int debugIndex = jj;
if( fabs(debugArray->_pSysStream[0][jj+paddedNumberOfAtoms].x) > 0.0 ){
(void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj ); (void) fprintf( amoebaGpu->log,"%5d PmeFixedEField\n", jj );
for( int kk = 0; kk < 10; kk++ ){ for( int kk = 0; kk < 6; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n", (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w ); debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms; debugIndex += paddedNumberOfAtoms;
} }
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p\n",
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y,
amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].x - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].x)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].y - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].y)/5.50f,
(amoebaGpu->gpuContext->psPosq4->_pSysStream[0][jj].z - amoebaGpu->gpuContext->psPosq4->_pSysStream[0][0].z)/5.50f);
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
}
} }
...@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector);
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
} }
delete debugArray; delete debugArray;
} }
#endif #endif
if( 0 ){ if( 1 ){
std::vector<int> fileId; std::vector<int> fileId;
fileId.push_back( 0 ); fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
......
...@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu) ...@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
//#define AMOEBA_DEBUG //#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h" #include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
atomI.tempBuffer[1] += atomJ.tempBuffer[1];
atomI.tempBuffer[2] += atomJ.tempBuffer[2];
atomI.tempBufferP[0] += atomJ.tempBufferP[0];
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
...@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// increment the field at each site due to this interaction // increment the field at each site due to this interaction
if( r2 <= cAmoebaSim.cutoffDistance2 ){ if( r2 <= cSim.nonbondedCutoffSqr ){
fields[0][0] = fimd[0] - fid[0]; fields[0][0] = fimd[0] - fid[0];
fields[1][0] = fkmd[0] - fkd[0]; fields[1][0] = fkmd[0] - fkd[0];
...@@ -370,6 +384,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -370,6 +384,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray ) CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{ {
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
}
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( //gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkUnit->_pDevStream[0], //amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" ); (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
} }
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
if( 1 ){
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
} }
#endif #endif
iteration++; iteration++;
...@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->mutualInducedDone = done; amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1; amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
if( 0 ){ if( 1 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
......
...@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j); debugArray[index].y = (float) (y + j);
debugArray[index].z = cAmoebaSim.cutoffDistance2; debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 6.0f; debugArray[index].w = 6.0f;
...@@ -209,10 +209,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -209,10 +209,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#endif #endif
} } else {
else // 100% utilization
{ unsigned int flags = cSim.pInteractionFlag[pos];
// Read fixed atom data into registers and GRF if (flags == 0) {
// No interactions in this block.
} else {
if (lasty != y) if (lasty != y)
{ {
unsigned int atomJ = y + tgx; unsigned int atomJ = y + tgx;
...@@ -229,17 +232,18 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -229,17 +232,18 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float ijField[4][3]; float ijField[4][3];
// load coords, charge, ... // load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], uscale, ijField calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
); );
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole // add to field at atomI the field due atomJ's dipole
...@@ -255,26 +259,64 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -255,26 +259,64 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// add to field at atomJ the field due atomI's dipole // add to field at atomJ the field due atomI's dipole
psA[tj].field[0] += mask ? ijField[1][0] : 0.0f; if( flags == 0xFFFFFFFF ){
psA[tj].field[1] += mask ? ijField[1][1] : 0.0f;
psA[tj].field[2] += mask ? ijField[1][2] : 0.0f; psA[jIdx].field[0] += mask ? ijField[1][0] : 0.0f;
psA[jIdx].field[1] += mask ? ijField[1][1] : 0.0f;
psA[jIdx].field[2] += mask ? ijField[1][2] : 0.0f;
// add to polar field at atomJ the field due atomI's dipole // add to polar field at atomJ the field due atomI's dipole
psA[tj].fieldPolar[0] += mask ? ijField[3][0] : 0.0f; psA[jIdx].fieldPolar[0] += mask ? ijField[3][0] : 0.0f;
psA[tj].fieldPolar[1] += mask ? ijField[3][1] : 0.0f; psA[jIdx].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[tj].fieldPolar[2] += mask ? ijField[3][2] : 0.0f; psA[jIdx].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
} else {
psA[threadIdx.x].tempBuffer[0] = mask ? 0.0f : ijField[1][0];
psA[threadIdx.x].tempBuffer[1] = mask ? 0.0f : ijField[1][1];
psA[threadIdx.x].tempBuffer[2] = mask ? 0.0f : ijField[1][2];
psA[threadIdx.x].tempBufferP[0] = mask ? 0.0f : ijField[3][0];
psA[threadIdx.x].tempBufferP[1] = mask ? 0.0f : ijField[3][1];
psA[threadIdx.x].tempBufferP[2] = mask ? 0.0f : ijField[3][2];
if( tgx % 2 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( psA[threadIdx.x], psA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].field[0] += psA[threadIdx.x].tempBuffer[0] + psA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].field[1] += psA[threadIdx.x].tempBuffer[1] + psA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += psA[threadIdx.x].tempBuffer[2] + psA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += psA[threadIdx.x].tempBufferP[0] + psA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += psA[threadIdx.x].tempBufferP[1] + psA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += psA[threadIdx.x].tempBufferP[2] + psA[threadIdx.x+16].tempBufferP[2];
}
}
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+tj) == targetAtom ){ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+tj) : atomI; unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
unsigned int pullBackIndex = 0; unsigned int pullBackIndex = 0;
unsigned int indexI = 0; unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2; unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj); debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = cAmoebaSim.cutoffDistance2; debugArray[index].z = cSim.nonbondedCutoffSqr;
debugArray[index].w = 7.0f; debugArray[index].w = 7.0f;
...@@ -315,31 +357,12 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){ ...@@ -315,31 +357,12 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
debugArray[index].y = ijField[indexJ+1][1]; debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2]; debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag; debugArray[index].w = flag;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
} }
#endif #endif
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} } // end of j-loop
// Write results // Write results
...@@ -364,8 +387,10 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){ ...@@ -364,8 +387,10 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
#endif #endif
lasty = y; lasty = y;
}
} // end of pInteractionFlag block
} // end of x == y block
pos++; pos++;
} }
} }
...@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu, ...@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
(void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "%s: numBlocks=%d numThreads=%d %d\n", methodName, numBlocks, numThreads, amoebaGpu->maxMapTorqueDifferencePow2); (void) fflush( amoebaGpu->log );
amoebaGpu->psForce->Download(); amoebaGpu->psForce->Download();
psCudaForce4->Download(); psCudaForce4->Download();
amoebaGpu->torqueMapForce->Download();
amoebaGpu->psTorque->Download(); amoebaGpu->psTorque->Download();
int maxPrint = 10; int maxPrint = 10;
(void) fprintf( amoebaGpu->log,"Post torqueMap\n" ); (void) fprintf( amoebaGpu->log,"Post torqueMap\n" );
...@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu, ...@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
amoebaGpu->psForce->_pSysStream[0][indexOffset], amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1], amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] ); amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"fT[%16.9e %16.9e %16.9e] ",
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+1],
amoebaGpu->torqueMapForce->_pSysStream[0][indexOffset+2] );
(void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n", (void) fprintf( amoebaGpu->log,"T[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psTorque->_pSysStream[0][indexOffset], amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1], amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
...@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu, ...@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
amoebaGpu->maxMapTorqueDifference, amoebaGpu->maxMapTorqueDifference,
amoebaGpu->torqueMapForce->_pDevStream[0], amoebaGpu->torqueMapForce->_pDevStream[0],
psCudaForce4->_pDevStream[0] ); psCudaForce4->_pDevStream[0] );
LAUNCHERROR("amoebaMapTorqueReduce_kernel2"); LAUNCHERROR("amoebaMapTorqueReduce_kernel3");
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
......
...@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
} }
#undef USE_PERIODIC
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kFindInteractingBlocks.h"
#undef USE_PERIODIC
#undef METHOD_NAME
void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood )
{ {
std::string methodName = "kCalculateAmoebaMultipoleForces"; std::string methodName = "kCalculateAmoebaMultipoleForces";
...@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaFixedEField( amoebaGpu ); cudaComputeAmoebaFixedEField( amoebaGpu );
cudaComputeAmoebaMutualInducedField( amoebaGpu ); cudaComputeAmoebaMutualInducedField( amoebaGpu );
} else { } else {
gpuContext gpu = amoebaGpu->gpuContext;
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
if( 0 ){
gpu->psInteractionCount->Download();
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
amoebaGpu->psWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, "Cell %8u %8u [%5u %5u %1u] ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions );
x = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " %8u [%5u %5u %1u] %10u\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
}
} else {
}
cudaComputeAmoebaPmeFixedEField( amoebaGpu ); cudaComputeAmoebaPmeFixedEField( amoebaGpu );
cudaComputeAmoebaPmeMutualInducedField( amoebaGpu ); cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
} }
......
...@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete ...@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
MapStringDouble tinkerEnergies; MapStringDouble tinkerEnergies;
MapStringVectorOfVectors supplementary; MapStringVectorOfVectors supplementary;
MapStringIntI isPresent = forceMap.find( AMOEBA_GK_FORCE ); MapStringIntI isPresent = forceMap.find( AMOEBA_GK_FORCE );
bool gkIsActive; bool gkIsActive;
if( isPresent != forceMap.end() && isPresent->second != 0 ){ if( isPresent != forceMap.end() && isPresent->second != 0 ){
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment