Commit 4d763c4d authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Fixed bug in direct PME

Added diagnostics for handling reorder of atoms
parent 8b8defe8
...@@ -38,7 +38,7 @@ using namespace OpenMM; ...@@ -38,7 +38,7 @@ using namespace OpenMM;
using std::string; using std::string;
using std::vector; using std::vector;
AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), pmeBSplineOrder(5), cutoffDistance(1.0), ewaldErrorTol(5e-4), mutualInducedIterationMethod(SOR), mutualInducedMaxIterations(60), AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), pmeBSplineOrder(5), cutoffDistance(1.0), ewaldErrorTol(1e-4), mutualInducedIterationMethod(SOR), mutualInducedMaxIterations(60),
mutualInducedTargetEpsilon(1.0e-05), scalingDistanceCutoff(100.0), electricConstant(138.9354558456) { mutualInducedTargetEpsilon(1.0e-05), scalingDistanceCutoff(100.0), electricConstant(138.9354558456) {
} }
......
...@@ -779,17 +779,15 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) { ...@@ -779,17 +779,15 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
if( data.getMultipoleForceCount() == 0 ){ if( data.getMultipoleForceCount() == 0 ){
gpuCopyWorkUnit( gpu ); gpuCopyWorkUnit( gpu );
} }
//if( data.getApplyCutoff() && (data.getMultipoleForceCount() % 100) == 0 ){
//gpuReorderAtoms(gpu->gpuContext);
//}
data.incrementMultipoleForceCount(); data.incrementMultipoleForceCount();
data.initializeGpu();
if( 0 && data.getLog() ){ if( 0 && data.getLog() ){
(void) fprintf( data.getLog(), "computeAmoebaMultipoleForce\n" ); (void) fprintf( data.getLog(), "In computeAmoebaMultipoleForce\n" );
(void) fflush( data.getLog()); (void) fflush( data.getLog());
} }
data.initializeGpu();
// calculate Born radii // calculate Born radii
if( data.getHasAmoebaGeneralizedKirkwood() ){ if( data.getHasAmoebaGeneralizedKirkwood() ){
...@@ -974,23 +972,27 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -974,23 +972,27 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
nb.setCutoffDistance(force.getCutoffDistance()); nb.setCutoffDistance(force.getCutoffDistance());
std::vector<int> pmeGridDimension; std::vector<int> pmeGridDimension;
force.getPmeGridDimensions( pmeGridDimension ); force.getPmeGridDimensions( pmeGridDimension );
if( 1 || pmeGridDimension[0] == 0 ){ int pmeParametersSetBasedOnEwaldErrorTolerance;
if( pmeGridDimension[0] == 0 ){
NonbondedForceImpl::calcPMEParameters(system, nb, alpha, xsize, ysize, zsize); NonbondedForceImpl::calcPMEParameters(system, nb, alpha, xsize, ysize, zsize);
/* pmeParametersSetBasedOnEwaldErrorTolerance = 1;
alpha = 5.446;
xsize = 60;
ysize = 48;
zsize = 48;
*/
} else { } else {
alpha = force.getAEwald(); alpha = force.getAEwald();
xsize = pmeGridDimension[0]; xsize = pmeGridDimension[0];
ysize = pmeGridDimension[1]; ysize = pmeGridDimension[1];
zsize = pmeGridDimension[2]; zsize = pmeGridDimension[2];
pmeParametersSetBasedOnEwaldErrorTolerance = 0;
} }
if( data.getLog() ){ if( data.getLog() ){
(void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d]\n", (void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d] -",
force.getEwaldErrorTolerance(), force.getCutoffDistance(), alpha, xsize, ysize, zsize ); force.getEwaldErrorTolerance(), force.getCutoffDistance(), alpha, xsize, ysize, zsize );
if( pmeParametersSetBasedOnEwaldErrorTolerance ){
(void) fprintf( data.getLog(), " parameters set based on error tolerance and OpenMM algorithm.\n" );
} else {
double impliedTolerance = alpha*force.getCutoffDistance();
impliedTolerance = 0.5*exp( -(impliedTolerance*impliedTolerance) );
(void) fprintf( data.getLog(), " using input parameters implied tolerance=%12.3e\n", impliedTolerance );
}
(void) fflush( data.getLog() ); (void) fflush( data.getLog() );
} }
gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize); gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
......
...@@ -3909,6 +3909,33 @@ void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId ...@@ -3909,6 +3909,33 @@ void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId
(void) fclose( filePtr ); (void) fclose( filePtr );
} }
CUDAStream<float>* reorderFloat( amoebaGpuContext amoebaGpu, CUDAStream<float>* arrayToReorder ){
gpuContext gpu = amoebaGpu->gpuContext;
CUDAStream<float>* reorderdArray = new CUDAStream<float>(amoebaGpu->gpuContext->sim.paddedNumberOfAtoms, 1, "TempReorder");
int* order = gpu->psAtomIndex->_pSysData;
for( int ii = 0; ii < gpu->natoms; ii++ ){
reorderdArray->_pSysStream[0][order[ii]] = arrayToReorder->_pSysStream[0][ii];
}
return reorderdArray;
}
CUDAStream<float4>* reorderFloat4( amoebaGpuContext amoebaGpu, CUDAStream<float4>* arrayToReorder ){
gpuContext gpu = amoebaGpu->gpuContext;
CUDAStream<float4>* reorderdArray = new CUDAStream<float4>(amoebaGpu->gpuContext->sim.paddedNumberOfAtoms, 1, "TempReorder4");
int* order = gpu->psAtomIndex->_pSysData;
for( int ii = 0; ii < gpu->natoms; ii++ ){
reorderdArray->_pSysStream[0][order[ii]].x = arrayToReorder->_pSysStream[0][ii].x;
reorderdArray->_pSysStream[0][order[ii]].y = arrayToReorder->_pSysStream[0][ii].y;
reorderdArray->_pSysStream[0][order[ii]].z = arrayToReorder->_pSysStream[0][ii].z;
reorderdArray->_pSysStream[0][order[ii]].w = arrayToReorder->_pSysStream[0][ii].w;
}
return reorderdArray;
}
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Load contents of arrays into vector Load contents of arrays into vector
...@@ -3920,7 +3947,9 @@ void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId ...@@ -3920,7 +3947,9 @@ void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, VectorOfDoubleVectors& outputVector ) void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle,
CUDAStream<float>* array, VectorOfDoubleVectors& outputVector,
int* order )
{ {
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
...@@ -3929,14 +3958,19 @@ void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDA ...@@ -3929,14 +3958,19 @@ void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDA
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
array->Download(); array->Download();
int runningIndex = 0; int orderIndex = 0;
outputVector.resize( numberOfParticles ); outputVector.resize( numberOfParticles );
for( int ii = 0; ii < numberOfParticles; ii++ ){ for( int ii = 0; ii < numberOfParticles; ii++ ){
if( order ){
orderIndex = order[ii];
} else {
orderIndex = ii;
}
for( int jj = 0; jj < entriesPerParticle; jj++ ) { for( int jj = 0; jj < entriesPerParticle; jj++ ) {
outputVector[ii].push_back( array->_pSysStream[0][runningIndex++] ); outputVector[orderIndex].push_back( array->_pSysStream[0][entriesPerParticle*ii+jj] );
} }
} }
} }
...@@ -3975,6 +4009,7 @@ void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUD ...@@ -3975,6 +4009,7 @@ void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUD
} }
} }
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Load contents of arrays into vector Load contents of arrays into vector
...@@ -3983,10 +4018,12 @@ void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUD ...@@ -3983,10 +4018,12 @@ void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUD
@param entriesPerParticle entries/particles array @param entriesPerParticle entries/particles array
@param array cuda array @param array cuda array
@param outputVector output vector @param outputVector output vector
@param order if set, reorder entries
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector ) void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array,
VectorOfDoubleVectors& outputVector, int* order )
{ {
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
...@@ -3996,21 +4033,27 @@ void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUD ...@@ -3996,21 +4033,27 @@ void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUD
array->Download(); array->Download();
int runningIndex = 0; int runningIndex = 0;
int orderIndex;
outputVector.resize( numberOfParticles ); outputVector.resize( numberOfParticles );
for( int ii = 0; ii < numberOfParticles; ii++ ){ for( int ii = 0; ii < numberOfParticles; ii++ ){
if( order ){
orderIndex = order[runningIndex];
} else {
orderIndex = runningIndex;
}
if( entriesPerParticle > 0 ){ if( entriesPerParticle > 0 ){
outputVector[ii].push_back( array->_pSysStream[0][runningIndex].x ); outputVector[orderIndex].push_back( array->_pSysStream[0][ii].x );
} }
if( entriesPerParticle > 1 ){ if( entriesPerParticle > 1 ){
outputVector[ii].push_back( array->_pSysStream[0][runningIndex].y ); outputVector[orderIndex].push_back( array->_pSysStream[0][ii].y );
} }
if( entriesPerParticle > 2 ){ if( entriesPerParticle > 2 ){
outputVector[ii].push_back( array->_pSysStream[0][runningIndex].z ); outputVector[orderIndex].push_back( array->_pSysStream[0][ii].z );
} }
if( entriesPerParticle > 3 ){ if( entriesPerParticle > 3 ){
outputVector[ii].push_back( array->_pSysStream[0][runningIndex].w ); outputVector[orderIndex].push_back( array->_pSysStream[0][ii].w );
} }
runningIndex++; runningIndex++;
} }
...@@ -4353,8 +4396,8 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){ ...@@ -4353,8 +4396,8 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psVelm4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psVelm4, outputVector, NULL );
/* /*
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector );
......
...@@ -147,9 +147,9 @@ extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, char* fname ...@@ -147,9 +147,9 @@ extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, char* fname
int entriesPerAtom2, CUDAStream<float>* array2 ); int entriesPerAtom2, CUDAStream<float>* array2 );
extern void readFile( std::string fileName, StringVectorVector& fileContents ); extern void readFile( std::string fileName, StringVectorVector& fileContents );
extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, VectorOfDoubleVectors& outputVector ); extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, VectorOfDoubleVectors& outputVector, int* order );
extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector ); extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector );
extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector ); extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order );
extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector ); extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear ); extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear );
......
...@@ -947,9 +947,9 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -947,9 +947,9 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
} }
......
...@@ -562,9 +562,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -562,9 +562,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, NULL);
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector );
} }
......
...@@ -307,8 +307,8 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -307,8 +307,8 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
} }
......
...@@ -1748,10 +1748,10 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu ) ...@@ -1748,10 +1748,10 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( amoebaGpu->gpuContext->natoms, 3, amoebaGpu->gpuContext->psPosq4, outputVector ); cudaLoadCudaFloat4Array( amoebaGpu->gpuContext->natoms, 3, amoebaGpu->gpuContext->psPosq4, outputVector, NULL );
cudaLoadCudaFloatArray( amoebaGpu->gpuContext->natoms, 1, amoebaGpu->gpuContext->psBornRadii, outputVector ); cudaLoadCudaFloatArray( amoebaGpu->gpuContext->natoms, 1, amoebaGpu->gpuContext->psBornRadii, outputVector, NULL );
cudaLoadCudaFloat2Array( amoebaGpu->gpuContext->natoms, 2, amoebaGpu->gpuContext->psObcData, outputVector ); cudaLoadCudaFloat2Array( amoebaGpu->gpuContext->natoms, 2, amoebaGpu->gpuContext->psObcData, outputVector, NULL );
cudaLoadCudaFloatArray( amoebaGpu->gpuContext->natoms, 1, amoebaGpu->gpuContext->psBornForce, outputVector ); cudaLoadCudaFloatArray( amoebaGpu->gpuContext->natoms, 1, amoebaGpu->gpuContext->psBornForce, outputVector, NULL );
cudaWriteVectorOfDoubleVectorsToFile( "CudaBornForce", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaBornForce", fileId, outputVector );
(void) fprintf( amoebaGpu->log, "kReduceToBornForcePrefactor: exiting.\n" ); (void) fprintf( amoebaGpu->log, "kReduceToBornForcePrefactor: exiting.\n" );
(void) fprintf( stderr, "kReduceToBornForcePrefactor: exiting.\n" ); (void) fflush( stderr ); (void) fprintf( stderr, "kReduceToBornForcePrefactor: exiting.\n" ); (void) fflush( stderr );
...@@ -2069,8 +2069,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -2069,8 +2069,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodForce, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
} }
...@@ -2113,8 +2113,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -2113,8 +2113,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodForce, outputVector, NULL );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForce", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForce", fileId, outputVector );
} }
......
...@@ -1215,9 +1215,9 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1215,9 +1215,9 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
} }
...@@ -1260,8 +1260,8 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1260,8 +1260,8 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector );
} }
......
...@@ -829,8 +829,8 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>* ...@@ -829,8 +829,8 @@ void cudaComputeAmoebaMapTorques( amoebaGpuContext amoebaGpu, CUDAStream<float>*
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaVacuumElecForce", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaVacuumElecForce", fileId, outputVector );
} }
#endif #endif
...@@ -1031,9 +1031,9 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu, ...@@ -1031,9 +1031,9 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloat4Array( gpu->natoms, 4, gpu->psForce4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 4, gpu->psForce4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, psForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, psForce, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, psTorque, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, psTorque, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaVacuumElecForce", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaVacuumElecForce", fileId, outputVector );
} }
#endif #endif
...@@ -1123,9 +1123,9 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu, ...@@ -1123,9 +1123,9 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloat4Array( gpu->natoms, 4, gpu->psForce4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 4, gpu->psForce4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL);
cudaWriteVectorOfDoubleVectorsToFile( "CudaVacuumElecForce", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaVacuumElecForce", fileId, outputVector );
} }
#endif #endif
......
...@@ -925,9 +925,9 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -925,9 +925,9 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, NULL );
cudaWriteVectorOfDoubleVectorsToFile( "CudaMI_GK", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaMI_GK", fileId, outputVector );
} }
#endif #endif
......
...@@ -594,8 +594,8 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -594,8 +594,8 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
// cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); // cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, NULL );
cudaWriteVectorOfDoubleVectorsToFile( "CudaMI", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaMI", fileId, outputVector );
} }
......
...@@ -152,11 +152,12 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir ...@@ -152,11 +152,12 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
} }
__device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ, __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float* scalingFactors, float* outputForce, float3 outputTorque[3], float* energy float* scalingFactors, float4 forceTorqueEnergy[3]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
,float4* debugArray ,float4* debugArray
#endif #endif
){ ){
float xr = atomJ.x - atomI.x; float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y; float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z; float zr = atomJ.z - atomI.z;
...@@ -210,12 +211,11 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -210,12 +211,11 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// calculate the real space error function terms; // calculate the real space error function terms;
float ralpha = cSim.alphaEwald*r; float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r; float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald; float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 0.0f; float alsq2n = 0.0f;
if( cSim.alphaEwald > 0.0f){ if( cSim.alphaEwald > 0.0f ){
alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald); alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
} }
float exp2a = exp(-(ralpha*ralpha)); float exp2a = exp(-(ralpha*ralpha));
...@@ -263,7 +263,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -263,7 +263,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
if( damp != 0.0f ){ if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole; float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = r/damp; float ratio = r/damp;
damp = -pgamma * ratio*ratio*ratio; damp = -pgamma*ratio*ratio*ratio;
if( damp > -50.0f ){ if( damp > -50.0f ){
float expdamp = exp(damp); float expdamp = exp(damp);
scale3 = 1.0f - expdamp; scale3 = 1.0f - expdamp;
...@@ -510,7 +510,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -510,7 +510,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
e = e - (1.0f-scalingFactors[MScaleIndex])*erl; e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
ei = ei - erli; ei = ei - erli;
*energy = -conversionFactor*(e + ei); forceTorqueEnergy[0].w = -conversionFactor*(e + ei);
// increment the total intramolecular energy; assumes; // increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell; // intramolecular distances are less than half of cell;
...@@ -604,6 +604,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -604,6 +604,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
+ (sci3+scip3)*bn2*dk1 + (sci3+scip3)*bn2*dk1
+ gfi4*(qkui1+qkuip1-qiuk1-qiukp1)) + gfi4*(qkui1+qkuip1-qiuk1-qiukp1))
+ gfi5*qir1 + gfi6*qkr1; + gfi5*qir1 + gfi6*qkr1;
float ftm2i2 = gfi1*yr + 0.5f* float ftm2i2 = gfi1*yr + 0.5f*
(gfi2*(atomI.inducedDipole[1]+atomI.inducedDipoleP[1]) (gfi2*(atomI.inducedDipole[1]+atomI.inducedDipoleP[1])
+ bn2*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1]) + bn2*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1])
...@@ -613,6 +614,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -613,6 +614,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
+ (sci3+scip3)*bn2*dk2 + (sci3+scip3)*bn2*dk2
+ gfi4*(qkui2+qkuip2-qiuk2-qiukp2)) + gfi4*(qkui2+qkuip2-qiuk2-qiukp2))
+ gfi5*qir2 + gfi6*qkr2; + gfi5*qir2 + gfi6*qkr2;
float ftm2i3 = gfi1*zr + 0.5f* float ftm2i3 = gfi1*zr + 0.5f*
(gfi2*(atomI.inducedDipole[2]+atomI.inducedDipoleP[2]) (gfi2*(atomI.inducedDipole[2]+atomI.inducedDipoleP[2])
+ bn2*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2]) + bn2*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2])
...@@ -674,10 +676,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -674,10 +676,13 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float temp3 = 0.5f * rr3 * ((gli1+gli6)*scalingFactors[PScaleIndex] float temp3 = 0.5f * rr3 * ((gli1+gli6)*scalingFactors[PScaleIndex]
+(glip1+glip6)*scalingFactors[DScaleIndex]); +(glip1+glip6)*scalingFactors[DScaleIndex]);
float temp5 = 0.5f * rr5 * ((gli2+gli7)*scalingFactors[PScaleIndex] float temp5 = 0.5f * rr5 * ((gli2+gli7)*scalingFactors[PScaleIndex]
+(glip2+glip7)*scalingFactors[DScaleIndex]); +(glip2+glip7)*scalingFactors[DScaleIndex]);
float temp7 = 0.5f * rr7 * (gli3*scalingFactors[PScaleIndex] float temp7 = 0.5f * rr7 * (gli3*scalingFactors[PScaleIndex]
+glip3*scalingFactors[DScaleIndex]); +glip3*scalingFactors[DScaleIndex]);
float fridmp1 = temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71; float fridmp1 = temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71;
float fridmp2 = temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72; float fridmp2 = temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72;
float fridmp3 = temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73; float fridmp3 = temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73;
...@@ -692,9 +697,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -692,9 +697,9 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// modify the forces for partially excluded interactions // modify the forces for partially excluded interactions
ftm2i1 -= fridmp1 - findmp1; ftm2i1 -= (fridmp1 + findmp1);
ftm2i2 -= fridmp2 - findmp2; ftm2i2 -= (fridmp2 + findmp2);
ftm2i3 -= fridmp3 - findmp3; ftm2i3 -= (fridmp3 + findmp3);
// correction to convert mutual to direct polarization force // correction to convert mutual to direct polarization force
...@@ -851,18 +856,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -851,18 +856,18 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// increment gradient due to force and torque on first site; // increment gradient due to force and torque on first site;
outputForce[0] = conversionFactor*(ftm21 + ftm2i1); forceTorqueEnergy[0].x = conversionFactor*(ftm21 + ftm2i1);
outputForce[1] = conversionFactor*(ftm22 + ftm2i2); forceTorqueEnergy[0].y = conversionFactor*(ftm22 + ftm2i2);
outputForce[2] = conversionFactor*(ftm23 + ftm2i3); forceTorqueEnergy[0].z = conversionFactor*(ftm23 + ftm2i3);
conversionFactor *= -1.0; conversionFactor *= -1.0;
outputTorque[0].x = conversionFactor*(ttm21 + ttm2i1); forceTorqueEnergy[1].x = conversionFactor*(ttm21 + ttm2i1);
outputTorque[1].x = conversionFactor*(ttm22 + ttm2i2); forceTorqueEnergy[1].y = conversionFactor*(ttm22 + ttm2i2);
outputTorque[2].x = conversionFactor*(ttm23 + ttm2i3); forceTorqueEnergy[1].z = conversionFactor*(ttm23 + ttm2i3);
outputTorque[1].x = conversionFactor*(ttm31 + ttm3i1); forceTorqueEnergy[2].x = conversionFactor*(ttm31 + ttm3i1);
outputTorque[1].y = conversionFactor*(ttm32 + ttm3i2); forceTorqueEnergy[2].y = conversionFactor*(ttm32 + ttm3i2);
outputTorque[1].z = conversionFactor*(ttm33 + ttm3i3); forceTorqueEnergy[2].z = conversionFactor*(ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
int debugIndex = 0; int debugIndex = 0;
...@@ -915,64 +920,103 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros ...@@ -915,64 +920,103 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
debugArray[debugIndex].z = conversionFactor*ftm23; debugArray[debugIndex].z = conversionFactor*ftm23;
debugArray[debugIndex].w = idTracker; debugArray[debugIndex].w = idTracker;
debugIndex++; debugIndex++;
*/
idTracker += 1.0; idTracker += 1.0;
debugArray[debugIndex].x = e; debugArray[debugIndex].x = e;
debugArray[debugIndex].y = ei; debugArray[debugIndex].y = ei;
debugArray[debugIndex].z = erl; debugArray[debugIndex].z = erl;
debugArray[debugIndex].w = erli; debugArray[debugIndex].w = erli;
debugIndex++; debugIndex++;
idTracker += 1.0; */
idTracker += 100.0;
debugArray[debugIndex].x = r2; debugArray[debugIndex].x = r2;
debugArray[debugIndex].y = cSim.alphaEwald; debugArray[debugIndex].y = cSim.alphaEwald;
debugArray[debugIndex].z = conversionFactor*fridmp3; debugArray[debugIndex].z = conversionFactor;
debugArray[debugIndex].w = 115.0; debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ftm21;
debugArray[debugIndex].y = conversionFactor*ftm22;
debugArray[debugIndex].z = conversionFactor*ftm23;
debugArray[debugIndex].w = idTracker;
debugIndex++; debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = conversionFactor*findmp1;
debugArray[debugIndex].y = conversionFactor*findmp2;
debugArray[debugIndex].z = conversionFactor*findmp3;
debugArray[debugIndex].w = cSim.alphaEwald + 1.0f;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ftm2i1;
debugArray[debugIndex].y = conversionFactor*ftm2i2;
debugArray[debugIndex].z = conversionFactor*ftm2i3;
debugArray[debugIndex].w = idTracker;
debugIndex++; debugIndex++;
idTracker += 1.0; /*
idTracker += 100.0;
debugArray[debugIndex].x = fridmp1;
debugArray[debugIndex].y = fridmp2;
debugArray[debugIndex].z = fridmp3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = findmp1;
debugArray[debugIndex].y = findmp2;
debugArray[debugIndex].z = findmp3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
*/
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm21; debugArray[debugIndex].x = conversionFactor*ttm21;
debugArray[debugIndex].y = conversionFactor*ttm22; debugArray[debugIndex].y = conversionFactor*ttm22;
debugArray[debugIndex].z = conversionFactor*ttm23; debugArray[debugIndex].z = conversionFactor*ttm23;
debugArray[debugIndex].w = idTracker; debugArray[debugIndex].w = idTracker;
debugIndex++; debugIndex++;
idTracker += 1.0;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm2i1; debugArray[debugIndex].x = conversionFactor*ttm2i1;
debugArray[debugIndex].y = conversionFactor*ttm2i2; debugArray[debugIndex].y = conversionFactor*ttm2i2;
debugArray[debugIndex].z = conversionFactor*ttm2i3; debugArray[debugIndex].z = conversionFactor*ttm2i3;
debugArray[debugIndex].w = idTracker; debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm31;
debugArray[debugIndex].y = conversionFactor*ttm32;
debugArray[debugIndex].z = conversionFactor*ttm33;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm3i1;
debugArray[debugIndex].y = conversionFactor*ttm3i2;
debugArray[debugIndex].z = conversionFactor*ttm3i3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
#endif #endif
} else { } else {
outputForce[0] = 0.0f; forceTorqueEnergy[0].x = 0.0f;
outputForce[1] = 0.0f; forceTorqueEnergy[0].y = 0.0f;
outputForce[2] = 0.0f; forceTorqueEnergy[0].z = 0.0f;
outputTorque[0].x = 0.0f; forceTorqueEnergy[1].x = 0.0f;
outputTorque[0].y = 0.0f; forceTorqueEnergy[1].y = 0.0f;
outputTorque[0].z = 0.0f; forceTorqueEnergy[1].z = 0.0f;
outputTorque[1].x = 0.0f; forceTorqueEnergy[2].x = 0.0f;
outputTorque[1].y = 0.0f; forceTorqueEnergy[2].y = 0.0f;
outputTorque[1].z = 0.0f; forceTorqueEnergy[2].z = 0.0f;
*energy = 0.0f; forceTorqueEnergy[0].w = 0.0f;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
for( int ii = 0; ii < 5; ii++ ){ for( int ii = 0; ii < 12; ii++ ){
debugArray[ii].x = 0.0f; debugArray[ii].x = 0.0f;
debugArray[ii].y = 0.0f; debugArray[ii].y = 0.0f;
debugArray[ii].z = 0.0f; debugArray[ii].z = 0.0f;
debugArray[ii].w = (float) (11*ii); debugArray[ii].w = (float) (-ii);
} }
#endif #endif
...@@ -1047,12 +1091,52 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu ) ...@@ -1047,12 +1091,52 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] ); amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
LAUNCHERROR("kReducePmeDirectElectrostaticForce"); LAUNCHERROR("kReducePmeDirectElectrostaticForce");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] ); amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
LAUNCHERROR("kReducePmeDirectElectrostaticTorque"); LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
} }
/**---------------------------------------------------------------------------------------
Zero gpu->psForce4
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void zeroForce( amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
gpu->psForce4->_pSysStream[0][ii].x = 0.0f;
gpu->psForce4->_pSysStream[0][ii].y = 0.0f;
gpu->psForce4->_pSysStream[0][ii].z = 0.0f;
}
gpu->psForce4->Upload();
}
/**---------------------------------------------------------------------------------------
Copy gpu->psForce4 to amoebaGpu->psForce
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
{
gpuContext gpu = amoebaGpu->gpuContext;
gpu->psForce4->Download();
int indexOffset = 0;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
amoebaGpu->psForce->_pSysStream[0][indexOffset] = conversion*(gpu->psForce4->_pSysStream[0][ii].x);
amoebaGpu->psForce->_pSysStream[0][indexOffset+1] = conversion*(gpu->psForce4->_pSysStream[0][ii].y);
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] = conversion*(gpu->psForce4->_pSysStream[0][ii].z);
indexOffset += 3;
}
amoebaGpu->psForce->Upload();
}
//#define GET_INDUCED_DIPOLE_FROM_FILE //#define GET_INDUCED_DIPOLE_FROM_FILE
#ifdef GET_INDUCED_DIPOLE_FROM_FILE #ifdef GET_INDUCED_DIPOLE_FROM_FILE
#include <stdlib.h> #include <stdlib.h>
...@@ -1098,11 +1182,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1098,11 +1182,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray"); CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms); memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload(); debugArray->Upload();
unsigned int targetAtom = 10; unsigned int targetAtom = 49;
#endif #endif
#ifdef GET_INDUCED_DIPOLE_FROM_FILE #ifdef GET_INDUCED_DIPOLE_FROM_FILE
std::string fileName = "waterInducedDipole.txt"; //std::string fileName = "waterInducedDipole.txt";
std::string fileName = "water_3_MI.txt";
StringVectorVector fileContents; StringVectorVector fileContents;
readFile( fileName, fileContents ); readFile( fileName, fileContents );
unsigned int offset = 0; unsigned int offset = 0;
...@@ -1152,7 +1237,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1152,7 +1237,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)),
sizeof(PmeDirectElectrostaticParticle) ); sizeof(PmeDirectElectrostaticParticle) );
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock, sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block ); amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
...@@ -1173,9 +1258,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1173,9 +1258,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else { } else {
// gpu->sim.pInteractingWorkUnit,
// amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
...@@ -1196,11 +1278,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1196,11 +1278,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
amoebaGpu->psTorque->Download(); amoebaGpu->psTorque->Download();
debugArray->Download(); debugArray->Download();
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 5; int maxPrint = 5;
float conversion = 1.0f/41.84f; float conversion = 1.0f/41.84f;
float forceSum[3] = { 0.0f, 0.0f, 0.0f}; float forceSum[3] = { 0.0f, 0.0f, 0.0f};
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution conversion=%.5f\n", conversion ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii); (void) fprintf( amoebaGpu->log, "%5d ", ii);
...@@ -1230,6 +1313,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1230,6 +1313,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} }
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
/*
gpu->psEnergy->Download(); gpu->psEnergy->Download();
double energy = 0.0; double energy = 0.0;
for( unsigned int ii = 0; ii < gpu->sim.energyOutputBuffers; ii++ ){ for( unsigned int ii = 0; ii < gpu->sim.energyOutputBuffers; ii++ ){
...@@ -1240,36 +1324,41 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1240,36 +1324,41 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} }
} }
(void) fprintf( amoebaGpu->log,"Force sums: [%16.9e %16.9e %16.9e] Energy=%16.9e\n", forceSum[0], forceSum[1], forceSum[2], energy ); (void) fprintf( amoebaGpu->log,"Force sums: [%16.9e %16.9e %16.9e] Energy=%16.9e\n", forceSum[0], forceSum[1], forceSum[2], energy );
*/
if( 0 ){
(void) fprintf( amoebaGpu->log,"DebugElecAll\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms*gpu->natoms; jj++ ){
if( fabs( debugArray->_pSysStream[0][jj].w - 111.0 ) < 1.0e-04 ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%8d [%16.9e %16.9e %16.9e %16.9e] Enr11\n", jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"%8d [%16.9e %16.9e %16.9e %16.9e] Enr12\n", jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
}
}
}
(void) fprintf( amoebaGpu->log,"\n" );
if( 0 ){ if( 0 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" ); (void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
float torqueSum0[3] = { 0.0f, 0.0f, 0.0f};
float torqueSum1[3] = { 0.0f, 0.0f, 0.0f};
int offset0 = 7*paddedNumberOfAtoms;
int offset1 = 8*paddedNumberOfAtoms;
int offset2 = 7*paddedNumberOfAtoms;
int offset3 = 8*paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){ for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj; int debugIndex = jj;
for( int kk = 0; kk < 6; kk++ ){ if( fabs( debugArray->_pSysStream[0][debugIndex+5*paddedNumberOfAtoms].y ) < 1.0e-10 )continue;
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj, if( jj != targetAtom ){
torqueSum0[0] += debugArray->_pSysStream[0][debugIndex+offset0].x + debugArray->_pSysStream[0][debugIndex+offset1].x;
torqueSum0[1] += debugArray->_pSysStream[0][debugIndex+offset0].y + debugArray->_pSysStream[0][debugIndex+offset1].y;
torqueSum0[2] += debugArray->_pSysStream[0][debugIndex+offset0].z + debugArray->_pSysStream[0][debugIndex+offset1].z;
torqueSum1[0] += debugArray->_pSysStream[0][debugIndex+offset2].x + debugArray->_pSysStream[0][debugIndex+offset3].x;
torqueSum1[1] += debugArray->_pSysStream[0][debugIndex+offset2].y + debugArray->_pSysStream[0][debugIndex+offset3].y;
torqueSum1[2] += debugArray->_pSysStream[0][debugIndex+offset2].z + debugArray->_pSysStream[0][debugIndex+offset3].z;
}
if( jj == 2 ){
offset0 += 2*paddedNumberOfAtoms;
offset1 += 2*paddedNumberOfAtoms;
}
for( int kk = 0; kk < 12; kk++ ){
(void) fprintf( amoebaGpu->log,"%5d %5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj, kk,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w ); debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms; debugIndex += paddedNumberOfAtoms;
} }
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] Sum\n", targetAtom, jj,
torqueSum0[0], torqueSum0[1], torqueSum0[2], torqueSum1[0], torqueSum1[1], torqueSum1[2]);
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
} }
} }
...@@ -1279,9 +1368,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1279,9 +1368,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, gpu->psAtomIndex->_pSysData);
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector );
} }
...@@ -1291,8 +1380,16 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1291,8 +1380,16 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psForce, gpu->psForce4 ); cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psForce, gpu->psForce4 );
} if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
copyForce( amoebaGpu, -1.0f/41.84f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForce", fileId, outputVector );
}
}
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Compute Amoeba electrostatic force & torque using PME Compute Amoeba electrostatic force & torque using PME
...@@ -1303,6 +1400,55 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1303,6 +1400,55 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
{ {
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
zeroForce( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
copyForce( amoebaGpu, -1.0f/41.84 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipForce", fileId, outputVector );
zeroForce( amoebaGpu );
}
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
zeroForce( amoebaGpu );
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
copyForce( amoebaGpu, -1.0f/41.84f );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "yCudaPmeDirectForce", fileId, outputVector );
zeroForce( amoebaGpu );
}
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
zeroForce( amoebaGpu );
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
copyForce( amoebaGpu, -1.0f/41.84f );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeForce", fileId, outputVector );
}
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
copyForce( amoebaGpu, -1.0f/41.84f );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPrePmeForce", fileId, outputVector );
}
//zeroForce( amoebaGpu );
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu ); cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu ); kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
} }
......
...@@ -43,7 +43,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -43,7 +43,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
){ ){
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
int maxPullIndex = 2; int maxPullIndex = 7;
float4 pullBack[12]; float4 pullBack[12];
#endif #endif
...@@ -56,6 +56,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -56,6 +56,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
float totalEnergy = 0.0f; float totalEnergy = 0.0f;
float4 forceTorqueEnergy[3];
float scalingFactors[LastScalingIndex]; float scalingFactors[LastScalingIndex];
...@@ -90,10 +91,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -90,10 +91,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
localParticle.torque[1] = 0.0f; localParticle.torque[1] = 0.0f;
localParticle.torque[2] = 0.0f; localParticle.torque[2] = 0.0f;
scalingFactors[PScaleIndex] = 1.0f;
scalingFactors[DScaleIndex] = 1.0f;
scalingFactors[UScaleIndex] = 1.0f; scalingFactors[UScaleIndex] = 1.0f;
scalingFactors[MScaleIndex] = 1.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y) // Handle diagonals uniquely at 50% efficiency
{ {
...@@ -116,9 +114,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -116,9 +114,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
float force[3];
float3 torque[2];
unsigned int atomJ = y + j; unsigned int atomJ = y + j;
// set scale factors // set scale factors
...@@ -132,9 +127,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -132,9 +127,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
// force // force
float energy; calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j], scalingFactors, forceTorqueEnergy
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j],
scalingFactors, force, torque, &energy
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
...@@ -143,28 +136,25 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)( ...@@ -143,28 +136,25 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag // by setting match flag
unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; if( (atomI != atomJ) && (atomI < cAmoebaSim.numberOfAtoms) && (atomJ < cAmoebaSim.numberOfAtoms) )
{
// add to field at atomI the field due atomJ's charge/dipole/quadrupole localParticle.force[0] += forceTorqueEnergy[0].x;
localParticle.force[1] += forceTorqueEnergy[0].y;
localParticle.force[2] += forceTorqueEnergy[0].z;
localParticle.force[0] += mask ? force[0] : 0.0f; localParticle.torque[0] += forceTorqueEnergy[1].x;
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.torque[1] += forceTorqueEnergy[1].y;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.torque[2] += forceTorqueEnergy[1].z;
localParticle.torque[0] += mask ? torque[0].x : 0.0f; // energy for each diagonal-block ixn included twice, hence factor of 0.5
localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f; totalEnergy += 0.5*forceTorqueEnergy[0].w;
}
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
/* if( atomI == targetAtom || atomJ == targetAtom ){
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if( atomI == targetAtom ){ unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int index = (atomI == targetAtom) ? atomJ : atomI; unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
float blockId = 1.0f; float blockId = 1.0f;
...@@ -174,23 +164,24 @@ if( atomI == targetAtom ){ ...@@ -174,23 +164,24 @@ if( atomI == targetAtom ){
debugArray[index].w = blockId; debugArray[index].w = blockId;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? force[0] : 0.0f; debugArray[index].x = mask ? forceTorqueEnergy[0].x : 0.0f;
debugArray[index].y = mask ? force[1] : 0.0f; debugArray[index].y = mask ? forceTorqueEnergy[0].y : 0.0f;
debugArray[index].z = mask ? force[2] : 0.0f; debugArray[index].z = mask ? forceTorqueEnergy[0].z : 0.0f;
debugArray[index].w = blockId; debugArray[index].w = mask ? forceTorqueEnergy[0].w : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0].x : 0.0f; debugArray[index].x = mask ? forceTorqueEnergy[1].x : 0.0f;
debugArray[index].y = mask ? torque[0].y : 0.0f; debugArray[index].y = mask ? forceTorqueEnergy[1].y : 0.0f;
debugArray[index].z = mask ? torque[0].z : 0.0f; debugArray[index].z = mask ? forceTorqueEnergy[1].z : 0.0f;
debugArray[index].w = mask ? energy : 0.0f; float offsetF = (float)(3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0].x : 0.0f; debugArray[index].x = mask ? forceTorqueEnergy[2].x : 0.0f;
debugArray[index].y = mask ? torque[0].y : 0.0f; debugArray[index].y = mask ? forceTorqueEnergy[2].y : 0.0f;
debugArray[index].z = mask ? torque[0].z : 0.0f; debugArray[index].z = mask ? forceTorqueEnergy[2].z : 0.0f;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x); debugArray[index].w = offsetF;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){ for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
...@@ -290,14 +281,11 @@ if( atomI == targetAtom ){ ...@@ -290,14 +281,11 @@ if( atomI == targetAtom ){
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
if ((flags&(1<<j)) != 0) if( (flags & (1<<j) ) != 0)
{ {
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j; unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx; unsigned int atomJ = y + jIdx;
float force[3];
float3 torque[2];
// set scale factors // set scale factors
if( bExclusionFlag ) if( bExclusionFlag )
...@@ -309,9 +297,8 @@ if( atomI == targetAtom ){ ...@@ -309,9 +297,8 @@ if( atomI == targetAtom ){
// force // force
float energy;
calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx], calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx],
scalingFactors, force, torque, &energy scalingFactors, forceTorqueEnergy
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, pullBack , pullBack
#endif #endif
...@@ -319,41 +306,41 @@ if( atomI == targetAtom ){ ...@@ -319,41 +306,41 @@ if( atomI == targetAtom ){
// check if atoms out-of-bounds // check if atoms out-of-bounds
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; if( (atomI < cAmoebaSim.numberOfAtoms) && (atomJ < cAmoebaSim.numberOfAtoms) )
{
// add force and torque to atom I due atom J // add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f; localParticle.force[0] += forceTorqueEnergy[0].x;
localParticle.force[1] += mask ? force[1] : 0.0f; localParticle.force[1] += forceTorqueEnergy[0].y;
localParticle.force[2] += mask ? force[2] : 0.0f; localParticle.force[2] += forceTorqueEnergy[0].z;
localParticle.torque[0] += mask ? torque[0].x : 0.0f; totalEnergy += forceTorqueEnergy[0].w;
localParticle.torque[1] += mask ? torque[0].y : 0.0f;
localParticle.torque[2] += mask ? torque[0].z : 0.0f;
totalEnergy += mask ? energy : 0.0f; localParticle.torque[0] += forceTorqueEnergy[1].x;
localParticle.torque[1] += forceTorqueEnergy[1].y;
localParticle.torque[2] += forceTorqueEnergy[1].z;
// add force and torque to atom J due atom I // add force and torque to atom J due atom I
if( flags == 0xFFFFFFFF ){ if( flags == 0xFFFFFFFF ){
psA[jIdx].force[0] -= mask ? force[0] : 0.0f; psA[jIdx].force[0] -= forceTorqueEnergy[0].x;
psA[jIdx].force[1] -= mask ? force[1] : 0.0f; psA[jIdx].force[1] -= forceTorqueEnergy[0].y;
psA[jIdx].force[2] -= mask ? force[2] : 0.0f; psA[jIdx].force[2] -= forceTorqueEnergy[0].z;
psA[jIdx].torque[0] += mask ? torque[1].x : 0.0f; psA[jIdx].torque[0] += forceTorqueEnergy[2].x;
psA[jIdx].torque[1] += mask ? torque[1].y : 0.0f; psA[jIdx].torque[1] += forceTorqueEnergy[2].y;
psA[jIdx].torque[2] += mask ? torque[1].z : 0.0f; psA[jIdx].torque[2] += forceTorqueEnergy[2].z;
} else { } else {
sA[threadIdx.x].tempForce[0] = mask ? 0.0f : force[0]; sA[threadIdx.x].tempForce[0] = forceTorqueEnergy[0].x;
sA[threadIdx.x].tempForce[1] = mask ? 0.0f : force[1]; sA[threadIdx.x].tempForce[1] = forceTorqueEnergy[1].y;
sA[threadIdx.x].tempForce[2] = mask ? 0.0f : force[2]; sA[threadIdx.x].tempForce[2] = forceTorqueEnergy[2].z;
sA[threadIdx.x].tempTorque[0] = mask ? 0.0f : torque[1].x; sA[threadIdx.x].tempTorque[0] = forceTorqueEnergy[2].x;
sA[threadIdx.x].tempTorque[1] = mask ? 0.0f : torque[1].y; sA[threadIdx.x].tempTorque[1] = forceTorqueEnergy[2].y;
sA[threadIdx.x].tempTorque[2] = mask ? 0.0f : torque[1].z; sA[threadIdx.x].tempTorque[2] = forceTorqueEnergy[2].z;
if( tgx % 2 == 0 ){ if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] ); sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
...@@ -379,15 +366,58 @@ if( atomI == targetAtom ){ ...@@ -379,15 +366,58 @@ if( atomI == targetAtom ){
psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2]; psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
} }
} }
} } // end of atoms out-of-bounds
} // end of flags&(1<<j block
#ifdef AMOEBA_DEBUG
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) y;
debugArray[index].w = (flags == 0xFFFFFFFF) ? (float) -141.0f : -151.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[0].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[0].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[0].z : 0.0f;
debugArray[index].w = mask ? forceTorqueEnergy[0].w : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[1].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[1].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[1].z : 0.0f;
float offsetF = (float)(3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[2].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[2].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[2].z : 0.0f;
offsetF = (float) (3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} // end of j-loop } // end of j-loop
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
float of; float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
...@@ -441,7 +471,7 @@ if( atomI == targetAtom ){ ...@@ -441,7 +471,7 @@ if( atomI == targetAtom ){
of += sA[threadIdx.x].torque[2]; of += sA[threadIdx.x].torque[2];
outputTorque[offset+2] = of; outputTorque[offset+2] = of;
#else #else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0]; outputForce[offset] = localParticle.force[0];
...@@ -462,7 +492,7 @@ if( atomI == targetAtom ){ ...@@ -462,7 +492,7 @@ if( atomI == targetAtom ){
outputTorque[offset+1] = sA[threadIdx.x].torque[1]; outputTorque[offset+1] = sA[threadIdx.x].torque[1];
outputTorque[offset+2] = sA[threadIdx.x].torque[2]; outputTorque[offset+2] = sA[threadIdx.x].torque[2];
#endif #endif
lasty = y; lasty = y;
} // end of pInteractionFlag block } // end of pInteractionFlag block
......
...@@ -565,9 +565,9 @@ if( fabs(debugArray->_pSysStream[0][jj+3*paddedNumberOfAtoms].x) > 0.0 ){ ...@@ -565,9 +565,9 @@ if( fabs(debugArray->_pSysStream[0][jj+3*paddedNumberOfAtoms].x) > 0.0 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
} }
delete debugArray; delete debugArray;
...@@ -578,9 +578,9 @@ if( fabs(debugArray->_pSysStream[0][jj+3*paddedNumberOfAtoms].x) > 0.0 ){ ...@@ -578,9 +578,9 @@ if( fabs(debugArray->_pSysStream[0][jj+3*paddedNumberOfAtoms].x) > 0.0 ){
std::vector<int> fileId; std::vector<int> fileId;
fileId.push_back( 0 ); fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData);
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
} }
...@@ -590,4 +590,15 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -590,4 +590,15 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
{ {
kCalculateAmoebaPMEFixedMultipoles( amoebaGpu ); kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu ); cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
}
} }
...@@ -193,6 +193,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -193,6 +193,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
fields[2].z = 0.0f; fields[2].z = 0.0f;
fields[2].w = 0.0f; fields[2].w = 0.0f;
} }
/*
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
pullBack[0].x = xr; pullBack[0].x = xr;
pullBack[0].y = yr; pullBack[0].y = yr;
...@@ -204,7 +205,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -204,7 +205,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
pullBack[1].z = bn2; pullBack[1].z = bn2;
pullBack[1].w = exp2a; pullBack[1].w = exp2a;
/*
pullBack[1].x = atomJ.x - atomI.x; pullBack[1].x = atomJ.x - atomI.x;
pullBack[1].y = atomJ.y - atomI.y; pullBack[1].y = atomJ.y - atomI.y;
pullBack[1].z = atomJ.z - atomI.z; pullBack[1].z = atomJ.z - atomI.z;
...@@ -212,8 +212,8 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -212,8 +212,8 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
pullBack[1].x = scale3; pullBack[1].x = scale3;
pullBack[1].y = scale5; pullBack[1].y = scale5;
pullBack[1].z = scale7; pullBack[1].z = scale7;
*/
#endif #endif
*/
} }
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
...@@ -385,8 +385,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -385,8 +385,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply"; static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
static int iteration = 1; static int iteration = 1;
if( 1 && amoebaGpu->log ){ if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: scalingDistanceCutoff=%.5f\n", (void) fprintf( amoebaGpu->log, "%s\n", methodName );
methodName, amoebaGpu->scalingDistanceCutoff );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
...@@ -748,9 +747,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -748,9 +747,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
std::vector<int> fileId; std::vector<int> fileId;
fileId.push_back( iteration ); fileId.push_back( iteration );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
} }
...@@ -780,9 +779,9 @@ fflush( amoebaGpu->log ); ...@@ -780,9 +779,9 @@ fflush( amoebaGpu->log );
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
} }
......
...@@ -122,6 +122,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -122,6 +122,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
fieldPolarSum[1] += mask ? ijField[1].z : 0.0f; fieldPolarSum[1] += mask ? ijField[1].z : 0.0f;
fieldPolarSum[2] += mask ? ijField[2].z : 0.0f; fieldPolarSum[2] += mask ? ijField[2].z : 0.0f;
/*
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+j) == targetAtom ){ if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI; unsigned int index = atomI == targetAtom ? (y+j) : atomI;
...@@ -173,7 +174,6 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -173,7 +174,6 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].z = ijField[indexJ+1][2]; debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag; debugArray[index].w = flag;
/*
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms; index += cAmoebaSim.paddedNumberOfAtoms;
...@@ -189,10 +189,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -189,10 +189,10 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f; debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f; debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f; debugArray[index].w = + 10.0f;
*/
} }
#endif #endif
*/
} }
// Write results // Write results
...@@ -308,6 +308,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){ ...@@ -308,6 +308,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
} }
/*
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI; unsigned int index = atomI == targetAtom ? (y+jIdx) : atomI;
...@@ -360,6 +361,7 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){ ...@@ -360,6 +361,7 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
debugArray[index].w = flag; debugArray[index].w = flag;
} }
#endif #endif
*/
} }
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
......
...@@ -528,8 +528,8 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -528,8 +528,8 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( particles, 9, amoebaGpu->psRotationMatrix, outputVector ); cudaLoadCudaFloatArray( particles, 9, amoebaGpu->psRotationMatrix, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRotationMatrices", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaRotationMatrices", fileId, outputVector );
} }
if( 0 ){ if( 0 ){
...@@ -539,9 +539,9 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -539,9 +539,9 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( particles, 3, amoebaGpu->psLabFrameDipole, outputVector ); cudaLoadCudaFloatArray( particles, 3, amoebaGpu->psLabFrameDipole, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( particles, 9, amoebaGpu->psLabFrameQuadrupole, outputVector ); cudaLoadCudaFloatArray( particles, 9, amoebaGpu->psLabFrameQuadrupole, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRotatedMoments", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaRotatedMoments", fileId, outputVector );
} }
......
...@@ -715,8 +715,8 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -715,8 +715,8 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloat4Array( gpu->natoms, 3, psTempForce, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, psTempForce, outputVector, gpu->psAtomIndex->_pSysData );
cudaWriteVectorOfDoubleVectorsToFile( "CudaVdw", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaVdw", fileId, outputVector );
delete psTempForce; delete psTempForce;
//exit(0); //exit(0);
......
...@@ -631,8 +631,8 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu ) ...@@ -631,8 +631,8 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL );
cudaLoadCudaFloatArray( gpu->natoms, 3, psTempForce, outputVector ); cudaLoadCudaFloatArray( gpu->natoms, 3, psTempForce, outputVector, NULL );
cudaWriteVectorOfDoubleVectorsToFile( "CudaWca", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaWca", fileId, outputVector );
delete psTempForce; delete psTempForce;
//exit(0); //exit(0);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment