Commit 132a94bc authored by Mark Friedrichs
Browse files

Warp/non-warp calls were reversed in kCalculateAmoebaCudaKirkwood

parent 07f8d5ce
...@@ -793,6 +793,8 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) { ...@@ -793,6 +793,8 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
if( data.getHasAmoebaGeneralizedKirkwood() ){ if( data.getHasAmoebaGeneralizedKirkwood() ){
kCalculateObcGbsaBornSum(gpu->gpuContext); kCalculateObcGbsaBornSum(gpu->gpuContext);
kReduceObcGbsaBornSum(gpu->gpuContext); kReduceObcGbsaBornSum(gpu->gpuContext);
//initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psBornRadii, 0.1 );
//initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psObcChain, 0.0 );
} }
// multipoles // multipoles
...@@ -801,7 +803,6 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) { ...@@ -801,7 +803,6 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
//kClearForces(gpu->gpuContext); //kClearForces(gpu->gpuContext);
//kClearEnergy(gpu->gpuContext); //kClearEnergy(gpu->gpuContext);
//(void) fprintf( data.getLog(), "computeAmoebaMultipoleForce clearing forces/energy after kCalculateAmoebaMultipoleForces()\n" );
// GK // GK
......
...@@ -191,12 +191,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -191,12 +191,10 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, "\n\n" ); (void) fprintf( log, "\n\n" );
(void) fprintf( log, " gpuContext %p\n", amoebaGpu->gpuContext ); (void) fprintf( log, " gpuContext %p\n", amoebaGpu->gpuContext );
(void) fprintf( log, " log %p\n", amoebaGpu->log ); (void) fprintf( log, " log %p %s\n", amoebaGpu->log, amoebaGpu->log == stderr ? "is stderr" : "is not stderr");
(void) fprintf( log, " sm_version %u\n", gpu->sm_version ); (void) fprintf( log, " sm_version %u\n", gpu->sm_version );
(void) fprintf( log, " device %u\n", gpu->device ); (void) fprintf( log, " device %u\n", gpu->device );
(void) fprintf( log, " sharedMemoryPerBlock %u\n", gpu->sharedMemoryPerBlock ); (void) fprintf( log, " sharedMemoryPerBlock %u\n", gpu->sharedMemoryPerBlock );
(void) fprintf( log, " pMapArray %p\n", amoebaGpu->pMapArray );
(void) fprintf( log, " dMapArray %p\n", amoebaGpu->dMapArray );
(void) fprintf( log, " bOutputBufferPerWarp %d\n", amoebaGpu->bOutputBufferPerWarp ); (void) fprintf( log, " bOutputBufferPerWarp %d\n", amoebaGpu->bOutputBufferPerWarp );
(void) fprintf( log, " paddedNumberOfAtoms %u\n", amoebaGpu->paddedNumberOfAtoms ); (void) fprintf( log, " paddedNumberOfAtoms %u\n", amoebaGpu->paddedNumberOfAtoms );
(void) fprintf( log, " nonbondBlocks %u\n", amoebaGpu->nonbondBlocks ); (void) fprintf( log, " nonbondBlocks %u\n", amoebaGpu->nonbondBlocks );
...@@ -209,6 +207,13 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -209,6 +207,13 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " outputBuffers %u\n", amoebaGpu->outputBuffers ); (void) fprintf( log, " outputBuffers %u\n", amoebaGpu->outputBuffers );
(void) fprintf( log, " workUnits %u\n", amoebaGpu->workUnits ); (void) fprintf( log, " workUnits %u\n", amoebaGpu->workUnits );
gpuPrintCudaStreamFloat( amoebaGpu->gpuContext->psEnergy, log );
gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psForce4, log );
gpuPrintCudaStreamFloat4( amoebaGpu->gpuContext->psPosq4, log );
gpuPrintCudaStreamFloat2( amoebaGpu->gpuContext->psObcData, log );
gpuPrintCudaStreamFloat( amoebaGpu->gpuContext->psBornForce, log );
(void) fprintf( log, "\n\n" );
(void) fprintf( log, " amoebaBonds %u\n", amoebaGpu->amoebaSim.amoebaBonds );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_1, log ); gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_1, log );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_2, log ); gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_2, log );
gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_3, log ); gpuPrintCudaStreamFloat( amoebaGpu->psWorkArray_3_3, log );
...@@ -337,6 +342,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -337,6 +342,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " quartic %15.7e\n", amoebaGpu->amoebaSim.amoebaUreyBradleyQuarticicParameter); (void) fprintf( log, " quartic %15.7e\n", amoebaGpu->amoebaSim.amoebaUreyBradleyQuarticicParameter);
(void) fprintf( log, " pAmoebaUreyBradleyID %p\n", amoebaGpu->amoebaSim.pAmoebaUreyBradleyID ); (void) fprintf( log, " pAmoebaUreyBradleyID %p\n", amoebaGpu->amoebaSim.pAmoebaUreyBradleyID );
(void) fprintf( log, " pAmoebaUreyBradleyParameter %p\n", amoebaGpu->amoebaSim.pAmoebaUreyBradleyParameter ); (void) fprintf( log, " pAmoebaUreyBradleyParameter %p\n", amoebaGpu->amoebaSim.pAmoebaUreyBradleyParameter );
(void) fprintf( log, "\n\n" );
// if( amoebaGpu->psRotationMatrix)(void) fprintf( log, "\n" ); // if( amoebaGpu->psRotationMatrix)(void) fprintf( log, "\n" );
// gpuPrintCudaStreamFloat( amoebaGpu->psRotationMatrix, log ); // gpuPrintCudaStreamFloat( amoebaGpu->psRotationMatrix, log );
...@@ -394,7 +400,6 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -394,7 +400,6 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipole, log ); gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipole, log );
gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log ); gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log );
gpuPrintCudaStreamFloat( amoebaGpu->psInducedDipolePolar, log );
gpuPrintCudaStreamFloat( amoebaGpu->psCurrentEpsilon, log ); gpuPrintCudaStreamFloat( amoebaGpu->psCurrentEpsilon, log );
(void) fprintf( log, " numberOfSorWorkVectors %u\n", amoebaGpu->numberOfSorWorkVectors); (void) fprintf( log, " numberOfSorWorkVectors %u\n", amoebaGpu->numberOfSorWorkVectors);
...@@ -4437,3 +4442,32 @@ void gpuCopyWorkUnit( amoebaGpuContext amoebaGpu ){ ...@@ -4437,3 +4442,32 @@ void gpuCopyWorkUnit( amoebaGpuContext amoebaGpu ){
} }
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
/**---------------------------------------------------------------------------------------

   Set every entry of a float stream buffer to a single value and upload the result

   Writes initValue into the first numberOfParticles*entriesPerParticle entries of
   array->_pSysStream[0] and then calls array->Upload().
   NOTE(review): Upload() presumably copies that buffer to the device -- confirm
   against the CUDAStream implementation.

   @param numberOfParticles     number of particles
   @param entriesPerParticle    number of entries stored per particle
   @param array                 stream to fill; a null pointer makes the call a no-op
   @param initValue             value written to every entry

   --------------------------------------------------------------------------------------- */

void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle,
                               CUDAStream<float>* array, float initValue )
{
    // ---------------------------------------------------------------------------------------

    // static const std::string methodName = "initializeCudaFloatArray";

    // ---------------------------------------------------------------------------------------

    // without a stream there is nothing to fill or upload
    if( !array ){
        return;
    }

    // entries are stored contiguously (index = entriesPerParticle*particle + entry), so one
    // flat pass covers the same range as a particle/entry double loop; non-positive
    // counts write nothing

    if( numberOfParticles > 0 && entriesPerParticle > 0 ){
        const int totalEntries = numberOfParticles*entriesPerParticle;
        for( int ii = 0; ii < totalEntries; ii++ ){
            array->_pSysStream[0][ii] = initValue;
        }
    }

    // Upload() is still called when no entries were written, as before
    array->Upload();
}
...@@ -151,6 +151,8 @@ extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticl ...@@ -151,6 +151,8 @@ extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticl
extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector ); extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector );
extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order ); extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order );
extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector ); extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
extern void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, float initValue );
extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear ); extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear );
extern void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear ); extern void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear );
......
...@@ -1907,7 +1907,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1907,7 +1907,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 6 ); kClearFields_3( amoebaGpu, 6 );
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0] amoebaGpu->psWorkUnit->_pDevStream[0]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray->_pDevStream[0], targetAtom ); , debugArray->_pDevStream[0], targetAtom );
...@@ -1924,7 +1924,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1924,7 +1924,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
#endif #endif
kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0] amoebaGpu->psWorkUnit->_pDevStream[0]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray->_pDevStream[0], targetAtom ); , debugArray->_pDevStream[0], targetAtom );
......
...@@ -1059,6 +1059,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1059,6 +1059,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle)), maxThreads);
} }
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log && timestep == 1 ){ if( amoebaGpu->log && timestep == 1 ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu" (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
" Ebuf=%u ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n", " Ebuf=%u ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n",
...@@ -1068,6 +1069,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1068,6 +1069,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
//gpuPrintCudaAmoebaGmxSimulation(amoebaGpu, amoebaGpu->log ); //gpuPrintCudaAmoebaGmxSimulation(amoebaGpu, amoebaGpu->log );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
......
...@@ -502,8 +502,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -502,8 +502,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
int targetAtom = 0; int targetAtom = 0;
static const char* methodName = "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply"; static const char* methodName = "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply";
if( 1 && amoebaGpu->log ){ if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: scalingDistanceCutoff=%.5f\n", (void) fprintf( amoebaGpu->log, "%s\n", methodName );
methodName, amoebaGpu->scalingDistanceCutoff );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
...@@ -583,7 +582,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -583,7 +582,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
//printMiFieldBuffer( amoebaGpu, 37 ); //printMiFieldBuffer( amoebaGpu, 37 );
//printMiFieldBuffer( amoebaGpu, 38 ); //printMiFieldBuffer( amoebaGpu, 38 );
if( amoebaGpu->log && iteration == -1 ){ if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
......
...@@ -1390,6 +1390,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1390,6 +1390,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} }
} }
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Compute Amoeba electrostatic force & torque using PME Compute Amoeba electrostatic force & torque using PME
...@@ -1413,7 +1414,7 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1413,7 +1414,7 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
zeroForce( amoebaGpu ); zeroForce( amoebaGpu );
} }
if( 1 ){ if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId; std::vector<int> fileId;
......
...@@ -565,7 +565,23 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -565,7 +565,23 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
if( hasAmoebaGeneralizedKirkwood ){ if( hasAmoebaGeneralizedKirkwood ){
cudaComputeAmoebaFixedEAndGkFields( amoebaGpu ); cudaComputeAmoebaFixedEAndGkFields( amoebaGpu );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, 0.0 );
}
cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu ); cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipoleS, 0.0 );
initializeCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolarS, 0.0 );
amoebaGpu->mutualInducedDone = 1;
}
} else { } else {
if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){ if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
cudaComputeAmoebaFixedEField( amoebaGpu ); cudaComputeAmoebaFixedEField( amoebaGpu );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment