Commit f01dc8d9 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Bug fix for MI calculation

parent 35a4b156
...@@ -445,9 +445,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -445,9 +445,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
static int iteration = 1;
int targetAtom = 546; int targetAtom = 546;
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply"; static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
static int iteration = 1;
if( 1 && amoebaGpu->log ){ if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s\n", methodName ); (void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -485,30 +485,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -485,30 +485,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#endif #endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else { } else {
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField"); LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
...@@ -520,8 +507,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -520,8 +507,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
iteration ); (void) fflush( amoebaGpu->log ); iteration ); (void) fflush( amoebaGpu->log );
outputArray->Download(); outputArray->Download();
outputPolarArray->Download(); outputPolarArray->Download();
debugArray->Download(); // debugArray->Download();
int maxPrint = 5; int maxPrint = 5000000;
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii); (void) fprintf( amoebaGpu->log, "%5d ", ii);
...@@ -564,7 +551,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -564,7 +551,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
iteration++; iteration++;
} }
delete debugArray; // delete debugArray;
#endif #endif
} }
...@@ -652,6 +639,18 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -652,6 +639,18 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// post matrix multiply // post matrix multiply
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[0], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[1], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMIPre", fileId, outputVector );
}
kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>( kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData, gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
...@@ -664,11 +663,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -664,11 +663,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
std::vector<int> fileId; std::vector<int> fileId;
fileId.push_back( iteration ); fileId.push_back( iteration );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
// cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
//exit(0);
} }
// get total epsilon -- performing sums on gpu // get total epsilon -- performing sums on gpu
...@@ -762,6 +763,14 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -762,6 +763,14 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psCurrentEpsilon->_pSysData[2], done ); amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
(void) fflush( amoebaGpu->log );
}
#endif #endif
// exit if nan // exit if nan
...@@ -781,7 +790,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -781,7 +790,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, 1.0f ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
......
...@@ -104,13 +104,12 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -104,13 +104,12 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
float4 delta; float4 delta;
float prefactor2; float prefactor2;
if( ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ){ if( ( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ) ){
delta.w = prefactor2 = 0.0f;
} else {
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 ); setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 );
//delta.w = prefactor2 = 0.0f;
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
} }
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
} }
...@@ -118,24 +117,18 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -118,24 +117,18 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
#else #else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
#endif
load3dArray( offset, fieldSum, outputField ); load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar); load3dArray( offset, fieldPolarSum, outputFieldPolar);
#endif
} else { } else {
if (lasty != y) if (lasty != y)
{ {
unsigned int atomJ = y + tgx; unsigned int atomJ = y + tgx;
// load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ ); loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
} }
...@@ -152,58 +145,53 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -152,58 +145,53 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) ); zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) );
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++){
{ if ((flags&(1<<j)) != 0) {
if ((flags&(1<<j)) != 0)
{
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j; unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
float4 delta; float4 delta;
float prefactor2; float prefactor2;
if( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ){ if( (atomI < cSim.atoms) && ((y+jIdx) < cSim.atoms) ){
delta.w = prefactor2 = 0.0f;
} else {
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, &delta, &prefactor2 ); setupMutualInducedFieldPairIxn_kernel( localParticle, psA[jIdx], uscale, &delta, &prefactor2 );
} calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipole, delta, prefactor2, fieldSum ); calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
calculateMutualInducedFieldPairIxn_kernel( psA[jIdx].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
#ifndef INCLUDE_MI_FIELD_BUFFERS #ifndef INCLUDE_MI_FIELD_BUFFERS
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field );
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
#else
if( flags == 0xFFFFFFFF ){
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field ); calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field );
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar ); calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
} else { #else
calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipole, delta, prefactor2, sA[threadIdx.x].tempBuffer ); if( flags == 0xFFFFFFFF ){
calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipolePolar, delta, prefactor2, sA[threadIdx.x].tempBufferP ); calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipole, delta, prefactor2, psA[jIdx].field );
calculateMutualInducedFieldPairIxn_kernel( localParticle.inducedDipolePolar, delta, prefactor2, psA[jIdx].fieldPolar );
if( tgx % 2 == 0 ){ } else {
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] ); calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipole, delta, prefactor2, sA[threadIdx.x].tempBuffer );
} calculateMutualInducedFieldPairIxnNoAdd_kernel( localParticle.inducedDipolePolar, delta, prefactor2, sA[threadIdx.x].tempBufferP );
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] ); if( tgx % 2 == 0 ){
} sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
if( tgx % 8 == 0 ){ }
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] ); if( tgx % 4 == 0 ){
} sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
if( tgx % 16 == 0 ){ }
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] ); if( tgx % 8 == 0 ){
} sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if (tgx == 0) if( tgx % 16 == 0 ){
{ sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
psA[jIdx].field[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0]; }
psA[jIdx].field[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].field[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2]; if (tgx == 0)
{
psA[jIdx].fieldPolar[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0]; psA[jIdx].field[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].fieldPolar[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1]; psA[jIdx].field[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].fieldPolar[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2]; psA[jIdx].field[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].fieldPolar[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].fieldPolar[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].fieldPolar[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
} }
}
#endif #endif
}
} }
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment