Commit 1aff5bcb authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Torques now mapped to forces only once per force calculation

parent 101f206d
...@@ -793,17 +793,12 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) { ...@@ -793,17 +793,12 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
if( data.getHasAmoebaGeneralizedKirkwood() ){ if( data.getHasAmoebaGeneralizedKirkwood() ){
kCalculateObcGbsaBornSum(gpu->gpuContext); kCalculateObcGbsaBornSum(gpu->gpuContext);
kReduceObcGbsaBornSum(gpu->gpuContext); kReduceObcGbsaBornSum(gpu->gpuContext);
//initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psBornRadii, 0.1 );
//initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psObcChain, 0.0 );
} }
// multipoles // multipoles
kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() ); kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() );
//kClearForces(gpu->gpuContext);
//kClearEnergy(gpu->gpuContext);
// GK // GK
if( data.getHasAmoebaGeneralizedKirkwood() ){ if( data.getHasAmoebaGeneralizedKirkwood() ){
......
...@@ -104,7 +104,7 @@ extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std:: ...@@ -104,7 +104,7 @@ extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::
extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu ); extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu ); extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ); extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce );
extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu ); extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu ); extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
......
...@@ -705,7 +705,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu ) ...@@ -705,7 +705,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 0 );
LAUNCHERROR("kReduceElectrostaticTorque"); LAUNCHERROR("kReduceElectrostaticTorque");
} }
...@@ -718,7 +718,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu ) ...@@ -718,7 +718,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce )
{ {
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
...@@ -768,7 +768,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -768,7 +768,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
} }
kClearFields_3( amoebaGpu, 1 ); kClearFields_3( amoebaGpu, 1 );
LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic");
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
...@@ -812,117 +811,10 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -812,117 +811,10 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
} }
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
kReduceTorque( amoebaGpu ); if( addTorqueToForce ){
LAUNCHERROR("kReduceForceTorque"); kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque ); }
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psTorque->Download();
debugArray->Download();
(void) fprintf( amoebaGpu->log, "Finished Electrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// torque
(void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
for( int kk = 0; kk < 8; kk++ ){
float conversion = kk >= 1 && kk <= 8 ? 1.0f/4.184f : 1.0;
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
conversion*debugArray->_pSysData[debugIndex].x, conversion*debugArray->_pSysData[debugIndex].y,
conversion*debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
if( 1 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex1 = jj + paddedNumberOfAtoms;
int debugIndex2 = jj + 5*paddedNumberOfAtoms;
int debugIndex3 = jj + 6*paddedNumberOfAtoms;
int debugIndex4 = jj + 4*paddedNumberOfAtoms;
int debugIndex5 = jj + 7*paddedNumberOfAtoms;
float conversion = 1.0f/4.184f;
int i1,i2;
if( jj < targetAtom ){
i1 = jj;
i2 = targetAtom;
} else {
i1 = targetAtom;
i2 = jj;
}
(void) fprintf( amoebaGpu->log,"%5d %5d %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e F11\n", i1,i2,
conversion*debugArray->_pSysData[debugIndex1].x,
conversion*debugArray->_pSysData[debugIndex1].y,
conversion*debugArray->_pSysData[debugIndex1].z,
conversion*debugArray->_pSysData[debugIndex2].x,
conversion*debugArray->_pSysData[debugIndex2].y,
conversion*debugArray->_pSysData[debugIndex2].z,
conversion*debugArray->_pSysData[debugIndex3].x,
conversion*debugArray->_pSysData[debugIndex3].y,
conversion*debugArray->_pSysData[debugIndex3].z,
conversion*debugArray->_pSysData[debugIndex5].x,
conversion*debugArray->_pSysData[debugIndex5].y,
conversion*debugArray->_pSysData[debugIndex5].z );
}
}
(void) fflush( amoebaGpu->log );
if( 0 ){
(void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
int maxPrint = 12;
for( int ii = 0; ii < gpu->natoms; ii++ ){
// print cpu & gpu reductions
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] );
if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaTorque", fileId, outputVector );
}
}
delete debugArray;
#endif
if( 0 ){ if( 0 ){
std::vector<int> fileId; std::vector<int> fileId;
......
...@@ -41,17 +41,17 @@ static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu ) ...@@ -41,17 +41,17 @@ static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData, 0 );
LAUNCHERROR("kReduceEAndGK_Fields1"); LAUNCHERROR("kReduceEAndGK_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 0 );
LAUNCHERROR("kReduceEAndGK_Fields2"); LAUNCHERROR("kReduceEAndGK_Fields2");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData ); amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData, 0 );
LAUNCHERROR("kReduceEAndGK_Fields3"); LAUNCHERROR("kReduceEAndGK_Fields3");
} }
......
...@@ -39,12 +39,12 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu ) ...@@ -39,12 +39,12 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData, 0 );
LAUNCHERROR("kReduceE_Fields1"); LAUNCHERROR("kReduceE_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 0 );
LAUNCHERROR("kReduceE_Fields2"); LAUNCHERROR("kReduceE_Fields2");
} }
......
...@@ -1531,19 +1531,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA ) ...@@ -1531,19 +1531,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA )
#define METHOD_NAME(a, b) a##N2ByWarp##b #define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaKirkwood.h" #include "kCalculateAmoebaCudaKirkwood.h"
// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque
static void kReduceTorque(amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceTorque");
}
// reduce psWorkArray_1_1 -> dBorn // reduce psWorkArray_1_1 -> dBorn
// reduce psWorkArray_1_2 -> dBornPolar // reduce psWorkArray_1_2 -> dBornPolar
...@@ -1555,13 +1542,13 @@ static void kReduce_dBorn(amoebaGpuContext amoebaGpu ) ...@@ -1555,13 +1542,13 @@ static void kReduce_dBorn(amoebaGpuContext amoebaGpu )
/* /*
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData ); amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn1"); LAUNCHERROR("kReduce_dBorn1");
*/ */
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData ); amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn2"); LAUNCHERROR("kReduce_dBorn2");
} }
...@@ -1841,7 +1828,6 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1841,7 +1828,6 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
} }
kClearFields_1( amoebaGpu ); kClearFields_1( amoebaGpu );
kClearFields_3( amoebaGpu, 6 );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
...@@ -1871,78 +1857,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1871,78 +1857,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
); );
#endif #endif
} }
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaKirkwoodN2Forces");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_1_1->Download();
amoebaGpu->psWorkArray_1_2->Download();
/*
amoebaGpu->psLabFrameDipole->Download();
amoebaGpu->psLabFrameQuadrupole->Download();
amoebaGpu->psInducedDipoleS->Download();
amoebaGpu->psInducedDipolePolarS->Download();
for( int ii = 0; ii < gpu->natoms; ii++ ){
int indexOffset3 = ii*3;
int indexOffset9 = ii*9;
(void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3],
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+1],
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+2],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+1],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+2] );
(void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3],
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+1],
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+2],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+1],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+2] );
}
*/
debugArray->Download();
(void) fprintf( amoebaGpu->log, "Target Info\n" );
(void) fflush( amoebaGpu->log );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d DebugGk\n", targetAtom, jj );
for( int kk = 0; kk < 8; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
#endif
kReduceTorque( amoebaGpu );
if( 0 ){
std::vector<int> fileId;
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, amoebaGpu->psWorkArray_3_1, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkArray_3_1, outputVector, NULL, 1.0f/4.184 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForceTorque", fileId, outputVector );
}
kReduceToBornForcePrefactor( amoebaGpu ); kReduceToBornForcePrefactor( amoebaGpu );
// map torques to forces
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
if( 0 ){ if( 0 ){
std::vector<int> fileId; std::vector<int> fileId;
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
...@@ -1969,15 +1887,5 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1969,15 +1887,5 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
kCalculateAmoebaKirkwoodEDiff( amoebaGpu ); kCalculateAmoebaKirkwoodEDiff( amoebaGpu );
if( 0 ){
std::vector<int> fileId;
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, amoebaGpu->psWorkArray_3_1, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkArray_3_1, outputVector, NULL, 1.0f/4.184 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForceTorquePostEDiff", fileId, outputVector );
}
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
} }
...@@ -54,8 +54,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)( ...@@ -54,8 +54,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
// pWorkArray_3_1 == force // pWorkArray_3_1 == torque
// pWorkArray_3_2 == torque
// pWorkArray_1_1 == dBorn // pWorkArray_1_1 == dBorn
// pWorkArray_1_2 == dBornPolar // pWorkArray_1_2 == dBornPolar
...@@ -220,7 +219,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -220,7 +219,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
cAmoebaSim.pWorkArray_1_2[offset] = of; cAmoebaSim.pWorkArray_1_2[offset] = of;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 ); add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
#else #else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
...@@ -229,7 +228,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -229,7 +228,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum; cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4);
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 ); add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
#endif #endif
} }
...@@ -377,7 +376,7 @@ if( mask || !mask ){ ...@@ -377,7 +376,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = of; cAmoebaSim.pWorkArray_1_2[offset] = of;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 ); add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
offset = y + tgx + warp*cSim.paddedNumberOfAtoms; offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
...@@ -390,7 +389,7 @@ if( mask || !mask ){ ...@@ -390,7 +389,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = of; cAmoebaSim.pWorkArray_1_2[offset] = of;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 ); add3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
#else #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
...@@ -398,7 +397,7 @@ if( mask || !mask ){ ...@@ -398,7 +397,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum; cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 ); add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms; offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
...@@ -406,7 +405,7 @@ if( mask || !mask ){ ...@@ -406,7 +405,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = sA[threadIdx.x].dBornRadiusPolar; cAmoebaSim.pWorkArray_1_2[offset] = sA[threadIdx.x].dBornRadiusPolar;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 ); add3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
#endif #endif
lasty = y; lasty = y;
......
...@@ -936,7 +936,7 @@ static void kReduceTorque( amoebaGpuContext amoebaGpu ) ...@@ -936,7 +936,7 @@ static void kReduceTorque( amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 0 );
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff"); LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff");
} }
...@@ -985,9 +985,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -985,9 +985,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
unsigned int targetAtom = 0; unsigned int targetAtom = 0;
#endif #endif
kClearFields_3( amoebaGpu, 6 );
LAUNCHERROR("kClearFields_3_kCalculateAmoebaCudaKirkwoodEDiff");
static unsigned int threadsPerBlock = 0; static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
...@@ -1047,14 +1044,11 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1047,14 +1044,11 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
amoebaGpu->psWorkArray_3_1->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif #endif
} }
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");
kReduceTorque( amoebaGpu ); // reduce and map torques to forces
LAUNCHERROR("kReduceForceTorque_kCalculateAmoebaCudaKirkwoodEDiff");
// map torques to forces
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque ); cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
......
...@@ -251,12 +251,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -251,12 +251,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque ); add3dArray( 3*offset, localParticle.torque, outputTorque );
#else #else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque ); add3dArray( 3*offset, localParticle.torque, outputTorque );
#endif #endif
...@@ -437,23 +437,22 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -437,23 +437,22 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque ); add3dArray( 3*offset, localParticle.torque, outputTorque );
offset = y + tgx + warp*cSim.paddedNumberOfAtoms; offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, outputTorque ); add3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#else #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque ); add3dArray( 3*offset, localParticle.torque, outputTorque );
offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms; offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque ); add3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#endif #endif
lasty = y; lasty = y;
......
...@@ -418,22 +418,22 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu, ...@@ -418,22 +418,22 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields1"); LAUNCHERROR("kReduceMutualInducedAndGkFields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields2"); LAUNCHERROR("kReduceMutualInducedAndGkFields2");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData ); amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields3"); LAUNCHERROR("kReduceMutualInducedAndGkFields3");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData ); amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields4"); LAUNCHERROR("kReduceMutualInducedAndGkFields4");
} }
......
...@@ -220,12 +220,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl ...@@ -220,12 +220,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceMI_Fields1"); LAUNCHERROR("kReduceMI_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReduceMI_Fields2"); LAUNCHERROR("kReduceMI_Fields2");
} }
......
...@@ -867,15 +867,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel() ...@@ -867,15 +867,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2]; multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5]; multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phidp = &cAmoebaSim.pPhidp[20*i]; float* phidp = &cAmoebaSim.pPhidp[20*i];
cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3] cAmoebaSim.pTorque[3*i] += 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9] + 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
+ multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5] + multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
- multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]); - multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1] cAmoebaSim.pTorque[3*i+1] += 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8] + 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
+ multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6] + multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
- multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]); - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2] cAmoebaSim.pTorque[3*i+2] += 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7] + 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
+ multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8] + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
- multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]); - multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
...@@ -1016,20 +1016,6 @@ void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu) ...@@ -1016,20 +1016,6 @@ void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu)
kComputeFixedMultipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kComputeFixedMultipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeFixedMultipoleForceAndEnergy"); LAUNCHERROR("kComputeFixedMultipoleForceAndEnergy");
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
kReduceForces( gpu );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceOnlyFixed", fileId, outputVector );
kClearForces( gpu );
}
cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque);
} }
/** /**
...@@ -1062,5 +1048,4 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu) ...@@ -1062,5 +1048,4 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedDipoleForceAndEnergy"); LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque );
} }
...@@ -1090,7 +1090,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu ) ...@@ -1090,7 +1090,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 1 );
LAUNCHERROR("kReducePmeDirectElectrostaticTorque"); LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
} }
...@@ -1185,7 +1185,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1185,7 +1185,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces"); LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
kReduceTorque( amoebaGpu ); kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
} }
...@@ -1201,5 +1200,6 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1201,5 +1200,6 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
{ {
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu ); cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu ); kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
} }
...@@ -357,12 +357,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl ...@@ -357,12 +357,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReducePmeMI_Fields1"); LAUNCHERROR("kReducePmeMI_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReducePmeMI_Fields2"); LAUNCHERROR("kReducePmeMI_Fields2");
} }
......
...@@ -487,7 +487,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -487,7 +487,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
// calculate electrostatic forces // calculate electrostatic forces
if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){ if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
cudaComputeAmoebaElectrostatic( amoebaGpu ); cudaComputeAmoebaElectrostatic( amoebaGpu, (hasAmoebaGeneralizedKirkwood ? 0 : 1) );
} else { } else {
cudaComputeAmoebaPmeElectrostatic( amoebaGpu ); cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
} }
......
...@@ -166,7 +166,7 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) ...@@ -166,7 +166,7 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif #endif
void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut ) void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut, int addTo )
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -174,7 +174,7 @@ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuff ...@@ -174,7 +174,7 @@ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuff
while (pos < fieldComponents) while (pos < fieldComponents)
{ {
float totalField = 0.0f; float totalField = addTo ? fieldOut[pos] : 0.0f;
float* pFt = fieldIn + pos; float* pFt = fieldIn + pos;
unsigned int i = outputBuffers; unsigned int i = outputBuffers;
while (i >= 4) while (i >= 4)
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
__global__ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut ); __global__ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut, int addTo );
__global__ void kReduceAndCombineFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2, float* fieldOut ); __global__ void kReduceAndCombineFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2, float* fieldOut );
__global__ void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float4* fieldOut ); __global__ void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float4* fieldOut );
......
...@@ -452,7 +452,7 @@ static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* output ...@@ -452,7 +452,7 @@ static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* output
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceVdw14_7"); LAUNCHERROR("kReduceVdw14_7");
} }
......
...@@ -360,7 +360,7 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>* ...@@ -360,7 +360,7 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceWcaDispersion"); LAUNCHERROR("kReduceWcaDispersion");
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment