Commit 1aff5bcb authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Torques now mapped to forces only once per force calculation

parent 101f206d
......@@ -793,17 +793,12 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
if( data.getHasAmoebaGeneralizedKirkwood() ){
kCalculateObcGbsaBornSum(gpu->gpuContext);
kReduceObcGbsaBornSum(gpu->gpuContext);
//initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psBornRadii, 0.1 );
//initializeCudaFloatArray( gpu->gpuContext->natoms, 1, gpu->gpuContext->psObcChain, 0.0 );
}
// multipoles
kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() );
//kClearForces(gpu->gpuContext);
//kClearEnergy(gpu->gpuContext);
// GK
if( data.getHasAmoebaGeneralizedKirkwood() ){
......
......@@ -104,7 +104,7 @@ extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::
extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu );
extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce );
extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
......
......@@ -705,7 +705,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 0 );
LAUNCHERROR("kReduceElectrostaticTorque");
}
......@@ -718,7 +718,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
--------------------------------------------------------------------------------------- */
void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce )
{
// ---------------------------------------------------------------------------------------
......@@ -768,7 +768,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
}
kClearFields_3( amoebaGpu, 1 );
LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
......@@ -812,117 +811,10 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
kReduceTorque( amoebaGpu );
LAUNCHERROR("kReduceForceTorque");
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psTorque->Download();
debugArray->Download();
(void) fprintf( amoebaGpu->log, "Finished Electrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// torque
(void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
for( int kk = 0; kk < 8; kk++ ){
float conversion = kk >= 1 && kk <= 8 ? 1.0f/4.184f : 1.0;
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
conversion*debugArray->_pSysData[debugIndex].x, conversion*debugArray->_pSysData[debugIndex].y,
conversion*debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
if( 1 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex1 = jj + paddedNumberOfAtoms;
int debugIndex2 = jj + 5*paddedNumberOfAtoms;
int debugIndex3 = jj + 6*paddedNumberOfAtoms;
int debugIndex4 = jj + 4*paddedNumberOfAtoms;
int debugIndex5 = jj + 7*paddedNumberOfAtoms;
float conversion = 1.0f/4.184f;
int i1,i2;
if( jj < targetAtom ){
i1 = jj;
i2 = targetAtom;
} else {
i1 = targetAtom;
i2 = jj;
}
(void) fprintf( amoebaGpu->log,"%5d %5d %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e F11\n", i1,i2,
conversion*debugArray->_pSysData[debugIndex1].x,
conversion*debugArray->_pSysData[debugIndex1].y,
conversion*debugArray->_pSysData[debugIndex1].z,
conversion*debugArray->_pSysData[debugIndex2].x,
conversion*debugArray->_pSysData[debugIndex2].y,
conversion*debugArray->_pSysData[debugIndex2].z,
conversion*debugArray->_pSysData[debugIndex3].x,
conversion*debugArray->_pSysData[debugIndex3].y,
conversion*debugArray->_pSysData[debugIndex3].z,
conversion*debugArray->_pSysData[debugIndex5].x,
conversion*debugArray->_pSysData[debugIndex5].y,
conversion*debugArray->_pSysData[debugIndex5].z );
}
}
(void) fflush( amoebaGpu->log );
if( 0 ){
(void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
int maxPrint = 12;
for( int ii = 0; ii < gpu->natoms; ii++ ){
// print cpu & gpu reductions
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] );
if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaTorque", fileId, outputVector );
}
}
delete debugArray;
#endif
if( addTorqueToForce ){
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
}
if( 0 ){
std::vector<int> fileId;
......
......@@ -41,17 +41,17 @@ static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData, 0 );
LAUNCHERROR("kReduceEAndGK_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 0 );
LAUNCHERROR("kReduceEAndGK_Fields2");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData );
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData, 0 );
LAUNCHERROR("kReduceEAndGK_Fields3");
}
......
......@@ -39,12 +39,12 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData, 0 );
LAUNCHERROR("kReduceE_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 0 );
LAUNCHERROR("kReduceE_Fields2");
}
......
......@@ -1531,19 +1531,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA )
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaKirkwood.h"
// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque
static void kReduceTorque(amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceTorque");
}
// reduce psWorkArray_1_1 -> dBorn
// reduce psWorkArray_1_2 -> dBornPolar
......@@ -1555,13 +1542,13 @@ static void kReduce_dBorn(amoebaGpuContext amoebaGpu )
/*
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData );
amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn1");
*/
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData );
amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn2");
}
......@@ -1841,7 +1828,6 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
}
kClearFields_1( amoebaGpu );
kClearFields_3( amoebaGpu, 6 );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
......@@ -1871,78 +1857,10 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
);
#endif
}
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodN2Forces");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_1_1->Download();
amoebaGpu->psWorkArray_1_2->Download();
/*
amoebaGpu->psLabFrameDipole->Download();
amoebaGpu->psLabFrameQuadrupole->Download();
amoebaGpu->psInducedDipoleS->Download();
amoebaGpu->psInducedDipolePolarS->Download();
for( int ii = 0; ii < gpu->natoms; ii++ ){
int indexOffset3 = ii*3;
int indexOffset9 = ii*9;
(void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3],
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+1],
amoebaGpu->psLabFrameDipole->_pSysData[indexOffset3+2],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+1],
amoebaGpu->psLabFrameQuadrupole->_pSysData[indexOffset9+2] );
(void) fprintf( amoebaGpu->log, "%5d [%14.7e %14.7e %14.7e] q[%14.7e %14.7e %14.7e]\n", ii,
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3],
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+1],
amoebaGpu->psInducedDipoleS->_pSysData[indexOffset3+2],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+1],
amoebaGpu->psInducedDipolePolarS->_pSysData[indexOffset3+2] );
}
*/
debugArray->Download();
(void) fprintf( amoebaGpu->log, "Target Info\n" );
(void) fflush( amoebaGpu->log );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d DebugGk\n", targetAtom, jj );
for( int kk = 0; kk < 8; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
#endif
kReduceTorque( amoebaGpu );
if( 0 ){
std::vector<int> fileId;
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, amoebaGpu->psWorkArray_3_1, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkArray_3_1, outputVector, NULL, 1.0f/4.184 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForceTorque", fileId, outputVector );
}
kReduceToBornForcePrefactor( amoebaGpu );
// map torques to forces
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
if( 0 ){
std::vector<int> fileId;
VectorOfDoubleVectors outputVector;
......@@ -1969,15 +1887,5 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
kCalculateAmoebaKirkwoodEDiff( amoebaGpu );
if( 0 ){
std::vector<int> fileId;
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, amoebaGpu->psWorkArray_3_1, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkArray_3_1, outputVector, NULL, 1.0f/4.184 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodForceTorquePostEDiff", fileId, outputVector );
}
// ---------------------------------------------------------------------------------------
}
......@@ -54,8 +54,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
// pWorkArray_3_1 == force
// pWorkArray_3_2 == torque
// pWorkArray_3_1 == torque
// pWorkArray_1_1 == dBorn
// pWorkArray_1_2 == dBornPolar
......@@ -220,7 +219,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
cAmoebaSim.pWorkArray_1_2[offset] = of;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
......@@ -229,7 +228,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4);
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
#endif
}
......@@ -377,7 +376,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = of;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
......@@ -390,7 +389,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = of;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
add3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
......@@ -398,7 +397,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
add3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
......@@ -406,7 +405,7 @@ if( mask || !mask ){
cAmoebaSim.pWorkArray_1_2[offset] = sA[threadIdx.x].dBornRadiusPolar;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
add3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
#endif
lasty = y;
......
......@@ -936,7 +936,7 @@ static void kReduceTorque( amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 0 );
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff");
}
......@@ -985,9 +985,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
unsigned int targetAtom = 0;
#endif
kClearFields_3( amoebaGpu, 6 );
LAUNCHERROR("kClearFields_3_kCalculateAmoebaCudaKirkwoodEDiff");
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
......@@ -1047,14 +1044,11 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");
kReduceTorque( amoebaGpu );
LAUNCHERROR("kReduceForceTorque_kCalculateAmoebaCudaKirkwoodEDiff");
// map torques to forces
// reduce and map torques to forces
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
......
......@@ -251,12 +251,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
add3dArray( 3*offset, localParticle.torque, outputTorque );
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque );
add3dArray( 3*offset, localParticle.torque, outputTorque );
#endif
......@@ -437,23 +437,22 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
add3dArray( 3*offset, localParticle.torque, outputTorque );
offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, outputTorque );
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
add3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque );
add3dArray( 3*offset, localParticle.torque, outputTorque );
offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
add3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#endif
lasty = y;
......
......@@ -418,22 +418,22 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields2");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData );
amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields3");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData );
amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData, 0 );
LAUNCHERROR("kReduceMutualInducedAndGkFields4");
}
......
......@@ -220,12 +220,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceMI_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReduceMI_Fields2");
}
......
......@@ -867,15 +867,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
multipole[8] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+2];
multipole[9] = 2*cAmoebaSim.pLabFrameQuadrupole[i*9+5];
float* phidp = &cAmoebaSim.pPhidp[20*i];
cAmoebaSim.pTorque[3*i] = 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
cAmoebaSim.pTorque[3*i] += 0.5f*cAmoebaSim.electric*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+ 2.0f*(multipole[6]-multipole[5])*zscale*zscale*phidp[9]
+ multipole[8]*yscale*yscale*phidp[7] + multipole[9]*xscale*yscale*phidp[5]
- multipole[7]*yscale*zscale*phidp[8] - multipole[9]*xscale*zscale*phidp[6]);
cAmoebaSim.pTorque[3*i+1] = 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
cAmoebaSim.pTorque[3*i+1] += 0.5f*cAmoebaSim.electric*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+ 2.0f*(multipole[4]-multipole[6])*zscale*zscale*phidp[8]
+ multipole[7]*zscale*zscale*phidp[9] + multipole[8]*xscale*zscale*phidp[6]
- multipole[8]*xscale*xscale*phidp[4] - multipole[9]*yscale*yscale*phidp[7]);
cAmoebaSim.pTorque[3*i+2] = 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
cAmoebaSim.pTorque[3*i+2] += 0.5f*cAmoebaSim.electric*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+ 2.0f*(multipole[5]-multipole[4])*yscale*yscale*phidp[7]
+ multipole[7]*xscale*xscale*phidp[4] + multipole[9]*yscale*zscale*phidp[8]
- multipole[7]*xscale*yscale*phidp[5] - multipole[8]*zscale*zscale*phidp[9]);
......@@ -1016,20 +1016,6 @@ void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu)
kComputeFixedMultipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeFixedMultipoleForceAndEnergy");
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
kReduceForces( gpu );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceOnlyFixed", fileId, outputVector );
kClearForces( gpu );
}
cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque);
}
/**
......@@ -1062,5 +1048,4 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque );
}
......@@ -1090,7 +1090,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData, 1 );
LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
}
......@@ -1185,7 +1185,6 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
}
......@@ -1201,5 +1200,6 @@ void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
{
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
}
......@@ -357,12 +357,12 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReducePmeMI_Fields1");
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReducePmeMI_Fields2");
}
......
......@@ -487,7 +487,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
// calculate electrostatic forces
if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
cudaComputeAmoebaElectrostatic( amoebaGpu );
cudaComputeAmoebaElectrostatic( amoebaGpu, (hasAmoebaGeneralizedKirkwood ? 0 : 1) );
} else {
cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
}
......
......@@ -166,7 +166,7 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut )
void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut, int addTo )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -174,7 +174,7 @@ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuff
while (pos < fieldComponents)
{
float totalField = 0.0f;
float totalField = addTo ? fieldOut[pos] : 0.0f;
float* pFt = fieldIn + pos;
unsigned int i = outputBuffers;
while (i >= 4)
......
......@@ -3,7 +3,7 @@
#include "amoebaCudaKernels.h"
__global__ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut );
__global__ void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut, int addTo );
__global__ void kReduceAndCombineFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2, float* fieldOut );
__global__ void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float4* fieldOut );
......
......@@ -452,7 +452,7 @@ static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* output
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceVdw14_7");
}
......
......@@ -360,7 +360,7 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceWcaDispersion");
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment