Commit 4a1e9683 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Optimzations and bug fix for mapping of torque to force for small molecules

parent 44bfd168
...@@ -57,6 +57,7 @@ extern "C" OPENMMCUDA_EXPORT void registerKernelFactories() { ...@@ -57,6 +57,7 @@ extern "C" OPENMMCUDA_EXPORT void registerKernelFactories() {
platform.registerKernelFactory(CalcAmoebaGeneralizedKirkwoodForceKernel::Name(), factory); platform.registerKernelFactory(CalcAmoebaGeneralizedKirkwoodForceKernel::Name(), factory);
platform.registerKernelFactory(CalcAmoebaVdwForceKernel::Name(), factory); platform.registerKernelFactory(CalcAmoebaVdwForceKernel::Name(), factory);
platform.registerKernelFactory(CalcAmoebaWcaDispersionForceKernel::Name(), factory); platform.registerKernelFactory(CalcAmoebaWcaDispersionForceKernel::Name(), factory);
platform.registerKernelFactory(CalcAmoebaForcesAndEnergyKernel::Name(), factory);
} }
} }
} }
...@@ -138,5 +139,8 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl ...@@ -138,5 +139,8 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
if (name == CalcAmoebaUreyBradleyForceKernel::Name()) if (name == CalcAmoebaUreyBradleyForceKernel::Name())
return new CudaCalcAmoebaUreyBradleyForceKernel(name, platform, *amoebaCudaData, context.getSystem()); return new CudaCalcAmoebaUreyBradleyForceKernel(name, platform, *amoebaCudaData, context.getSystem());
if (name == CalcAmoebaForcesAndEnergyKernel::Name())
return new CalcAmoebaForcesAndEnergyKernel(name, platform, *amoebaCudaData);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str()); throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
} }
...@@ -44,6 +44,50 @@ extern "C" int gpuSetConstants( gpuContext gpu ); ...@@ -44,6 +44,50 @@ extern "C" int gpuSetConstants( gpuContext gpu );
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
void CalcAmoebaForcesAndEnergyKernel::initialize(const System& system) {
}
void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
//fprintf( stderr, "In CalcAmoebaForcesAndEnergyKernel::beginComputation computeForceCount=%d inbMethod=%d GBSA=%d includeForces=%d includeEnergy=%d\n",
// data.cudaPlatformData.computeForceCount, data.cudaPlatformData.nonbondedMethod, data.getAmoebaGpu()->gpuContext->bIncludeGBSA, includeForces, includeEnergy ); fflush( stderr );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
if (data.cudaPlatformData.nonbondedMethod != NO_CUTOFF && data.cudaPlatformData.computeForceCount%100 == 0){
gpuReorderAtoms(gpu);
}
data.cudaPlatformData.computeForceCount++;
if( gpu->bIncludeGBSA ){
kClearBornSumAndForces(gpu);
} else if (includeForces){
kClearForces(gpu);
}
if (includeEnergy)
kClearEnergy(gpu);
}
double CalcAmoebaForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
//fprintf( stderr, "IN CalcAmoebaForcesAndEnergyKernel::finishComputation\n" ); fflush( stderr );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
if( includeForces ){
kReduceForces(gpu);
}
double energy = 0.0;
if( includeEnergy ){
energy = kReduceEnergy(gpu);
}
return energy;
}
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* Calculates bonded forces * * Calculates bonded forces *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
......
...@@ -33,6 +33,49 @@ ...@@ -33,6 +33,49 @@
namespace OpenMM { namespace OpenMM {
/**
* This kernel is invoked at the beginning and end of force and energy computations. It gives the
* Platform a chance to clear buffers and do other initialization at the beginning, and to do any
* necessary work at the end to determine the final results.
*/
class CalcAmoebaForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
public:
CalcAmoebaForcesAndEnergyKernel(std::string name, const Platform& platform, AmoebaCudaData& data) : CalcForcesAndEnergyKernel(name, platform), data(data) {
}
/***
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
* any ForceImpl.
*
* @param context the context in which to execute this kernel
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
*/
void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy);
/**
* This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
* every ForceImpl.
*
* @param context the context in which to execute this kernel
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @return the potential energy of the system. This value is added to all values returned by ForceImpls'
* calcForcesAndEnergy() methods. That is, each force kernel may <i>either</i> return its contribution to the
* energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
*/
double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy);
private:
AmoebaCudaData& data;
};
/** /**
* This kernel is invoked by AmoebaHarmonicBondForce to calculate the forces acting on the system and the energy of the system. * This kernel is invoked by AmoebaHarmonicBondForce to calculate the forces acting on the system and the energy of the system.
*/ */
......
...@@ -358,16 +358,15 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -358,16 +358,15 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log );
(void) fprintf( log, " pMultipoleParticlesIdsAndAxisType %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType); (void) fprintf( log, " pMultipoleParticlesIdsAndAxisType %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType);
(void) fprintf( log, " maxTorqueBufferIndex %d\n", amoebaGpu->maxTorqueBufferIndex ); (void) fprintf( log, " maxTorqueBufferIndex %d\n", amoebaGpu->amoebaSim.maxTorqueBufferIndex );
totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log );
int memory = gpuPrintCudaStreamFloat4( amoebaGpu->psTorqueMapForce4, log ); int memory = gpuPrintCudaStreamFloat4( amoebaGpu->psTorqueMapForce4, log );
(void) fprintf( log, " torqueMapForce4Delete %d\n", amoebaGpu->torqueMapForce4Delete );
if( amoebaGpu->torqueMapForce4Delete )totalMemory += memory; if( amoebaGpu->torqueMapForce4Delete )totalMemory += memory;
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
(void) fprintf( log, " psMultipoleParticlesTorqueBufferIndices %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesTorqueBufferIndices);
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log );
(void) fprintf( log, " pMolecularDipole %p\n", amoebaGpu->amoebaSim.pMolecularDipole); (void) fprintf( log, " pMolecularDipole %p\n", amoebaGpu->amoebaSim.pMolecularDipole);
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log );
...@@ -1341,7 +1340,6 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu ) ...@@ -1341,7 +1340,6 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )
static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu ) static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu )
{ {
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static const std::string methodName = "gpuFixedEFieldAllocate"; static const std::string methodName = "gpuFixedEFieldAllocate";
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
...@@ -1869,7 +1867,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1869,7 +1867,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
} }
amoebaGpu->maxTorqueBufferIndex = maxTorqueBufferIndex; amoebaGpu->amoebaSim.maxTorqueBufferIndex = maxTorqueBufferIndex;
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Max axis count=%d\n", maxTorqueBufferIndex ); (void) fprintf( amoebaGpu->log, "Max axis count=%d\n", maxTorqueBufferIndex );
std::string axisLabel[maxAxisType+1] = { "ZThenX", "Bisector", "ZBisect", "ThreeFold", "ZOnly", "NoAxisType", "Unknown"}; std::string axisLabel[maxAxisType+1] = { "ZThenX", "Bisector", "ZBisect", "ThreeFold", "ZOnly", "NoAxisType", "Unknown"};
...@@ -2812,8 +2810,8 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener ...@@ -2812,8 +2810,8 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener
// use the Cuda force output buffers for mapping torques onto forces, if max torque buffer count < number of buffers // use the Cuda force output buffers for mapping torques onto forces, if max torque buffer count < number of buffers
if( amoebaGpu->maxTorqueBufferIndex > outputBuffers ){ if( amoebaGpu->amoebaSim.maxTorqueBufferIndex > outputBuffers ){
amoebaGpu->psTorqueMapForce4 = new CUDAStream<float4>(paddedNumberOfAtoms*amoebaGpu->maxTorqueBufferIndex, 1, "torqueMapForce"); amoebaGpu->psTorqueMapForce4 = new CUDAStream<float4>(paddedNumberOfAtoms, amoebaGpu->amoebaSim.maxTorqueBufferIndex, "torqueMapForce");
amoebaGpu->torqueMapForce4Delete = 1; amoebaGpu->torqueMapForce4Delete = 1;
} else { } else {
amoebaGpu->psTorqueMapForce4 = amoebaGpu->gpuContext->psForce4; amoebaGpu->psTorqueMapForce4 = amoebaGpu->gpuContext->psForce4;
...@@ -3013,9 +3011,11 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu ) ...@@ -3013,9 +3011,11 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#else #else
if( debugOn && amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d cells w/ exclusions\n", //if( debugOn && amoebaGpu->log ){
methodName.c_str(), numWithScalingIndices ); (void) fprintf( amoebaGpu->log, "%s %d cells w/ exclusions out of %d\n",
methodName.c_str(), numWithScalingIndices, cells );
if( debugOn ){
for (unsigned int ii = 0; ii < cells; ii++) for (unsigned int ii = 0; ii < cells; ii++)
{ {
unsigned int x, y, exclusion; unsigned int x, y, exclusion;
...@@ -3025,6 +3025,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu ) ...@@ -3025,6 +3025,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
psScalingIndicesIndex->_pSysData[ii] ); psScalingIndicesIndex->_pSysData[ii] );
} }
} }
}
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#endif #endif
...@@ -4471,3 +4472,32 @@ void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream ...@@ -4471,3 +4472,32 @@ void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream
outputStream->Upload(); outputStream->Upload();
} }
/**----------------------------------------------------------------------------------------
Reduce and copy CUDAStream<float> stream to CUDAStream<float> with reduction
@param streamToCopy float4 stream to copy
@param outputStream output stream
@param conversion conversion factor
--------------------------------------------------------------------------------------- */
void reduceAndCopyCUDAStreamFloat( CUDAStream<float>* streamToCopy, CUDAStream<float>* outputStream, float conversion )
{
streamToCopy->Download();
for( unsigned int ii = 0; ii < streamToCopy->_stride; ii++ ){
outputStream->_pSysData[ii] = streamToCopy->_pSysStream[0][ii];
if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %15.7e %u %u\n", ii, streamToCopy->_pSysStream[0][ii], streamToCopy->_stride, streamToCopy->_subStreams );
for( int jj = 1; jj < streamToCopy->_subStreams; jj++ ){
if( streamToCopy->_pSysStream[jj][ii] != streamToCopy->_pSysStream[jj][ii] ){
(void) fprintf( stderr, "Nan at particle=%d stream=%d\n", ii, jj );
}
outputStream->_pSysData[ii] += streamToCopy->_pSysStream[jj][ii];
if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %d %15.7e %15.7e\n", ii, jj, streamToCopy->_pSysStream[jj][ii], outputStream->_pSysData[ii] );
}
outputStream->_pSysData[ii] *= conversion;
}
outputStream->Upload();
}
...@@ -166,6 +166,7 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int ...@@ -166,6 +166,7 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration); extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy ); extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy );
extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>* outputStream, float conversion ); extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>* outputStream, float conversion );
extern void reduceAndCopyCUDAStreamFloat( CUDAStream<float>* streamToCopy, CUDAStream<float>* outputStream, float conversion );
// PME // PME
......
...@@ -140,6 +140,7 @@ struct cudaAmoebaGmxSimulation { ...@@ -140,6 +140,7 @@ struct cudaAmoebaGmxSimulation {
int4* pMultipoleParticlesIdsAndAxisType; int4* pMultipoleParticlesIdsAndAxisType;
int4* pMultipoleParticlesTorqueBufferIndices; int4* pMultipoleParticlesTorqueBufferIndices;
int maxTorqueBufferIndex;
float4* pTorqueMapForce4; float4* pTorqueMapForce4;
float* pMolecularDipole; float* pMolecularDipole;
......
...@@ -105,7 +105,6 @@ struct _amoebaGpuContext { ...@@ -105,7 +105,6 @@ struct _amoebaGpuContext {
// buffer indices used for mapping torques onto forces // buffer indices used for mapping torques onto forces
int maxTorqueBufferIndex;
int torqueMapForce4Delete; int torqueMapForce4Delete;
CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices; CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
CUDAStream<float4>* psTorqueMapForce4; CUDAStream<float4>* psTorqueMapForce4;
......
...@@ -35,62 +35,7 @@ static int const PScaleIndex = 0; ...@@ -35,62 +35,7 @@ static int const PScaleIndex = 0;
static int const DScaleIndex = 1; static int const DScaleIndex = 1;
static int const UScaleIndex = 2; static int const UScaleIndex = 2;
static int const MScaleIndex = 3; static int const MScaleIndex = 3;
static int const Scale3Index = 4; static int const LastScalingIndex = 4;
static int const Scale5Index = 5;
static int const Scale7Index = 6;
static int const Scale9Index = 7;
static int const Ddsc30Index = 8;
//static int const Ddsc31Index = 9;
//static int const Ddsc32Index = 10;
static int const Ddsc50Index = 11;
//static int const Ddsc51Index = 12;
//static int const Ddsc52Index = 13;
static int const Ddsc70Index = 14;
//static int const Ddsc71Index = 15;
//static int const Ddsc72Index = 16;
static int const LastScalingIndex = 17;
#define DOT3_4(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#define MATRIXDOT31(u,v) u[0]*v[0] + u[1]*v[1] + u[2]*v[2] + \
u[3]*v[3] + u[4]*v[4] + u[5]*v[5] + \
u[6]*v[6] + u[7]*v[7] + u[8]*v[8]
#define DOT31(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#define i35 0.257142857f
#define one 1.0f
__device__ void acrossProductVector3( float* vectorX, float* vectorY, float* vectorZ ){
vectorZ[0] = vectorX[1]*vectorY[2] - vectorX[2]*vectorY[1];
vectorZ[1] = vectorX[2]*vectorY[0] - vectorX[0]*vectorY[2];
vectorZ[2] = vectorX[0]*vectorY[1] - vectorX[1]*vectorY[0];
}
__device__ void amatrixProductVector3( float* matrixX, float* vectorY, float* vectorZ ){
vectorZ[0] = matrixX[0]*vectorY[0] + matrixX[3]*vectorY[1] + matrixX[6]*vectorY[2];
vectorZ[1] = matrixX[1]*vectorY[0] + matrixX[4]*vectorY[1] + matrixX[7]*vectorY[2];
vectorZ[2] = matrixX[2]*vectorY[0] + matrixX[5]*vectorY[1] + matrixX[8]*vectorY[2];
}
__device__ void amatrixCrossProductMatrix3( float* matrixX, float* matrixY, float* vectorZ ){
float* xPtr[3];
float* yPtr[3];
xPtr[0] = matrixX;
xPtr[1] = matrixX + 3;
xPtr[2] = matrixX + 6;
yPtr[0] = matrixY;
yPtr[1] = matrixY + 3;
yPtr[2] = matrixY + 6;
vectorZ[0] = DOT31( xPtr[1], yPtr[2] ) - DOT31( xPtr[2], yPtr[1] );
vectorZ[1] = DOT31( xPtr[2], yPtr[0] ) - DOT31( xPtr[0], yPtr[2] );
vectorZ[2] = DOT31( xPtr[0], yPtr[1] ) - DOT31( xPtr[1], yPtr[0] );
}
struct ElectrostaticParticle { struct ElectrostaticParticle {
...@@ -124,17 +69,26 @@ struct ElectrostaticParticle { ...@@ -124,17 +69,26 @@ struct ElectrostaticParticle {
float force[3]; float force[3];
float torque[3]; //float torque[3];
float padding; //float padding;
}; };
__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ, #ifdef Original
float* scalingFactors, float4* outputForce, float4 outputTorque[2]
#ifdef AMOEBA_DEBUG #define i35 0.257142857f
,float4* debugArray #define DOT3_4(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#endif
){ #define MATRIXDOT31(u,v) u[0]*v[0] + u[1]*v[1] + u[2]*v[2] + \
u[3]*v[3] + u[4]*v[4] + u[5]*v[5] + \
u[6]*v[6] + u[7]*v[7] + u[8]*v[8]
#define DOT31(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#define one 1.0f
__device__ void calculateElectrostaticPairIxnOrig_kernel( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ,
float* scalingFactors, float4* outputForce, float4 outputTorque[2]){
float deltaR[3]; float deltaR[3];
...@@ -293,37 +247,6 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato ...@@ -293,37 +247,6 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato
float ei = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2); float ei = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2);
outputForce->w = em+ei; outputForce->w = em+ei;
#ifdef AMOEBA_DEBUG
#if 0
if( 1 ){
int debugIndex = 0;
debugArray[debugIndex].x = em;
debugArray[debugIndex].y = ei;
debugArray[debugIndex].z = rr1;
debugArray[debugIndex].w = rr3;
debugIndex++;
debugArray[debugIndex].x = gl0;
debugArray[debugIndex].y = gl1;
debugArray[debugIndex].z = gl6;
debugArray[debugIndex].w = gl2;
debugIndex++;
debugArray[debugIndex].x = gli1;
debugArray[debugIndex].y = gli3;
debugArray[debugIndex].z = gli2;
debugArray[debugIndex].w = gli7;
debugIndex++;
debugArray[debugIndex].x = psc0;
debugArray[debugIndex].y = psc1;
debugArray[debugIndex].z = psc2;
debugArray[debugIndex].w = scalingFactors[MScaleIndex];
}
#endif
#endif
float temp1[3],temp2[3],temp3[3]; float temp1[3],temp2[3],temp3[3];
float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3]; float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3];
amatrixProductVector3( atomI.labFrameQuadrupole, atomJ.labFrameDipole, qIdJ );//MK amatrixProductVector3( atomI.labFrameQuadrupole, atomJ.labFrameDipole, qIdJ );//MK
...@@ -528,99 +451,6 @@ if( 1 ){ ...@@ -528,99 +451,6 @@ if( 1 ){
} }
#ifdef AMOEBA_DEBUG
if( 0 ){
int debugIndex = 0;
debugArray[debugIndex].x = scalingFactors[DScaleIndex];
debugArray[debugIndex].y = scalingFactors[PScaleIndex];
debugArray[debugIndex].z = scalingFactors[MScaleIndex];
debugArray[debugIndex].w = scalingFactors[UScaleIndex];
debugIndex++;
debugArray[debugIndex].x = ftm2i_0 + (fridmp_0 + findmp_0);
debugArray[debugIndex].y = ftm2i_1 + (fridmp_1 + findmp_1);
debugArray[debugIndex].z = ftm2i_2 + (fridmp_2 + findmp_2);
debugArray[debugIndex].w = 1.5;
/*
debugIndex++;
debugArray[debugIndex].x = temp2[0];
debugArray[debugIndex].y = temp2[1];
debugArray[debugIndex].z = temp2[2];
debugArray[debugIndex].w = 2.0f;
debugIndex++;
debugArray[debugIndex].x = temp3[0];
debugArray[debugIndex].y = temp3[1];
debugArray[debugIndex].z = temp3[2];
debugArray[debugIndex].w = 3.0f;
debugIndex++;
debugArray[debugIndex].x = temp4[0];
debugArray[debugIndex].y = temp4[1];
debugArray[debugIndex].z = temp4[2];
debugArray[debugIndex].w = 4.0f;
debugIndex++;
debugArray[debugIndex].x = temp5[0];
debugArray[debugIndex].y = temp5[1];
debugArray[debugIndex].z = temp5[2];
debugArray[debugIndex].w = 5.0f;
debugIndex++;
debugArray[debugIndex].x = temp6[0];
debugArray[debugIndex].y = temp6[1];
debugArray[debugIndex].z = temp6[2];
debugArray[debugIndex].w = 6.0f;
debugIndex++;
debugArray[debugIndex].x = temp14[0];
debugArray[debugIndex].y = temp14[1];
debugArray[debugIndex].z = temp14[2];
debugArray[debugIndex].w = 14.0f;
debugIndex++;
debugArray[debugIndex].x = temp7[0];
debugArray[debugIndex].y = temp7[1];
debugArray[debugIndex].z = temp7[2];
debugArray[debugIndex].w = 7.0f;
debugIndex++;
debugArray[debugIndex].x = temp8[0];
debugArray[debugIndex].y = temp8[1];
debugArray[debugIndex].z = temp8[2];
debugArray[debugIndex].w = 8.0f;
debugIndex++;
debugArray[debugIndex].x = rr3;
debugArray[debugIndex].y = gf3;
debugArray[debugIndex].z = gf6;
debugArray[debugIndex].w = 20.0f;
debugIndex++;
debugArray[debugIndex].x = gf4;
debugArray[debugIndex].y = gf7;
debugArray[debugIndex].z = 0.0f;
debugArray[debugIndex].w = 21.0f;
debugIndex++;
debugArray[debugIndex].x = atomJ.labFrameDipole[0];
debugArray[debugIndex].y = atomJ.labFrameDipole[1];
debugArray[debugIndex].z = atomJ.labFrameDipole[2];
debugArray[debugIndex].w = 22.0f;
debugIndex++;
debugArray[debugIndex].x = deltaR[0];
debugArray[debugIndex].y = deltaR[1];
debugArray[debugIndex].z = deltaR[2];
debugArray[debugIndex].w = 23.0f;
*/
}
#endif
outputForce->x = -(ftm2_0 + ftm2i_0); outputForce->x = -(ftm2_0 + ftm2i_0);
outputForce->y = -(ftm2_1 + ftm2i_1); outputForce->y = -(ftm2_1 + ftm2i_1);
outputForce->z = -(ftm2_2 + ftm2i_2); outputForce->z = -(ftm2_2 + ftm2i_2);
...@@ -636,50 +466,124 @@ int debugIndex = 0; ...@@ -636,50 +466,124 @@ int debugIndex = 0;
return; return;
} }
#endif
static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle* sA, unsigned int atomI ){
__device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsigned int atomI,
float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
{
// coordinates & charge // coordinates & charge
sA->x = atomCoord[atomI].x; sA->x = cSim.pPosq[atomI].x;
sA->y = atomCoord[atomI].y; sA->y = cSim.pPosq[atomI].y;
sA->z = atomCoord[atomI].z; sA->z = cSim.pPosq[atomI].z;
sA->q = atomCoord[atomI].w; sA->q = cSim.pPosq[atomI].w;
// lab dipole // lab dipole
sA->labFrameDipole[0] = labFrameDipoleJ[atomI*3]; sA->labFrameDipole[0] = cAmoebaSim.pLabFrameDipole[atomI*3];
sA->labFrameDipole[1] = labFrameDipoleJ[atomI*3+1]; sA->labFrameDipole[1] = cAmoebaSim.pLabFrameDipole[atomI*3+1];
sA->labFrameDipole[2] = labFrameDipoleJ[atomI*3+2]; sA->labFrameDipole[2] = cAmoebaSim.pLabFrameDipole[atomI*3+2];
// lab quadrupole // lab quadrupole
sA->labFrameQuadrupole[0] = labQuadrupole[atomI*9]; sA->labFrameQuadrupole[0] = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
sA->labFrameQuadrupole[1] = labQuadrupole[atomI*9+1]; sA->labFrameQuadrupole[1] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
sA->labFrameQuadrupole[2] = labQuadrupole[atomI*9+2]; sA->labFrameQuadrupole[2] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
sA->labFrameQuadrupole[3] = labQuadrupole[atomI*9+3]; sA->labFrameQuadrupole[3] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
sA->labFrameQuadrupole[4] = labQuadrupole[atomI*9+4]; sA->labFrameQuadrupole[4] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
sA->labFrameQuadrupole[5] = labQuadrupole[atomI*9+5]; sA->labFrameQuadrupole[5] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
sA->labFrameQuadrupole[6] = labQuadrupole[atomI*9+6]; sA->labFrameQuadrupole[6] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
sA->labFrameQuadrupole[7] = labQuadrupole[atomI*9+7]; sA->labFrameQuadrupole[7] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
sA->labFrameQuadrupole[8] = labQuadrupole[atomI*9+8]; sA->labFrameQuadrupole[8] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
// induced dipole // induced dipole
sA->inducedDipole[0] = inducedDipole[atomI*3]; sA->inducedDipole[0] = cAmoebaSim.pInducedDipole[atomI*3];
sA->inducedDipole[1] = inducedDipole[atomI*3+1]; sA->inducedDipole[1] = cAmoebaSim.pInducedDipole[atomI*3+1];
sA->inducedDipole[2] = inducedDipole[atomI*3+2]; sA->inducedDipole[2] = cAmoebaSim.pInducedDipole[atomI*3+2];
// induced dipole polar // induced dipole polar
sA->inducedDipoleP[0] = inducedDipolePolar[atomI*3]; sA->inducedDipoleP[0] = cAmoebaSim.pInducedDipolePolar[atomI*3];
sA->inducedDipoleP[1] = inducedDipolePolar[atomI*3+1]; sA->inducedDipoleP[1] = cAmoebaSim.pInducedDipolePolar[atomI*3+1];
sA->inducedDipoleP[2] = inducedDipolePolar[atomI*3+2]; sA->inducedDipoleP[2] = cAmoebaSim.pInducedDipolePolar[atomI*3+2];
sA->damp = dampingFactorAndThole[atomI].x; sA->damp = cAmoebaSim.pDampingFactorAndThole[atomI].x;
sA->thole = dampingFactorAndThole[atomI].y; sA->thole = cAmoebaSim.pDampingFactorAndThole[atomI].y;
}
static __device__ void zeroElectrostaticParticle( struct ElectrostaticParticle* sA ){
// coordinates & charge
sA->force[0] = 0.0f;
sA->force[1] = 0.0f;
sA->force[2] = 0.0f;
/*
sA->torque[0] = 0.0f;
sA->torque[1] = 0.0f;
sA->torque[2] = 0.0f;
*/
}
#undef SUB_METHOD_NAME
#undef F1
#define SUB_METHOD_NAME(a, b) a##F1##b
#define F1
#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef F1
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef F2
#define SUB_METHOD_NAME(a, b) a##F2##b
#define F2
//#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef F2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T1
#define SUB_METHOD_NAME(a, b) a##T1##b
#define T1
#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef T1
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T3
#define SUB_METHOD_NAME(a, b) a##T3##b
#define T3
#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef T3
#undef SUB_METHOD_NAME
__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ,
float* scalingFactors, float4* outputForce, float4 outputTorque[2], float forceFactor){
#ifdef Orig
return calculateElectrostaticPairIxn_kernel( atomI, atomJ, scalingFactors, outputForce, outputTorque);
#else
float force[3];
float energy;
calculateElectrostaticPairIxnF1_kernel( atomI, atomJ, scalingFactors, &energy, force);
outputForce->x = force[0];
outputForce->y = force[1];
outputForce->z = force[2];
outputForce->w = energy;
calculateElectrostaticPairIxnT1_kernel( atomI, atomJ, scalingFactors, force);
outputTorque[0].x = force[0];
outputTorque[0].y = force[1];
outputTorque[0].z = force[2];
calculateElectrostaticPairIxnT3_kernel( atomI, atomJ, scalingFactors, force);
outputTorque[1].x = force[0];
outputTorque[1].y = force[1];
outputTorque[1].z = force[2];
return;
#endif
} }
...@@ -754,7 +658,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo ...@@ -754,7 +658,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
if (gpu->sm_version >= SM_20) if (gpu->sm_version >= SM_20)
maxThreads = 384; //maxThreads = 384;
maxThreads = 512;
else if (gpu->sm_version >= SM_12) else if (gpu->sm_version >= SM_12)
maxThreads = 128; maxThreads = 128;
else else
...@@ -773,52 +678,38 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo ...@@ -773,52 +678,38 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
#endif #endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else { } else {
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
if( addTorqueToForce ){
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
}
if( 0 ){ if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
std::vector<int> fileId;
static int call = 0;
fileId.push_back( call++ );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* temp = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f ); //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f/4.184 ); reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 ); cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
reduceAndCopyCUDAStreamFloat( amoebaGpu->psWorkArray_3_1, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostaticTorque", fileId, outputVector );
delete temp;
}
if( addTorqueToForce ){
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
} }
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
......
...@@ -85,13 +85,10 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -85,13 +85,10 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
// N2 debug array // N2 debug array
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray"); CUDAStream<float4>* debugArray = new CUDAStream<float4>(10*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms); memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload(); debugArray->Upload();
(*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
gpu->psInteractionCount->Upload();
// print intermediate results for the targetAtom // print intermediate results for the targetAtom
unsigned int targetAtom = 3; unsigned int targetAtom = 3;
...@@ -201,6 +198,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -201,6 +198,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "EFields End\n" ); (void) fprintf( amoebaGpu->log, "EFields End\n" );
/*
(void) fprintf( amoebaGpu->log, "DebugQ\n" ); (void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download(); debugArray->Download();
if( 0 ){ if( 0 ){
...@@ -256,23 +254,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -256,23 +254,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
sum[1][1] += debugArray->_pSysData[debugIndex].y; sum[1][1] += debugArray->_pSysData[debugIndex].y;
sum[1][2] += debugArray->_pSysData[debugIndex].z; sum[1][2] += debugArray->_pSysData[debugIndex].z;
/*
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += gpu->natoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
*/
} }
(void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", (void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
sum[0][0], sum[0][1], sum[0][2], sum[0][0], sum[0][1], sum[0][2],
...@@ -301,10 +282,11 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -301,10 +282,11 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
debugArray->_pSysData[ii].w ); debugArray->_pSysData[ii].w );
} }
} }
*/
// write results to file // write results to file
if( 1 ){ if( 0 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
...@@ -314,7 +296,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -314,7 +296,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
} }
delete debugArray; //delete debugArray;
} }
#endif #endif
......
#ifndef AMOEBA_CUDA_KIRKWOOD_PARTICLE_H
#define AMOEBA_CUDA_KIRKWOOD_PARTICLE_H
struct KirkwoodEDiffParticle {
// coordinates charge
float x;
float y;
float z;
float q;
// scaling factor
float thole;
float damp;
// lab frame dipole
float labFrameDipole[3];
// lab frame quadrupole
float labFrameQuadrupole_XX;
float labFrameQuadrupole_XY;
float labFrameQuadrupole_XZ;
float labFrameQuadrupole_YY;
float labFrameQuadrupole_YZ;
float labFrameQuadrupole_ZZ;
// induced dipole and polar counterpart
float inducedDipole[3];
float inducedDipoleP[3];
// solvent induced dipole and polar counterpart
float inducedDipoleS[3];
float inducedDipolePS[3];
// Born radii
float force[3];
// float torque[3];
};
#endif
...@@ -36,55 +36,13 @@ struct KirkwoodParticle { ...@@ -36,55 +36,13 @@ struct KirkwoodParticle {
float bornRadius; float bornRadius;
float force[3]; float force[3];
#ifdef INCLUDE_TORQUE
float torque[3]; float torque[3];
#endif
float dBornRadius; float dBornRadius;
float dBornRadiusPolar; float dBornRadiusPolar;
float padding; // float padding;
};
struct KirkwoodEDiffParticle {
// coordinates charge
float x;
float y;
float z;
float q;
// scaling factor
float thole;
float damp;
// lab frame dipole
float labFrameDipole[3];
// lab frame quadrupole
float labFrameQuadrupole_XX;
float labFrameQuadrupole_XY;
float labFrameQuadrupole_XZ;
float labFrameQuadrupole_YY;
float labFrameQuadrupole_YZ;
float labFrameQuadrupole_ZZ;
// induced dipole and polar counterpart
float inducedDipole[3];
float inducedDipoleP[3];
// solvent induced dipole and polar counterpart
float inducedDipoleS[3];
float inducedDipolePS[3];
// Born radii
float force[3];
float torque[3];
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment