"...reference/src/SimTKReference/ReferenceShakeAlgorithm.h" did not exist on "76e2849ccf0aea4dd118a77e8d7d7e66b1107ab0"
Commit 4a1e9683 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Optimzations and bug fix for mapping of torque to force for small molecules

parent 44bfd168
...@@ -57,6 +57,7 @@ extern "C" OPENMMCUDA_EXPORT void registerKernelFactories() { ...@@ -57,6 +57,7 @@ extern "C" OPENMMCUDA_EXPORT void registerKernelFactories() {
platform.registerKernelFactory(CalcAmoebaGeneralizedKirkwoodForceKernel::Name(), factory); platform.registerKernelFactory(CalcAmoebaGeneralizedKirkwoodForceKernel::Name(), factory);
platform.registerKernelFactory(CalcAmoebaVdwForceKernel::Name(), factory); platform.registerKernelFactory(CalcAmoebaVdwForceKernel::Name(), factory);
platform.registerKernelFactory(CalcAmoebaWcaDispersionForceKernel::Name(), factory); platform.registerKernelFactory(CalcAmoebaWcaDispersionForceKernel::Name(), factory);
platform.registerKernelFactory(CalcAmoebaForcesAndEnergyKernel::Name(), factory);
} }
} }
} }
...@@ -138,5 +139,8 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl ...@@ -138,5 +139,8 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
if (name == CalcAmoebaUreyBradleyForceKernel::Name()) if (name == CalcAmoebaUreyBradleyForceKernel::Name())
return new CudaCalcAmoebaUreyBradleyForceKernel(name, platform, *amoebaCudaData, context.getSystem()); return new CudaCalcAmoebaUreyBradleyForceKernel(name, platform, *amoebaCudaData, context.getSystem());
if (name == CalcAmoebaForcesAndEnergyKernel::Name())
return new CalcAmoebaForcesAndEnergyKernel(name, platform, *amoebaCudaData);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str()); throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
} }
...@@ -44,6 +44,50 @@ extern "C" int gpuSetConstants( gpuContext gpu ); ...@@ -44,6 +44,50 @@ extern "C" int gpuSetConstants( gpuContext gpu );
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
void CalcAmoebaForcesAndEnergyKernel::initialize(const System& system) {
}
void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
//fprintf( stderr, "In CalcAmoebaForcesAndEnergyKernel::beginComputation computeForceCount=%d inbMethod=%d GBSA=%d includeForces=%d includeEnergy=%d\n",
// data.cudaPlatformData.computeForceCount, data.cudaPlatformData.nonbondedMethod, data.getAmoebaGpu()->gpuContext->bIncludeGBSA, includeForces, includeEnergy ); fflush( stderr );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
if (data.cudaPlatformData.nonbondedMethod != NO_CUTOFF && data.cudaPlatformData.computeForceCount%100 == 0){
gpuReorderAtoms(gpu);
}
data.cudaPlatformData.computeForceCount++;
if( gpu->bIncludeGBSA ){
kClearBornSumAndForces(gpu);
} else if (includeForces){
kClearForces(gpu);
}
if (includeEnergy)
kClearEnergy(gpu);
}
double CalcAmoebaForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
//fprintf( stderr, "IN CalcAmoebaForcesAndEnergyKernel::finishComputation\n" ); fflush( stderr );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
if( includeForces ){
kReduceForces(gpu);
}
double energy = 0.0;
if( includeEnergy ){
energy = kReduceEnergy(gpu);
}
return energy;
}
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* Calculates bonded forces * * Calculates bonded forces *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
......
...@@ -33,6 +33,49 @@ ...@@ -33,6 +33,49 @@
namespace OpenMM { namespace OpenMM {
/**
* This kernel is invoked at the beginning and end of force and energy computations. It gives the
* Platform a chance to clear buffers and do other initialization at the beginning, and to do any
* necessary work at the end to determine the final results.
*/
class CalcAmoebaForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
public:
CalcAmoebaForcesAndEnergyKernel(std::string name, const Platform& platform, AmoebaCudaData& data) : CalcForcesAndEnergyKernel(name, platform), data(data) {
}
/***
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
* any ForceImpl.
*
* @param context the context in which to execute this kernel
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
*/
void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy);
/**
* This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
* every ForceImpl.
*
* @param context the context in which to execute this kernel
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @return the potential energy of the system. This value is added to all values returned by ForceImpls'
* calcForcesAndEnergy() methods. That is, each force kernel may <i>either</i> return its contribution to the
* energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
*/
double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy);
private:
AmoebaCudaData& data;
};
/** /**
* This kernel is invoked by AmoebaHarmonicBondForce to calculate the forces acting on the system and the energy of the system. * This kernel is invoked by AmoebaHarmonicBondForce to calculate the forces acting on the system and the energy of the system.
*/ */
...@@ -72,9 +115,9 @@ private: ...@@ -72,9 +115,9 @@ private:
class CudaCalcAmoebaUreyBradleyForceKernel : public CalcAmoebaUreyBradleyForceKernel { class CudaCalcAmoebaUreyBradleyForceKernel : public CalcAmoebaUreyBradleyForceKernel {
public: public:
CudaCalcAmoebaUreyBradleyForceKernel(std::string name, CudaCalcAmoebaUreyBradleyForceKernel(std::string name,
const Platform& platform, const Platform& platform,
AmoebaCudaData& data, AmoebaCudaData& data,
System& system); System& system);
~CudaCalcAmoebaUreyBradleyForceKernel(); ~CudaCalcAmoebaUreyBradleyForceKernel();
/** /**
* Initialize the kernel. * Initialize the kernel.
......
...@@ -358,16 +358,15 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -358,16 +358,15 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesIdsAndAxisType, log );
(void) fprintf( log, " pMultipoleParticlesIdsAndAxisType %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType); (void) fprintf( log, " pMultipoleParticlesIdsAndAxisType %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesIdsAndAxisType);
(void) fprintf( log, " maxTorqueBufferIndex %d\n", amoebaGpu->maxTorqueBufferIndex ); (void) fprintf( log, " maxTorqueBufferIndex %d\n", amoebaGpu->amoebaSim.maxTorqueBufferIndex );
totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log ); totalMemory += gpuPrintCudaStreamInt4( amoebaGpu->psMultipoleParticlesTorqueBufferIndices, log );
int memory = gpuPrintCudaStreamFloat4( amoebaGpu->psTorqueMapForce4, log ); int memory = gpuPrintCudaStreamFloat4( amoebaGpu->psTorqueMapForce4, log );
(void) fprintf( log, " torqueMapForce4Delete %d\n", amoebaGpu->torqueMapForce4Delete );
if( amoebaGpu->torqueMapForce4Delete )totalMemory += memory; if( amoebaGpu->torqueMapForce4Delete )totalMemory += memory;
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
(void) fprintf( log, " psMultipoleParticlesTorqueBufferIndices %p\n", amoebaGpu->amoebaSim.pMultipoleParticlesTorqueBufferIndices);
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularDipole, log );
(void) fprintf( log, " pMolecularDipole %p\n", amoebaGpu->amoebaSim.pMolecularDipole); (void) fprintf( log, " pMolecularDipole %p\n", amoebaGpu->amoebaSim.pMolecularDipole);
totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log ); totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psMolecularQuadrupole, log );
...@@ -1341,7 +1340,6 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu ) ...@@ -1341,7 +1340,6 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )
static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu ) static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu )
{ {
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static const std::string methodName = "gpuFixedEFieldAllocate"; static const std::string methodName = "gpuFixedEFieldAllocate";
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
...@@ -1869,7 +1867,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect ...@@ -1869,7 +1867,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
} }
amoebaGpu->maxTorqueBufferIndex = maxTorqueBufferIndex; amoebaGpu->amoebaSim.maxTorqueBufferIndex = maxTorqueBufferIndex;
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Max axis count=%d\n", maxTorqueBufferIndex ); (void) fprintf( amoebaGpu->log, "Max axis count=%d\n", maxTorqueBufferIndex );
std::string axisLabel[maxAxisType+1] = { "ZThenX", "Bisector", "ZBisect", "ThreeFold", "ZOnly", "NoAxisType", "Unknown"}; std::string axisLabel[maxAxisType+1] = { "ZThenX", "Bisector", "ZBisect", "ThreeFold", "ZOnly", "NoAxisType", "Unknown"};
...@@ -2812,14 +2810,14 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener ...@@ -2812,14 +2810,14 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu, int hasAmoebaGener
// use the Cuda force output buffers for mapping torques onto forces, if max torque buffer count < number of buffers // use the Cuda force output buffers for mapping torques onto forces, if max torque buffer count < number of buffers
if( amoebaGpu->maxTorqueBufferIndex > outputBuffers ){ if( amoebaGpu->amoebaSim.maxTorqueBufferIndex > outputBuffers ){
amoebaGpu->psTorqueMapForce4 = new CUDAStream<float4>(paddedNumberOfAtoms*amoebaGpu->maxTorqueBufferIndex, 1, "torqueMapForce"); amoebaGpu->psTorqueMapForce4 = new CUDAStream<float4>(paddedNumberOfAtoms, amoebaGpu->amoebaSim.maxTorqueBufferIndex, "torqueMapForce");
amoebaGpu->torqueMapForce4Delete = 1; amoebaGpu->torqueMapForce4Delete = 1;
} else { } else {
amoebaGpu->psTorqueMapForce4 = amoebaGpu->gpuContext->psForce4; amoebaGpu->psTorqueMapForce4 = amoebaGpu->gpuContext->psForce4;
amoebaGpu->torqueMapForce4Delete = 0; amoebaGpu->torqueMapForce4Delete = 0;
} }
amoebaGpu->amoebaSim.pTorqueMapForce4 = amoebaGpu->psTorqueMapForce4->_pDevData; amoebaGpu->amoebaSim.pTorqueMapForce4 = amoebaGpu->psTorqueMapForce4->_pDevData;
return; return;
} }
...@@ -3013,16 +3011,19 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu ) ...@@ -3013,16 +3011,19 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#else #else
if( debugOn && amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d cells w/ exclusions\n", //if( debugOn && amoebaGpu->log ){
methodName.c_str(), numWithScalingIndices ); (void) fprintf( amoebaGpu->log, "%s %d cells w/ exclusions out of %d\n",
for (unsigned int ii = 0; ii < cells; ii++) methodName.c_str(), numWithScalingIndices, cells );
{ if( debugOn ){
unsigned int x, y, exclusion; for (unsigned int ii = 0; ii < cells; ii++)
decodeCell( pWorkList[ii], &x, &y, &exclusion ); {
if( exclusion ){ unsigned int x, y, exclusion;
(void) fprintf( amoebaGpu->log, "%6d [%6u %6u] %8u %8u indexInToIndices=%8d\n", ii, x, y, exclusion, pWorkList[ii], decodeCell( pWorkList[ii], &x, &y, &exclusion );
psScalingIndicesIndex->_pSysData[ii] ); if( exclusion ){
(void) fprintf( amoebaGpu->log, "%6d [%6u %6u] %8u %8u indexInToIndices=%8d\n", ii, x, y, exclusion, pWorkList[ii],
psScalingIndicesIndex->_pSysData[ii] );
}
} }
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -4471,3 +4472,32 @@ void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream ...@@ -4471,3 +4472,32 @@ void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream
outputStream->Upload(); outputStream->Upload();
} }
/**----------------------------------------------------------------------------------------
Reduce and copy CUDAStream<float> stream to CUDAStream<float> with reduction
@param streamToCopy float4 stream to copy
@param outputStream output stream
@param conversion conversion factor
--------------------------------------------------------------------------------------- */
void reduceAndCopyCUDAStreamFloat( CUDAStream<float>* streamToCopy, CUDAStream<float>* outputStream, float conversion )
{
streamToCopy->Download();
for( unsigned int ii = 0; ii < streamToCopy->_stride; ii++ ){
outputStream->_pSysData[ii] = streamToCopy->_pSysStream[0][ii];
if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %15.7e %u %u\n", ii, streamToCopy->_pSysStream[0][ii], streamToCopy->_stride, streamToCopy->_subStreams );
for( int jj = 1; jj < streamToCopy->_subStreams; jj++ ){
if( streamToCopy->_pSysStream[jj][ii] != streamToCopy->_pSysStream[jj][ii] ){
(void) fprintf( stderr, "Nan at particle=%d stream=%d\n", ii, jj );
}
outputStream->_pSysData[ii] += streamToCopy->_pSysStream[jj][ii];
if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %d %15.7e %15.7e\n", ii, jj, streamToCopy->_pSysStream[jj][ii], outputStream->_pSysData[ii] );
}
outputStream->_pSysData[ii] *= conversion;
}
outputStream->Upload();
}
...@@ -166,6 +166,7 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int ...@@ -166,6 +166,7 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration); extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy ); extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy );
extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>* outputStream, float conversion ); extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>* outputStream, float conversion );
extern void reduceAndCopyCUDAStreamFloat( CUDAStream<float>* streamToCopy, CUDAStream<float>* outputStream, float conversion );
// PME // PME
......
...@@ -140,6 +140,7 @@ struct cudaAmoebaGmxSimulation { ...@@ -140,6 +140,7 @@ struct cudaAmoebaGmxSimulation {
int4* pMultipoleParticlesIdsAndAxisType; int4* pMultipoleParticlesIdsAndAxisType;
int4* pMultipoleParticlesTorqueBufferIndices; int4* pMultipoleParticlesTorqueBufferIndices;
int maxTorqueBufferIndex;
float4* pTorqueMapForce4; float4* pTorqueMapForce4;
float* pMolecularDipole; float* pMolecularDipole;
......
...@@ -105,7 +105,6 @@ struct _amoebaGpuContext { ...@@ -105,7 +105,6 @@ struct _amoebaGpuContext {
// buffer indices used for mapping torques onto forces // buffer indices used for mapping torques onto forces
int maxTorqueBufferIndex;
int torqueMapForce4Delete; int torqueMapForce4Delete;
CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices; CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
CUDAStream<float4>* psTorqueMapForce4; CUDAStream<float4>* psTorqueMapForce4;
......
...@@ -35,62 +35,7 @@ static int const PScaleIndex = 0; ...@@ -35,62 +35,7 @@ static int const PScaleIndex = 0;
static int const DScaleIndex = 1; static int const DScaleIndex = 1;
static int const UScaleIndex = 2; static int const UScaleIndex = 2;
static int const MScaleIndex = 3; static int const MScaleIndex = 3;
static int const Scale3Index = 4; static int const LastScalingIndex = 4;
static int const Scale5Index = 5;
static int const Scale7Index = 6;
static int const Scale9Index = 7;
static int const Ddsc30Index = 8;
//static int const Ddsc31Index = 9;
//static int const Ddsc32Index = 10;
static int const Ddsc50Index = 11;
//static int const Ddsc51Index = 12;
//static int const Ddsc52Index = 13;
static int const Ddsc70Index = 14;
//static int const Ddsc71Index = 15;
//static int const Ddsc72Index = 16;
static int const LastScalingIndex = 17;
#define DOT3_4(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#define MATRIXDOT31(u,v) u[0]*v[0] + u[1]*v[1] + u[2]*v[2] + \
u[3]*v[3] + u[4]*v[4] + u[5]*v[5] + \
u[6]*v[6] + u[7]*v[7] + u[8]*v[8]
#define DOT31(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#define i35 0.257142857f
#define one 1.0f
__device__ void acrossProductVector3( float* vectorX, float* vectorY, float* vectorZ ){
vectorZ[0] = vectorX[1]*vectorY[2] - vectorX[2]*vectorY[1];
vectorZ[1] = vectorX[2]*vectorY[0] - vectorX[0]*vectorY[2];
vectorZ[2] = vectorX[0]*vectorY[1] - vectorX[1]*vectorY[0];
}
__device__ void amatrixProductVector3( float* matrixX, float* vectorY, float* vectorZ ){
vectorZ[0] = matrixX[0]*vectorY[0] + matrixX[3]*vectorY[1] + matrixX[6]*vectorY[2];
vectorZ[1] = matrixX[1]*vectorY[0] + matrixX[4]*vectorY[1] + matrixX[7]*vectorY[2];
vectorZ[2] = matrixX[2]*vectorY[0] + matrixX[5]*vectorY[1] + matrixX[8]*vectorY[2];
}
__device__ void amatrixCrossProductMatrix3( float* matrixX, float* matrixY, float* vectorZ ){
float* xPtr[3];
float* yPtr[3];
xPtr[0] = matrixX;
xPtr[1] = matrixX + 3;
xPtr[2] = matrixX + 6;
yPtr[0] = matrixY;
yPtr[1] = matrixY + 3;
yPtr[2] = matrixY + 6;
vectorZ[0] = DOT31( xPtr[1], yPtr[2] ) - DOT31( xPtr[2], yPtr[1] );
vectorZ[1] = DOT31( xPtr[2], yPtr[0] ) - DOT31( xPtr[0], yPtr[2] );
vectorZ[2] = DOT31( xPtr[0], yPtr[1] ) - DOT31( xPtr[1], yPtr[0] );
}
struct ElectrostaticParticle { struct ElectrostaticParticle {
...@@ -124,17 +69,26 @@ struct ElectrostaticParticle { ...@@ -124,17 +69,26 @@ struct ElectrostaticParticle {
float force[3]; float force[3];
float torque[3]; //float torque[3];
float padding; //float padding;
}; };
__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ, #ifdef Original
float* scalingFactors, float4* outputForce, float4 outputTorque[2]
#ifdef AMOEBA_DEBUG #define i35 0.257142857f
,float4* debugArray #define DOT3_4(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#endif
){ #define MATRIXDOT31(u,v) u[0]*v[0] + u[1]*v[1] + u[2]*v[2] + \
u[3]*v[3] + u[4]*v[4] + u[5]*v[5] + \
u[6]*v[6] + u[7]*v[7] + u[8]*v[8]
#define DOT31(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))
#define one 1.0f
__device__ void calculateElectrostaticPairIxnOrig_kernel( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ,
float* scalingFactors, float4* outputForce, float4 outputTorque[2]){
float deltaR[3]; float deltaR[3];
...@@ -293,37 +247,6 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato ...@@ -293,37 +247,6 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato
float ei = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2); float ei = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2);
outputForce->w = em+ei; outputForce->w = em+ei;
#ifdef AMOEBA_DEBUG
#if 0
if( 1 ){
int debugIndex = 0;
debugArray[debugIndex].x = em;
debugArray[debugIndex].y = ei;
debugArray[debugIndex].z = rr1;
debugArray[debugIndex].w = rr3;
debugIndex++;
debugArray[debugIndex].x = gl0;
debugArray[debugIndex].y = gl1;
debugArray[debugIndex].z = gl6;
debugArray[debugIndex].w = gl2;
debugIndex++;
debugArray[debugIndex].x = gli1;
debugArray[debugIndex].y = gli3;
debugArray[debugIndex].z = gli2;
debugArray[debugIndex].w = gli7;
debugIndex++;
debugArray[debugIndex].x = psc0;
debugArray[debugIndex].y = psc1;
debugArray[debugIndex].z = psc2;
debugArray[debugIndex].w = scalingFactors[MScaleIndex];
}
#endif
#endif
float temp1[3],temp2[3],temp3[3]; float temp1[3],temp2[3],temp3[3];
float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3]; float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3];
amatrixProductVector3( atomI.labFrameQuadrupole, atomJ.labFrameDipole, qIdJ );//MK amatrixProductVector3( atomI.labFrameQuadrupole, atomJ.labFrameDipole, qIdJ );//MK
...@@ -528,99 +451,6 @@ if( 1 ){ ...@@ -528,99 +451,6 @@ if( 1 ){
} }
#ifdef AMOEBA_DEBUG
if( 0 ){
int debugIndex = 0;
debugArray[debugIndex].x = scalingFactors[DScaleIndex];
debugArray[debugIndex].y = scalingFactors[PScaleIndex];
debugArray[debugIndex].z = scalingFactors[MScaleIndex];
debugArray[debugIndex].w = scalingFactors[UScaleIndex];
debugIndex++;
debugArray[debugIndex].x = ftm2i_0 + (fridmp_0 + findmp_0);
debugArray[debugIndex].y = ftm2i_1 + (fridmp_1 + findmp_1);
debugArray[debugIndex].z = ftm2i_2 + (fridmp_2 + findmp_2);
debugArray[debugIndex].w = 1.5;
/*
debugIndex++;
debugArray[debugIndex].x = temp2[0];
debugArray[debugIndex].y = temp2[1];
debugArray[debugIndex].z = temp2[2];
debugArray[debugIndex].w = 2.0f;
debugIndex++;
debugArray[debugIndex].x = temp3[0];
debugArray[debugIndex].y = temp3[1];
debugArray[debugIndex].z = temp3[2];
debugArray[debugIndex].w = 3.0f;
debugIndex++;
debugArray[debugIndex].x = temp4[0];
debugArray[debugIndex].y = temp4[1];
debugArray[debugIndex].z = temp4[2];
debugArray[debugIndex].w = 4.0f;
debugIndex++;
debugArray[debugIndex].x = temp5[0];
debugArray[debugIndex].y = temp5[1];
debugArray[debugIndex].z = temp5[2];
debugArray[debugIndex].w = 5.0f;
debugIndex++;
debugArray[debugIndex].x = temp6[0];
debugArray[debugIndex].y = temp6[1];
debugArray[debugIndex].z = temp6[2];
debugArray[debugIndex].w = 6.0f;
debugIndex++;
debugArray[debugIndex].x = temp14[0];
debugArray[debugIndex].y = temp14[1];
debugArray[debugIndex].z = temp14[2];
debugArray[debugIndex].w = 14.0f;
debugIndex++;
debugArray[debugIndex].x = temp7[0];
debugArray[debugIndex].y = temp7[1];
debugArray[debugIndex].z = temp7[2];
debugArray[debugIndex].w = 7.0f;
debugIndex++;
debugArray[debugIndex].x = temp8[0];
debugArray[debugIndex].y = temp8[1];
debugArray[debugIndex].z = temp8[2];
debugArray[debugIndex].w = 8.0f;
debugIndex++;
debugArray[debugIndex].x = rr3;
debugArray[debugIndex].y = gf3;
debugArray[debugIndex].z = gf6;
debugArray[debugIndex].w = 20.0f;
debugIndex++;
debugArray[debugIndex].x = gf4;
debugArray[debugIndex].y = gf7;
debugArray[debugIndex].z = 0.0f;
debugArray[debugIndex].w = 21.0f;
debugIndex++;
debugArray[debugIndex].x = atomJ.labFrameDipole[0];
debugArray[debugIndex].y = atomJ.labFrameDipole[1];
debugArray[debugIndex].z = atomJ.labFrameDipole[2];
debugArray[debugIndex].w = 22.0f;
debugIndex++;
debugArray[debugIndex].x = deltaR[0];
debugArray[debugIndex].y = deltaR[1];
debugArray[debugIndex].z = deltaR[2];
debugArray[debugIndex].w = 23.0f;
*/
}
#endif
outputForce->x = -(ftm2_0 + ftm2i_0); outputForce->x = -(ftm2_0 + ftm2i_0);
outputForce->y = -(ftm2_1 + ftm2i_1); outputForce->y = -(ftm2_1 + ftm2i_1);
outputForce->z = -(ftm2_2 + ftm2i_2); outputForce->z = -(ftm2_2 + ftm2i_2);
...@@ -636,50 +466,124 @@ int debugIndex = 0; ...@@ -636,50 +466,124 @@ int debugIndex = 0;
return; return;
} }
#endif
static __device__ void loadElectrostaticParticle( struct ElectrostaticParticle* sA, unsigned int atomI ){
__device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsigned int atomI,
float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
{
// coordinates & charge // coordinates & charge
sA->x = atomCoord[atomI].x; sA->x = cSim.pPosq[atomI].x;
sA->y = atomCoord[atomI].y; sA->y = cSim.pPosq[atomI].y;
sA->z = atomCoord[atomI].z; sA->z = cSim.pPosq[atomI].z;
sA->q = atomCoord[atomI].w; sA->q = cSim.pPosq[atomI].w;
// lab dipole // lab dipole
sA->labFrameDipole[0] = labFrameDipoleJ[atomI*3]; sA->labFrameDipole[0] = cAmoebaSim.pLabFrameDipole[atomI*3];
sA->labFrameDipole[1] = labFrameDipoleJ[atomI*3+1]; sA->labFrameDipole[1] = cAmoebaSim.pLabFrameDipole[atomI*3+1];
sA->labFrameDipole[2] = labFrameDipoleJ[atomI*3+2]; sA->labFrameDipole[2] = cAmoebaSim.pLabFrameDipole[atomI*3+2];
// lab quadrupole // lab quadrupole
sA->labFrameQuadrupole[0] = labQuadrupole[atomI*9]; sA->labFrameQuadrupole[0] = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
sA->labFrameQuadrupole[1] = labQuadrupole[atomI*9+1]; sA->labFrameQuadrupole[1] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
sA->labFrameQuadrupole[2] = labQuadrupole[atomI*9+2]; sA->labFrameQuadrupole[2] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
sA->labFrameQuadrupole[3] = labQuadrupole[atomI*9+3]; sA->labFrameQuadrupole[3] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
sA->labFrameQuadrupole[4] = labQuadrupole[atomI*9+4]; sA->labFrameQuadrupole[4] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
sA->labFrameQuadrupole[5] = labQuadrupole[atomI*9+5]; sA->labFrameQuadrupole[5] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
sA->labFrameQuadrupole[6] = labQuadrupole[atomI*9+6]; sA->labFrameQuadrupole[6] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
sA->labFrameQuadrupole[7] = labQuadrupole[atomI*9+7]; sA->labFrameQuadrupole[7] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
sA->labFrameQuadrupole[8] = labQuadrupole[atomI*9+8]; sA->labFrameQuadrupole[8] = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
// induced dipole // induced dipole
sA->inducedDipole[0] = inducedDipole[atomI*3]; sA->inducedDipole[0] = cAmoebaSim.pInducedDipole[atomI*3];
sA->inducedDipole[1] = inducedDipole[atomI*3+1]; sA->inducedDipole[1] = cAmoebaSim.pInducedDipole[atomI*3+1];
sA->inducedDipole[2] = inducedDipole[atomI*3+2]; sA->inducedDipole[2] = cAmoebaSim.pInducedDipole[atomI*3+2];
// induced dipole polar // induced dipole polar
sA->inducedDipoleP[0] = inducedDipolePolar[atomI*3]; sA->inducedDipoleP[0] = cAmoebaSim.pInducedDipolePolar[atomI*3];
sA->inducedDipoleP[1] = inducedDipolePolar[atomI*3+1]; sA->inducedDipoleP[1] = cAmoebaSim.pInducedDipolePolar[atomI*3+1];
sA->inducedDipoleP[2] = inducedDipolePolar[atomI*3+2]; sA->inducedDipoleP[2] = cAmoebaSim.pInducedDipolePolar[atomI*3+2];
sA->damp = cAmoebaSim.pDampingFactorAndThole[atomI].x;
sA->thole = cAmoebaSim.pDampingFactorAndThole[atomI].y;
}
static __device__ void zeroElectrostaticParticle( struct ElectrostaticParticle* sA ){
sA->damp = dampingFactorAndThole[atomI].x; // coordinates & charge
sA->thole = dampingFactorAndThole[atomI].y;
sA->force[0] = 0.0f;
sA->force[1] = 0.0f;
sA->force[2] = 0.0f;
/*
sA->torque[0] = 0.0f;
sA->torque[1] = 0.0f;
sA->torque[2] = 0.0f;
*/
}
#undef SUB_METHOD_NAME
#undef F1
#define SUB_METHOD_NAME(a, b) a##F1##b
#define F1
#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef F1
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef F2
#define SUB_METHOD_NAME(a, b) a##F2##b
#define F2
//#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef F2
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T1
#define SUB_METHOD_NAME(a, b) a##T1##b
#define T1
#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef T1
#undef SUB_METHOD_NAME
#undef SUB_METHOD_NAME
#undef T3
#define SUB_METHOD_NAME(a, b) a##T3##b
#define T3
#include "kCalculateAmoebaCudaElectrostatic_b.h"
#undef T3
#undef SUB_METHOD_NAME
__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI, ElectrostaticParticle& atomJ,
float* scalingFactors, float4* outputForce, float4 outputTorque[2], float forceFactor){
#ifdef Orig
return calculateElectrostaticPairIxn_kernel( atomI, atomJ, scalingFactors, outputForce, outputTorque);
#else
float force[3];
float energy;
calculateElectrostaticPairIxnF1_kernel( atomI, atomJ, scalingFactors, &energy, force);
outputForce->x = force[0];
outputForce->y = force[1];
outputForce->z = force[2];
outputForce->w = energy;
calculateElectrostaticPairIxnT1_kernel( atomI, atomJ, scalingFactors, force);
outputTorque[0].x = force[0];
outputTorque[0].y = force[1];
outputTorque[0].z = force[2];
calculateElectrostaticPairIxnT3_kernel( atomI, atomJ, scalingFactors, force);
outputTorque[1].x = force[0];
outputTorque[1].y = force[1];
outputTorque[1].z = force[2];
return;
#endif
} }
...@@ -754,7 +658,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo ...@@ -754,7 +658,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
if (gpu->sm_version >= SM_20) if (gpu->sm_version >= SM_20)
maxThreads = 384; //maxThreads = 384;
maxThreads = 512;
else if (gpu->sm_version >= SM_12) else if (gpu->sm_version >= SM_12)
maxThreads = 128; maxThreads = 128;
else else
...@@ -773,53 +678,39 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo ...@@ -773,53 +678,39 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
#endif #endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else { } else {
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
if( 0 ){
VectorOfDoubleVectors outputVector;
std::vector<int> fileId;
static int call = 0;
fileId.push_back( call++ );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* temp = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
reduceAndCopyCUDAStreamFloat( amoebaGpu->psWorkArray_3_1, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostaticTorque", fileId, outputVector );
delete temp;
}
if( addTorqueToForce ){ if( addTorqueToForce ){
kReduceTorque( amoebaGpu ); kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque ); cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
} }
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f/4.184 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
}
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
} }
...@@ -85,13 +85,10 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -85,13 +85,10 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
// N2 debug array // N2 debug array
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray"); CUDAStream<float4>* debugArray = new CUDAStream<float4>(10*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms); memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload(); debugArray->Upload();
(*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
gpu->psInteractionCount->Upload();
// print intermediate results for the targetAtom // print intermediate results for the targetAtom
unsigned int targetAtom = 3; unsigned int targetAtom = 3;
...@@ -201,6 +198,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -201,6 +198,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "EFields End\n" ); (void) fprintf( amoebaGpu->log, "EFields End\n" );
/*
(void) fprintf( amoebaGpu->log, "DebugQ\n" ); (void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download(); debugArray->Download();
if( 0 ){ if( 0 ){
...@@ -256,23 +254,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -256,23 +254,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
sum[1][1] += debugArray->_pSysData[debugIndex].y; sum[1][1] += debugArray->_pSysData[debugIndex].y;
sum[1][2] += debugArray->_pSysData[debugIndex].z; sum[1][2] += debugArray->_pSysData[debugIndex].z;
/*
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += amoebaGpu->paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"atmJ[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z );
debugIndex += gpu->natoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
*/
} }
(void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", (void) fprintf( amoebaGpu->log,"SumQ [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
sum[0][0], sum[0][1], sum[0][2], sum[0][0], sum[0][1], sum[0][2],
...@@ -301,10 +282,11 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -301,10 +282,11 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
debugArray->_pSysData[ii].w ); debugArray->_pSysData[ii].w );
} }
} }
*/
// write results to file // write results to file
if( 1 ){ if( 0 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
...@@ -314,7 +296,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -314,7 +296,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
} }
delete debugArray; //delete debugArray;
} }
#endif #endif
......
#ifndef AMOEBA_CUDA_KIRKWOOD_PARTICLE_H
#define AMOEBA_CUDA_KIRKWOOD_PARTICLE_H
struct KirkwoodEDiffParticle {
// coordinates charge
float x;
float y;
float z;
float q;
// scaling factor
float thole;
float damp;
// lab frame dipole
float labFrameDipole[3];
// lab frame quadrupole
float labFrameQuadrupole_XX;
float labFrameQuadrupole_XY;
float labFrameQuadrupole_XZ;
float labFrameQuadrupole_YY;
float labFrameQuadrupole_YZ;
float labFrameQuadrupole_ZZ;
// induced dipole and polar counterpart
float inducedDipole[3];
float inducedDipoleP[3];
// solvent induced dipole and polar counterpart
float inducedDipoleS[3];
float inducedDipolePS[3];
// Born radii
float force[3];
// float torque[3];
};
#endif
...@@ -36,55 +36,13 @@ struct KirkwoodParticle { ...@@ -36,55 +36,13 @@ struct KirkwoodParticle {
float bornRadius; float bornRadius;
float force[3]; float force[3];
#ifdef INCLUDE_TORQUE
float torque[3]; float torque[3];
#endif
float dBornRadius; float dBornRadius;
float dBornRadiusPolar; float dBornRadiusPolar;
float padding; // float padding;
};
struct KirkwoodEDiffParticle {
// coordinates charge
float x;
float y;
float z;
float q;
// scaling factor
float thole;
float damp;
// lab frame dipole
float labFrameDipole[3];
// lab frame quadrupole
float labFrameQuadrupole_XX;
float labFrameQuadrupole_XY;
float labFrameQuadrupole_XZ;
float labFrameQuadrupole_YY;
float labFrameQuadrupole_YZ;
float labFrameQuadrupole_ZZ;
// induced dipole and polar counterpart
float inducedDipole[3];
float inducedDipoleP[3];
// solvent induced dipole and polar counterpart
float inducedDipoleS[3];
float inducedDipolePS[3];
// Born radii
float force[3];
float torque[3];
}; };
......
...@@ -67,6 +67,80 @@ __device__ static void loadMappedTorque( int particleId, int bufferIndex, float* ...@@ -67,6 +67,80 @@ __device__ static void loadMappedTorque( int particleId, int bufferIndex, float*
} }
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void amoebaAddMapTorqueForceToForce_kernel( void )
{
// ---------------------------------------------------------------------------------------
int pos = blockIdx.x*blockDim.x + threadIdx.x;
// ---------------------------------------------------------------------------------------
while (pos < cSim.stride4 )
{
float totalForce = 0.0f;
float* pFt = (float*)cAmoebaSim.pTorqueMapForce4 + pos;
int i = cAmoebaSim.maxTorqueBufferIndex;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride4;
float f2 = *pFt;
pFt += cSim.stride4;
float f3 = *pFt;
pFt += cSim.stride4;
float f4 = *pFt;
pFt += cSim.stride4;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride4;
float f2 = *pFt;
pFt += cSim.stride4;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
pFt = (float*)cSim.pForce4 + pos;
*pFt += totalForce;
pos += gridDim.x * blockDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void amoebaClearMapTorqueForce_kernel( void )
{
int pos = blockIdx.x*blockDim.x + threadIdx.x;
while (pos < cSim.stride4*cAmoebaSim.maxTorqueBufferIndex )
{
cAmoebaSim.pTorqueMapForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x;
}
}
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
...@@ -359,7 +433,77 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt ...@@ -359,7 +433,77 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
{ {
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
if( amoebaGpu->amoebaSim.maxTorqueBufferIndex > amoebaGpu->gpuContext->sim.outputBuffers && amoebaGpu->psTorqueMapForce4 != amoebaGpu->gpuContext->psForce4 && amoebaGpu->psTorqueMapForce4 ){
amoebaClearMapTorqueForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( );
LAUNCHERROR("amoebaClearMapTorqueForce");
}
if( 0 ){
VectorOfDoubleVectors outputVector;
std::vector<int> fileId;
static int call = 0;
fileId.push_back( call++ );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* temp = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
reduceAndCopyCUDAStreamFloat( psTorque, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
reduceAndCopyCUDAStreamFloat4( amoebaGpu->psTorqueMapForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostatiPreTorqueForce", fileId, outputVector );
delete temp;
}
amoebaMapTorqueToForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( psTorque->_pDevData ); amoebaMapTorqueToForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( psTorque->_pDevData );
LAUNCHERROR("amoebaMapTorqueToForce"); LAUNCHERROR("amoebaMapTorqueToForce");
if( amoebaGpu->amoebaSim.maxTorqueBufferIndex > amoebaGpu->gpuContext->sim.outputBuffers && amoebaGpu->psTorqueMapForce4 != amoebaGpu->gpuContext->psForce4 && amoebaGpu->psTorqueMapForce4 ){
amoebaAddMapTorqueForceToForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( );
LAUNCHERROR("amoebaAddMapTorqueForceToForce");
}
#ifdef AMOEBA_DEBUG
if( 0 ){
VectorOfDoubleVectors outputVector;
std::vector<int> fileId;
static int call = 0;
fileId.push_back( call++ );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* temp = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "Temp1");
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
reduceAndCopyCUDAStreamFloat4( amoebaGpu->psTorqueMapForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
for( int pId = 0; pId < 5; pId++ ){
float sum[3] = { 0.0f, 0.0f, 0.0f };
(void) fprintf( stderr, "\n\nTorqueForceToForce for part=%d\n", pId );
for( int ii = 0; ii < amoebaGpu->amoebaSim.maxTorqueBufferIndex; ii++ ){
(void) fprintf( stderr, "%4d [%15.7e %15.7e %15.7e]\n", ii,
amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x,
amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y,
amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z );
sum[0] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x;
sum[1] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y;
sum[2] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z;
}
(void) fprintf( stderr, "TorqueForceToForce for partcle=%d [%15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e]\n", pId, sum[0], sum[1], sum[2], sum[0]/4.184f, sum[1]/4.184f, sum[2]/4.184f );
}
cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostatiPostTorqueForce", fileId, outputVector );
delete temp;
}
#endif
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment