Commit 7eaa5d29 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

WCA force was overwriting total force vector

Modified out-of-plane bend force to improve stability
Redid arguments to kCalculateAmoebaCudaKirkwood to reduce lmem
parent 98b09e6f
......@@ -500,6 +500,7 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
// multipoles
kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() );
//kClearForces(gpu->gpuContext);
//kClearEnergy(gpu->gpuContext);
//(void) fprintf( data.getLog(), "computeAmoebaMultipoleForce clearing forces/energy after kCalculateAmoebaMultipoleForces()\n" );
......
......@@ -1197,7 +1197,10 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )
// output
amoebaGpu->psLabFrameDipole = new CUDAStream<float>(3*amoebaGpu->paddedNumberOfAtoms, 1, "LabFrameDipole");
amoebaGpu->amoebaSim.pLabFrameDipole = amoebaGpu->psLabFrameDipole->_pDevStream[0];
amoebaGpu->psLabFrameQuadrupole = new CUDAStream<float>(9*amoebaGpu->paddedNumberOfAtoms, 1, "LabFrameQuadrupole");
amoebaGpu->amoebaSim.pLabFrameQuadrupole = amoebaGpu->psLabFrameQuadrupole->_pDevStream[0];
memset( amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0], 0, sizeof(int)*4*amoebaGpu->paddedNumberOfAtoms );
}
......@@ -1275,7 +1278,11 @@ void gpuMutualInducedFieldAllocate( amoebaGpuContext amoebaGpu )
#endif
amoebaGpu->psInducedDipole = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipole");
amoebaGpu->amoebaSim.pInducedDipole = amoebaGpu->psInducedDipole->_pDevStream[0];
amoebaGpu->psInducedDipolePolar = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolar");
amoebaGpu->amoebaSim.pInducedDipolePolar = amoebaGpu->psInducedDipolePolar->_pDevStream[0];
amoebaGpu->psCurrentEpsilon = new CUDAStream<float>(5, 1, "CurrentEpsilon");
amoebaGpu->epsilonThreadsPerBlock = 384;
......@@ -1326,19 +1333,15 @@ void gpuElectrostaticAllocate( amoebaGpuContext amoebaGpu )
}
#endif
amoebaGpu->psForce = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ForceuElectrostatic");
amoebaGpu->psTorque = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ForceTorque");
amoebaGpu->psForce = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "ElectrostaticForce");
amoebaGpu->psTorque = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Torque");
unsigned int offset = 3*paddedNumberOfAtoms*sizeof( float );
memset( amoebaGpu->psForce->_pSysStream[0], 0, offset );
memset( amoebaGpu->psTorque->_pSysStream[0], 0, offset );
offset = paddedNumberOfAtoms*sizeof( float );
// memset( amoebaGpu->psEnergy->_pSysStream[0], 0, offset );
amoebaGpu->psForce->Download();
amoebaGpu->psTorque->Download();
// amoebaGpu->psEnergy->Download();
}
......@@ -1363,22 +1366,27 @@ void gpuKirkwoodAllocate( amoebaGpuContext amoebaGpu )
return;
}
int paddedNumberOfAtoms = amoebaGpu->paddedNumberOfAtoms;
int paddedNumberOfAtoms = amoebaGpu->paddedNumberOfAtoms;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms=%d\n", methodName.c_str(), paddedNumberOfAtoms );
(void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms =%d\n", methodName.c_str(), paddedNumberOfAtoms );
(void) fflush( amoebaGpu->log );
}
#endif
amoebaGpu->psBorn = new CUDAStream<float>(paddedNumberOfAtoms, 1, "KirkwoodBorn");
amoebaGpu->psBornPolar = new CUDAStream<float>(paddedNumberOfAtoms, 1, "KirkwoodBornPolar");
amoebaGpu->psGk_Field = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Gk_Fixed_Field");
amoebaGpu->psInducedDipoleS = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipoleS");
amoebaGpu->psInducedDipolePolarS = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolarS");
amoebaGpu->psKirkwoodForce = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodForce");
amoebaGpu->psKirkwoodEDiffForce = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodEDiffForce");
amoebaGpu->psBorn = new CUDAStream<float>(paddedNumberOfAtoms, 1, "KirkwoodBorn");
amoebaGpu->psBornPolar = new CUDAStream<float>(paddedNumberOfAtoms, 1, "KirkwoodBornPolar");
amoebaGpu->psGk_Field = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Gk_Fixed_Field");
amoebaGpu->psInducedDipoleS = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipoleS");
amoebaGpu->amoebaSim.pInducedDipoleS = amoebaGpu->psInducedDipoleS->_pDevStream[0];
amoebaGpu->psInducedDipolePolarS = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "InducedDipolePolarS");
amoebaGpu->amoebaSim.pInducedDipolePolarS = amoebaGpu->psInducedDipolePolarS->_pDevStream[0];
amoebaGpu->psKirkwoodForce = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodForce");
amoebaGpu->psKirkwoodEDiffForce = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "KirkwoodEDiffForce");
unsigned int offset = paddedNumberOfAtoms*sizeof( float );
memset( amoebaGpu->psBorn->_pSysStream[0], 0, offset );
......@@ -2685,7 +2693,10 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
(void) fflush( amoebaGpu->log );
}
amoebaGpu->psWorkArray_3_1 = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_1");
amoebaGpu->amoebaSim.pWorkArray_3_1 = amoebaGpu->psWorkArray_3_1->_pDevStream[0];
amoebaGpu->psWorkArray_3_2 = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_2");
amoebaGpu->amoebaSim.pWorkArray_3_2 = amoebaGpu->psWorkArray_3_2->_pDevStream[0];
// used GK
amoebaGpu->psWorkArray_3_3 = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_3");
......@@ -2694,7 +2705,10 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
amoebaGpu->psWorkArray_3_6 = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_6");
amoebaGpu->psWorkArray_1_1 = new CUDAStream<float>( paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_1_1");
amoebaGpu->amoebaSim.pWorkArray_1_1 = amoebaGpu->psWorkArray_1_1->_pDevStream[0];
amoebaGpu->psWorkArray_1_2 = new CUDAStream<float>( paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_1_2");
amoebaGpu->amoebaSim.pWorkArray_1_2 = amoebaGpu->psWorkArray_1_2->_pDevStream[0];
amoebaGpu->psEnergy = new CUDAStream<float>(amoebaGpu->energyOutputBuffers, 1, "AmoebaEnergy");
......
......@@ -41,231 +41,6 @@
#include <builtin_types.h>
#include <vector_functions.h>
#if 0
#define RTERROR(status, s) \
if (status != cudaSuccess) { \
printf("%s %s\n", s, cudaGetErrorString(status)); \
exit(-1); \
}
#define LAUNCHERROR(s) \
{ \
cudaError_t status = cudaGetLastError(); \
if (status != cudaSuccess) { \
printf("Error: %s launching kernel %s\n", cudaGetErrorString(status), s); \
exit(-1); \
} \
}
#endif
#if 0
// Pure virtual class to define an interface for objects resident both on GPU and CPU
struct SoADeviceObject {
virtual void Allocate() = 0;
virtual void Deallocate() = 0;
virtual void Upload() = 0;
virtual void Download() = 0;
};
template <typename T>
struct CUDAStream : public SoADeviceObject
{
unsigned int _length;
unsigned int _subStreams;
unsigned int _stride;
T** _pSysStream;
T** _pDevStream;
T* _pSysData;
T* _pDevData;
std::string _name;
CUDAStream(int length, int subStreams = 1, std::string name="");
CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
virtual ~CUDAStream();
void Allocate();
void Deallocate();
void Upload();
void Download();
void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
T& operator[](int index);
};
float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
template <typename T>
CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{
Allocate();
}
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{
Allocate();
}
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{
Allocate();
}
template <typename T>
CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
{
Allocate();
}
template <typename T>
CUDAStream<T>::~CUDAStream()
{
Deallocate();
}
template <typename T>
void CUDAStream<T>::Allocate()
{
cudaError_t status;
_pSysStream = new T*[_subStreams];
_pDevStream = new T*[_subStreams];
_pSysData = new T[_subStreams * _stride];
status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());
for (unsigned int i = 0; i < _subStreams; i++)
{
_pSysStream[i] = _pSysData + i * _stride;
_pDevStream[i] = _pDevData + i * _stride;
}
}
template <typename T>
void CUDAStream<T>::Deallocate()
{
cudaError_t status;
delete[] _pSysStream;
_pSysStream = NULL;
delete[] _pDevStream;
_pDevStream = NULL;
delete[] _pSysData;
_pSysData = NULL;
status = cudaFree(_pDevData);
RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
}
template <typename T>
void CUDAStream<T>::Upload()
{
cudaError_t status;
status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
}
template <typename T>
void CUDAStream<T>::Download()
{
cudaError_t status;
status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
}
template <typename T>
void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
{
T* pTemp = new T[_subStreams * _stride];
unsigned int stream = 0;
unsigned int pos = 0;
unsigned int newstride = _stride * _subStreams / newstreams;
unsigned int newlength = _length * _subStreams / newstreams;
// Copy data into new format
for (unsigned int i = 0; i < _length; i++)
{
for (unsigned int j = 0; j < _subStreams; j++)
{
pTemp[stream * newstride + pos] = _pSysStream[j][i];
stream++;
if (stream == newstreams)
{
stream = 0;
pos++;
}
}
}
// Remap stream pointers;
for (unsigned int i = 0; i < newstreams; i++)
{
_pSysStream[i] = _pSysData + i * newstride;
_pDevStream[i] = _pDevData + i * newstride;
}
// Copy data back intro original stream
for (unsigned int i = 0; i < newlength; i++)
for (unsigned int j = 0; j < newstreams; j++)
_pSysStream[j][i] = pTemp[j * newstride + i];
_stride = newstride;
_length = newlength;
_subStreams = newstreams;
delete[] pTemp;
}
template <typename T>
T& CUDAStream<T>::operator[](int index)
{
return _pSysData[index];
}
static const unsigned int GRID = 32;
static const unsigned int GRIDBITS = 5;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320;
static const int G8X_SHAKE_THREADS_PER_BLOCK = 128;
static const int GT2XX_SHAKE_THREADS_PER_BLOCK = 256;
static const int G8X_UPDATE_THREADS_PER_BLOCK = 192;
static const int GT2XX_UPDATE_THREADS_PER_BLOCK = 384;
static const int G8X_LOCALFORCES_THREADS_PER_BLOCK = 192;
static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK = 384;
static const int G8X_THREADS_PER_BLOCK = 256;
static const int GT2XX_THREADS_PER_BLOCK = 256;
static const int G8X_RANDOM_THREADS_PER_BLOCK = 256;
static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const unsigned int MAX_STACK_SIZE = 8;
static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
static const float PI = 3.14159265358979323846f;
static const int PME_ORDER = 4;
enum CudaNonbondedMethod
{
NO_CUTOFF,
CUTOFF,
PERIODIC,
EWALD,
PARTICLE_MESH_EWALD
};
enum ExpressionOp {
CONSTANT = 0, VARIABLE0, VARIABLE1, VARIABLE2, VARIABLE3, VARIABLE4, VARIABLE5, VARIABLE6, VARIABLE7, VARIABLE8, GLOBAL, CUSTOM, CUSTOM_DERIV, ADD, SUBTRACT, MULTIPLY, DIVIDE,
POWER, NEGATE, SQRT, EXP, LOG, SIN, COS, SEC, CSC, TAN, COT, ASIN, ACOS, ATAN, SQUARE, CUBE, RECIPROCAL, ADD_CONSTANT, MULTIPLY_CONSTANT, POWER_CONSTANT
};
template<int SIZE>
struct Expression {
int op[SIZE];
float arg[SIZE];
int length, stackSize;
};
#endif
struct cudaAmoebaGmxSimulation {
// Constants
......@@ -344,6 +119,19 @@ struct cudaAmoebaGmxSimulation {
float scalingDistanceCutoff; // scaling cutoff
float2* pDampingFactorAndThole; // Thole & damping factors
float* pLabFrameDipole;
float* pLabFrameQuadrupole;
float* pInducedDipole;
float* pInducedDipolePolar;
float* pInducedDipoleS;
float* pInducedDipolePolarS;
float* pWorkArray_3_1;
float* pWorkArray_3_2;
float* pWorkArray_1_1;
float* pWorkArray_1_2;
unsigned int amoebaVdwNonReductions;
int* pAmoebaVdwNonReductionID;
......
......@@ -867,11 +867,11 @@ static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
LAUNCHERROR("kReduceForceTorque1");
LAUNCHERROR("kReduceElectrostaticForce");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
LAUNCHERROR("kReduceForceTorque2");
LAUNCHERROR("kReduceElectrostaticTorque");
}
#ifdef AMOEBA_DEBUG
......@@ -1137,10 +1137,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector);
(void) fprintf( amoebaGpu->log, "%s calling cudaWriteVectorOfDoubleVectorsToFile \n", methodName ); fflush( amoebaGpu->log );
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
(void) fprintf( amoebaGpu->log, "%s called cudaWriteVectorOfDoubleVectorsToFile \n", methodName ); fflush( amoebaGpu->log );
}
}
......
......@@ -2386,21 +2386,11 @@ threadsPerBlock = 32;
#endif
kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0],
amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
amoebaGpu->psInducedDipoleS->_pDevStream[0],
amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
gpu->psBornRadii->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
amoebaGpu->psWorkArray_1_1->_pDevStream[0],
amoebaGpu->psWorkUnit->_pDevStream[0]
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_1_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
, debugArray->_pDevStream[0], targetAtom );
#else
amoebaGpu->psWorkArray_1_2->_pDevStream[0] );
);
#endif
}
......
......@@ -37,18 +37,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
*/
void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
unsigned int* workUnit,
float4* atomCoord,
float* labFrameDipole,
float* labFrameQuadrupole,
float* inducedDipole,
float* inducedDipolePolar,
float* bornRadii,
float* outputForce,
float* outputTorque,
float* output_dBornRadius,
float* output_dBornRadiusPolar
unsigned int* workUnit
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
......@@ -67,6 +56,12 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
// pWorkArray_3_1 == force
// pWorkArray_3_2 == torque
// pWorkArray_1_1 == dBorn
// pWorkArray_1_2 == dBornPolar
float4 jCoord;
float jDipole[3];
float jQuadrupole[9];
......@@ -94,7 +89,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
KirkwoodParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx;
float4 iCoord = atomCoord[atomI];
float4 iCoord = cSim.pPosq[atomI];
float forceSum[3];
float torqueSum[3];
......@@ -120,8 +115,8 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
// load shared data
loadKirkwoodShared( &(sA[threadIdx.x]), atomI,
atomCoord, labFrameDipole, labFrameQuadrupole,
inducedDipole, inducedDipolePolar, bornRadii );
cSim.pPosq, cAmoebaSim.pLabFrameDipole, cAmoebaSim.pLabFrameQuadrupole,
cAmoebaSim.pInducedDipoleS, cAmoebaSim.pInducedDipolePolarS, cSim.pBornRadii );
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
......@@ -145,11 +140,11 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
calculateKirkwoodPairIxn_kernel( sameAtom,
iCoord, jCoord,
&(labFrameDipole[3*atomI]), jDipole,
&(labFrameQuadrupole[9*atomI]), jQuadrupole,
&(inducedDipole[3*atomI]), jInducedDipole,
&(inducedDipolePolar[3*atomI]), jInducedDipolePolar,
bornRadii[atomI], jBornRadius,
&(cAmoebaSim.pLabFrameDipole[3*atomI]), jDipole,
&(cAmoebaSim.pLabFrameQuadrupole[9*atomI]), jQuadrupole,
&(cAmoebaSim.pInducedDipoleS[3*atomI]), jInducedDipole,
&(cAmoebaSim.pInducedDipolePolarS[3*atomI]),jInducedDipolePolar,
cSim.pBornRadii[atomI], jBornRadius,
force, torque, dBorn, dBornPolar, &energy
#ifdef AMOEBA_DEBUG
, pullBack
......@@ -188,7 +183,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) (mask + 1);
//debugArray[index].z = bornRadii[atomI];
//debugArray[index].z = cSim.pBornRadii[atomI];
//debugArray[index].z = energy;
//debugArray[index].w = (float) (blockIdx.x*blockDim.x+threadIdx.x);
debugArray[index].w = jBornRadius;
......@@ -234,30 +229,30 @@ if( atomI == targetAtom || atomJ == targetAtom ){
float of;
unsigned int offset = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
of = output_dBornRadius[offset];
of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum;
output_dBornRadius[offset] = of;
cAmoebaSim.pWorkArray_1_1[offset] = of;
of = output_dBornRadiusPolar[offset];
of = cAmoebaSim.pWorkArray_1_2[offset];
of += dBornPolarSum;
output_dBornRadiusPolar[offset] = of;
cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3;
load3dArrayBufferPerWarp( offset, forceSum, outputForce );
load3dArrayBufferPerWarp( offset, torqueSum, outputTorque );
load3dArrayBufferPerWarp( offset, forceSum, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
output_dBornRadius[offset] = dBornSum;
output_dBornRadiusPolar[offset] = dBornPolarSum;
cAmoebaSim.pWorkArray_1_1[offset] = dBornSum;
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
offset *= 3;
load3dArray( offset, forceSum, outputForce );
load3dArray( offset, torqueSum, outputTorque );
load3dArray( offset, forceSum, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );
#endif
......@@ -271,8 +266,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// load shared data
loadKirkwoodShared( &(sA[threadIdx.x]), (y+tgx),
atomCoord, labFrameDipole, labFrameQuadrupole,
inducedDipole, inducedDipolePolar, bornRadii );
cSim.pPosq, cAmoebaSim.pLabFrameDipole, cAmoebaSim.pLabFrameQuadrupole,
cAmoebaSim.pInducedDipoleS, cAmoebaSim.pInducedDipolePolarS, cSim.pBornRadii);
}
......@@ -299,12 +294,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
jInducedDipole, jInducedDipolePolar, &jBornRadius );
calculateKirkwoodPairIxn_kernel( sameAtom,
iCoord, jCoord,
&(labFrameDipole[3*atomI]), jDipole,
&(labFrameQuadrupole[9*atomI]), jQuadrupole,
&(inducedDipole[3*atomI]), jInducedDipole,
&(inducedDipolePolar[3*atomI]), jInducedDipolePolar,
bornRadii[atomI], jBornRadius,
iCoord, jCoord,
&(cAmoebaSim.pLabFrameDipole[3*atomI]), jDipole,
&(cAmoebaSim.pLabFrameQuadrupole[9*atomI]), jQuadrupole,
&(cAmoebaSim.pInducedDipoleS[3*atomI]), jInducedDipole,
&(cAmoebaSim.pInducedDipolePolarS[3*atomI]), jInducedDipolePolar,
cSim.pBornRadii[atomI], jBornRadius,
force, torque, dBorn, dBornPolar, &energy
#ifdef AMOEBA_DEBUG
, pullBack
......@@ -350,7 +345,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = (float) (mask+1);
//debugArray[index].z = bornRadii[atomI];
//debugArray[index].z = cSim.pBornRadii[atomI];
//debugArray[index].z = energy;
debugArray[index].w = (float) (blockIdx.x*blockDim.x+threadIdx.x);
......@@ -407,54 +402,54 @@ if( mask || !mask ){
float of;
unsigned int offset = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
of = output_dBornRadius[offset];
of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum;
output_dBornRadius[offset] = of;
cAmoebaSim.pWorkArray_1_1[offset] = of;
of = output_dBornRadiusPolar[offset];
of = cAmoebaSim.pWorkArray_1_2[offset];
of += dBornPolarSum;
output_dBornRadiusPolar[offset] = of;
cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3;
load3dArrayBufferPerWarp( offset, forceSum, outputForce );
load3dArrayBufferPerWarp( offset, torqueSum, outputTorque );
load3dArrayBufferPerWarp( offset, forceSum, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );
offset = y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
of = output_dBornRadius[offset];
of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum;
output_dBornRadius[offset] = of;
cAmoebaSim.pWorkArray_1_1[offset] = of;
of = output_dBornRadiusPolar[offset];
of = cAmoebaSim.pWorkArray_1_2[offset];
of += dBornPolarSum;
output_dBornRadiusPolar[offset] = of;
cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3;
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, outputForce );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, outputTorque );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
output_dBornRadius[offset] = dBornSum;
output_dBornRadiusPolar[offset] = dBornPolarSum;
cAmoebaSim.pWorkArray_1_1[offset] = dBornSum;
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
offset *= 3;
load3dArray( offset, forceSum, outputForce );
load3dArray( offset, torqueSum, outputTorque );
load3dArray( offset, forceSum, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, torqueSum, cAmoebaSim.pWorkArray_3_2 );
offset = y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
output_dBornRadius[offset] = sA[threadIdx.x].dBornRadius;
output_dBornRadiusPolar[offset] = sA[threadIdx.x].dBornRadiusPolar;
cAmoebaSim.pWorkArray_1_1[offset] = sA[threadIdx.x].dBornRadius;
cAmoebaSim.pWorkArray_1_2[offset] = sA[threadIdx.x].dBornRadiusPolar;
offset *= 3;
load3dArray( offset, sA[threadIdx.x].force, outputForce );
load3dArray( offset, sA[threadIdx.x].torque, outputTorque );
load3dArray( offset, sA[threadIdx.x].force, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
#endif
lasty = y;
......
......@@ -259,7 +259,7 @@ void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int ou
unsigned int j = pos/3;
unsigned int k = pos - 3*j;
fieldOut[4*j+k] = totalField;
fieldOut[4*j+k] += totalField;
pos += gridDim.x * blockDim.x;
}
}
......
......@@ -1165,11 +1165,19 @@ void kCalculateAmoebaLocalForces_kernel()
float dot = xad*xcd + yad*ycd + zad*zcd;
float cc = rad2*rcd2 - dot*dot;
float bkk2 = (cc != 0.0f) ? (rdb2 - ee*ee/cc) : 0.0f;
float cosine = (rdb2 != 0.0f) ? sqrtf(bkk2/rdb2) : 0.0f;
cosine = (cosine > 1.0f) ? 1.0f : cosine;
cosine = (cosine < -1.0f) ? -1.0f : cosine;
float bkk2 = (cc != 0.0f ) ? (ee*ee)/(cc) : 0.0f;
float bkk3 = (rdb2 != 0.0f ) ? bkk2/rdb2 : 0.0f;
bkk2 = rdb2 - bkk2;
float cosine;
if( fabs( bkk3 ) < 0.98f ){
bkk3 = 1.0f - bkk3;
cosine = bkk3 > 0.0f ? sqrtf(bkk3) : 0.0f;
cosine = (cosine > 1.0f) ? 0.0f : acos(cosine);
} else {
cosine = sqrtf(bkk3);
cosine = asin(cosine);
}
/*
c
......@@ -1204,7 +1212,7 @@ c
*/
// find the out-of-plane energy and master chain rule terms
float dt = LOCAL_HACK_RADIAN_D*acos(cosine);
float dt = LOCAL_HACK_RADIAN_D*cosine;
float dt2 = dt * dt;
float dt3 = dt2 * dt;
float dt4 = dt2 * dt2;
......@@ -1215,7 +1223,7 @@ c
(cAmoebaSim.amoebaOutOfPlaneBendPenticK* dt3) +
(cAmoebaSim.amoebaOutOfPlaneBendSexticK* dt4) );
float deddt = k*dt*LOCAL_HACK_RADIAN*(2.0f +
float deddt = k*dt*LOCAL_HACK_RADIAN*(2.0f +
(3.0f*cAmoebaSim.amoebaOutOfPlaneBendCubicK* dt ) +
(4.0f*cAmoebaSim.amoebaOutOfPlaneBendQuarticK*dt2) +
(5.0f*cAmoebaSim.amoebaOutOfPlaneBendPenticK* dt3) +
......@@ -1281,11 +1289,6 @@ c
force.x -= dedxia;
force.y -= dedyia;
force.z -= dedzia;
force.x = bkk2;
force.y = rdb2;
force.z = cosine;
force.w = dt;
cSim.pForce4[offset] = force;
offset = atom1.y + atom2.y * cSim.stride;
......
......@@ -3770,6 +3770,15 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
(void) fprintf( log, "CMMotionRemover added w/ frequency=%d at line=%d\n", frequency, lineCount );
}
// All forces
} else if( field == ALL_FORCES ){
readVec3( filePtr, tokens, forces[ALL_FORCES], &lineCount, field, log );
} else if( field == "AllEnergy" ){
if( tokens.size() > 1 ){
potentialEnergy[ALL_FORCES] = atof( tokens[1].c_str() );
}
// AmoebaHarmonicBond
} else if( field == "AmoebaHarmonicBondParameters" ){
......@@ -3876,6 +3885,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
field == "AmoebaGk_A_Force" ||
field == "AmoebaGk_A_DrB" ||
field == "AmoebaDBorn" ||
field == "AmoebaBorn1Force" ||
field == "AmoebaBornForce" ||
field == "AmoebaGkEdiffForceAndTorque" ||
field == "AmoebaGkEdiffForce" ){
......@@ -3885,6 +3895,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
} else if( field == "AmoebaGkEnergy" ||
field == "AmoebaGkEdiffEnergy" ||
field == "AmoebaGk_A_Energy" ||
field == "AmoebaBorn1Energy" ||
field == "AmoebaBornEnergy" ){
double value = atof( tokens[1].c_str() );
std::vector< std::vector<double> > vectorOfDoubleVectors;
......@@ -3948,14 +3959,19 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
double totalPotentialEnergy = 0.0;
if( log )(void) fprintf( log, "Potential energies\n" );
double allEnergy = 0.0;
for( MapStringDoubleI ii = potentialEnergy.begin(); ii != potentialEnergy.end(); ii++ ){
totalPotentialEnergy += ii->second;
if( ii->first == ALL_FORCES ){
allEnergy = ii->second;
} else {
totalPotentialEnergy += ii->second;
}
if( log )(void) fprintf( log, "%30s %14.7e\n", ii->first.c_str(), ii->second );
}
potentialEnergy["AllForces"] = totalPotentialEnergy;
potentialEnergy["SumOfInputEnergies"] = totalPotentialEnergy;
if( log ){
(void) fprintf( log, "Total PE %14.7e\n", totalPotentialEnergy );
(void) fprintf( log, "Total PE %14.7e %14.7e\n", totalPotentialEnergy, allEnergy );
(void) fprintf( log, "Read %d lines from file=<%s>\n", lineCount, inputParameterFile.c_str() );
(void) fflush( log );
}
......@@ -4614,6 +4630,9 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
activeForceNames += ii->first + ":";
}
}
if( forceList.size() >= 11 ){
activeForceNames =ALL_FORCES;
}
std::vector<Vec3> expectedForces;
expectedForces.resize( system.getNumParticles() );
......@@ -4634,7 +4653,7 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
}
}
int showAll = 0;
int showAll = 1;
double energyConversion;
double forceConversion;
if( useOpenMMUnits ){
......@@ -4650,6 +4669,7 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
if( log ){
std::vector<FILE*> fileList;
if( log )fileList.push_back( log );
double cutoffDelta = 0.02;
for( unsigned int ii = 0; ii < fileList.size(); ii++ ){
FILE* filePtr = fileList[ii];
(void) fprintf( filePtr, "\n" );
......@@ -4668,9 +4688,10 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
double delta = fabs( normF1 - normF2 );
double sumNorms = 0.5*(normF1 + normF2);
double relativeDelta = sumNorms > 0.0 ? fabs( normF1 - normF2 )/sumNorms : 0.0;
if( ( (maxRelativeDelta < relativeDelta) && (sumNorms > 0.1)) || showAll ){
(void) fprintf( filePtr, "%6u %10.3e %10.3e [%14.7e %14.7e %14.7e] [%14.7e %14.7e %14.7e]\n", ii, relativeDelta, delta,
expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forceConversion*forces[ii][0], forceConversion*forces[ii][1], forceConversion*forces[ii][2] );
bool badMatch = (cutoffDelta < relativeDelta) && (sumNorms > 0.1) ? true : false;
if( badMatch || showAll ){
(void) fprintf( filePtr, "%6u %10.3e %10.3e [%14.7e %14.7e %14.7e] [%14.7e %14.7e %14.7e] %s\n", ii, relativeDelta, delta,
expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forceConversion*forces[ii][0], forceConversion*forces[ii][1], forceConversion*forces[ii][2], ( (showAll && badMatch) ? " XXX" : "") );
if( ( (maxRelativeDelta < relativeDelta) && (sumNorms > 0.1)) ){
maxRelativeDelta = relativeDelta;
maxRelativeDeltaIndex = ii;
......@@ -5466,7 +5487,7 @@ int runTestsUsingAmoebaTinkerParameterFile( MapStringString& argumentMap ){
//if( checkEnergyForceConsistency )checkForces = 0;
} else if( key == "log" ){
logControl = atoi( value.c_str() );
} else if( key == "AllForces" ){
} else if( key == ALL_FORCES ){
initializeForceMap( forceMap, 1 );
} else if( key == AMOEBA_HARMONIC_BOND_FORCE ||
key == AMOEBA_HARMONIC_ANGLE_FORCE ||
......
......@@ -76,6 +76,7 @@ static std::string AMOEBA_GK_FORCE = "AmoebaG
static std::string AMOEBA_VDW_FORCE = "AmoebaVdw";
static std::string AMOEBA_WCA_DISPERSION_FORCE = "AmoebaWcaDispersion";
static std::string AMOEBA_SASA_FORCE = "AmoebaSASA";
static std::string ALL_FORCES = "AllForces";
static std::string AMOEBA_MULTIPOLE_ROTATION_MATRICES = "AmoebaMultipoleRotationMatrices";
static std::string AMOEBA_MULTIPOLE_ROTATED = "AmoebaMultipolesRotated";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment