Commit 761d7e17 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Removal of limitation for 'long-range in sequence' covalent bonds

Reduced memory footprint
parent 80c4976e
......@@ -112,8 +112,15 @@ void AmoebaCudaData::initializeGpu( void ) {
if( getHasAmoebaGeneralizedKirkwood() && !getHasAmoebaMultipole() ){
throw OpenMMException("GK force requires Multipole force\n");
}
amoebaGpuBuildOutputBuffers( amoebaGpu, getHasAmoebaGeneralizedKirkwood() );
amoebaGpuBuildThreadBlockWorkList( amoebaGpu );
amoebaGpuBuildVdwExclusionList( amoebaGpu );
amoebaGpuBuildScalingList( amoebaGpu );
amoebaGpuSetConstants( amoebaGpu );
gpuInitialized = true;
if( log ){
gpuPrintCudaAmoebaGmxSimulation( amoebaGpu, getLog() );
(void) fprintf( log, "Gpu initialized\n" );
......
......@@ -93,7 +93,7 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
if( mapIterator == contextToAmoebaDataMap.end() ){
amoebaCudaData = new AmoebaCudaData( cudaPlatformData );
contextToAmoebaDataMap[&context] = amoebaCudaData;
//amoebaCudaData->setLog( stderr );
amoebaCudaData->setLog( stderr );
amoebaCudaData->setContextImpl( static_cast<void*>(&context) );
} else {
amoebaCudaData = mapIterator->second;
......
......@@ -99,7 +99,7 @@ extern void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext a
extern void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext gpu);
extern void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu );
extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, char* fname, int timestep, int entriesPerAtom1, CUDAStream<float4>* array1,
extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, int timestep, int entriesPerAtom1, CUDAStream<float4>* array1,
int entriesPerAtom2, CUDAStream<float>* array2 );
extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
......@@ -112,9 +112,7 @@ extern void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu );
extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void cudaComputeAmoebaMapTorques( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce);
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce, CUDAStream<float4>* psOutputForce);
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float4>* psOutputForce);
extern void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque );
extern void SetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
......@@ -143,14 +141,14 @@ extern void cudaReduceN2ToN( float *N2Array, int N, float *NArray, int includeD
extern float cudaGetSum( int numberOfElements, CUDAStream<float>* array );
extern float cudaGetNorm2( int numberOfElements, CUDAStream<float>* array );
extern int checkForNansAndInfinities( int numberOfElements, CUDAStream<float>* array );
extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, char* fname, std::vector<int>& fileId, int entriesPerAtom1, CUDAStream<float>* array1,
extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, std::vector<int>& fileId, int entriesPerAtom1, CUDAStream<float>* array1,
int entriesPerAtom2, CUDAStream<float>* array2 );
extern void readFile( std::string fileName, StringVectorVector& fileContents );
extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector, float conversion );
extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
extern void cudaWriteVectorOfDoubleVectorsToFile( const std::string& fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
extern void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, float initValue );
extern void checkForNans( int numberOfParticles, int entriesPerParticle,
CUDAStream<float>* array, int* order, int iteration, std::string idString, FILE* log );
......@@ -166,6 +164,8 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
//extern int isNanOrInfinity( double number );
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy );
extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>* outputStream, float conversion );
// PME
......
......@@ -50,6 +50,7 @@ enum CudaAmoebaNonbondedMethod
static const int AMOEBA_PME_ORDER = 5;
struct cudaAmoebaGmxSimulation {
// Constants
unsigned int amoebaBonds; // Number of bonds
......@@ -132,19 +133,20 @@ struct cudaAmoebaGmxSimulation {
float amoebaUreyBradleyQuarticicParameter; // quartic parameter
unsigned int amoebaUreyBradley_offset; // Offset to end of bonds
unsigned int numberOfAtoms; // number of atoms
unsigned int paddedNumberOfAtoms; // padded number of atoms
//float cutoffDistance2; // cutoff distance squared for PME
float sqrtPi; // sqrt(PI)
float scalingDistanceCutoff; // scaling cutoff
float2* pDampingFactorAndThole; // Thole & damping factors
int4* pMultipoleParticlesIdsAndAxisType;
int* pMultipoleAxisOffset;
int4* pMultipoleParticlesTorqueBufferIndices;
float4* pTorqueMapForce4;
float* pMolecularDipole;
float* pMolecularQuadrupole;
float* pLabFrameDipole;
float* pLabFrameQuadrupole;
float* pInducedDipole;
float* pInducedDipolePolar;
......@@ -171,6 +173,7 @@ struct cudaAmoebaGmxSimulation {
int* pVdwExclusionIndices;
// WCA constants
float epso;
float epsh;
float rmino;
......@@ -180,6 +183,7 @@ struct cudaAmoebaGmxSimulation {
float dispoff;
float totalMaxWcaDispersionEnergy;
// scaling indices
int* pScaleIndicesIndex;
int* pD_ScaleIndices;
......
......@@ -33,33 +33,31 @@
#define THREADS_PER_BLOCK 256
#include <map>
//using namespace std;
typedef std::map<int,float> MapIntFloat;
//typedef MapIntFloat::iterator MapIntFloatI;
typedef MapIntFloat::const_iterator MapIntFloatCI;
/* Pointer to this structure will be given
* to gromacs functions*/
/*
* Remove
* pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
* allocation of torqueMapForce psCovalentDegree psPolarizationDegree
*
THREADS_PER_BLOCK
*/
struct _amoebaGpuContext {
_gpuContext* gpuContext;
FILE* log;
// diagnostic arrays
MapIntFloat** pMapArray;
MapIntFloat** dMapArray;
bool bOutputBufferPerWarp;
unsigned int paddedNumberOfAtoms;
unsigned int nonbondBlocks;
unsigned int nonbondThreadsPerBlock;
unsigned int nonbondOutputBuffers;
unsigned int threadsPerBlock;
unsigned int fieldReduceThreadsPerBlock;
unsigned int outputBuffers;
//bool bOutputBufferPerWarp;
//unsigned int paddedNumberOfAtoms;
//unsigned int nonbondBlocks;
//unsigned int nonbondThreadsPerBlock;
//unsigned int nonbondOutputBuffers;
//unsigned int threadsPerBlock;
//unsigned int fieldReduceThreadsPerBlock;
//unsigned int outputBuffers;
unsigned int workUnits;
// workspace arrays
......@@ -120,14 +118,19 @@ struct _amoebaGpuContext {
float solventDielectric;
// rotation matrix
CUDAStream<float>* psRotationMatrix;
// multipole parameters
CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
CUDAStream<int>* psMultipoleAxisOffset;
// buffer indices used for mapping torques onto forces
int maxTorqueBufferIndex;
int useNewTorqueMapScheme;
int torqueMapForce4Delete;
CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
CUDAStream<float4>* psTorqueMapForce4;
CUDAStream<float>* psMolecularDipole;
CUDAStream<float>* psMolecularQuadrupole;
......@@ -175,11 +178,7 @@ struct _amoebaGpuContext {
// electrostatic
CUDAStream<float>* psForce;
CUDAStream<float>* psTorque;
CUDAStream<float>* torqueMapForce;
int maxMapTorqueDifference;
int maxMapTorqueDifferencePow2;
// Kirkwood fields
......@@ -188,8 +187,6 @@ struct _amoebaGpuContext {
CUDAStream<float>* psInducedDipolePolarS;
CUDAStream<float>* psBorn;
CUDAStream<float>* psBornPolar;
CUDAStream<float>* psKirkwoodForce;
CUDAStream<float>* psKirkwoodEDiffForce;
int includeObcCavityTerm;
......@@ -208,6 +205,7 @@ struct _amoebaGpuContext {
int vdwSigmaCombiningRule;
int vdwEpsilonCombiningRule;
std::vector< std::vector<int> > vdwExclusions;
// Wca dispersion fields
......@@ -239,7 +237,7 @@ extern "C"
void amoebaGpuShutDown(amoebaGpuContext gpu);
extern "C"
void amoebaGpuBuildOutputBuffers( amoebaGpuContext gpu );
void amoebaGpuBuildOutputBuffers( amoebaGpuContext gpu, int hasKirkwood );
extern "C"
int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext gpu );
......@@ -327,7 +325,7 @@ extern "C"
void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);
extern "C"
void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu, const std::vector< std::vector<int> >& exclusions );
void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu );
extern "C"
void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
......
......@@ -101,6 +101,30 @@ __device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* for
}
__device__ static void add3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
{
float4 of;
of = outputForce[offset];
of.x += forceSum[0];
of.y += forceSum[1];
of.z += forceSum[2];
outputForce[offset] = of;
}
__device__ static void load3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
{
float4 of;
of.x = forceSum[0];
of.y = forceSum[1];
of.z = forceSum[2];
of.w = 0.0f;
outputForce[offset] = of;
}
__device__ static void load3dArray( unsigned int offset, float* forceSum, float* outputForce )
{
......@@ -110,6 +134,15 @@ __device__ static void load3dArray( unsigned int offset, float* forceSum, float*
}
__device__ static void add3dArray( unsigned int offset, float* forceSum, float* outputForce )
{
outputForce[offset] += forceSum[0];
outputForce[offset+1] += forceSum[1];
outputForce[offset+2] += forceSum[2];
}
__device__ static void scale3dArray( float scaleFactor, float* force )
{
......
......@@ -698,22 +698,17 @@ __device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsig
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaElectrostatic.h"
// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque
// reduce psWorkArray_3_1 -> torque
static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
static void kReduceTorque(amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData );
LAUNCHERROR("kReduceElectrostaticForce");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceElectrostaticTorque");
}
/**---------------------------------------------------------------------------------------
Compute Amoeba electrostatic force & torque
......@@ -728,7 +723,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaElectrostatic";
......@@ -752,7 +746,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
}
static const int maxSlots =20;
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int paddedNumberOfAtoms = gpu->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
......@@ -761,6 +755,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
......@@ -772,70 +767,59 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
}
kClearFields_3( amoebaGpu, 2 );
kClearFields_3( amoebaGpu, 1 );
LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic");
if (gpu->bOutputBufferPerWarp){
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u xnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
kReduceForceTorque( amoebaGpu );
kReduceTorque( amoebaGpu );
LAUNCHERROR("kReduceForceTorque");
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psForce->Download();
amoebaGpu->psTorque->Download();
debugArray->Download();
......@@ -847,13 +831,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"ElectrostaticF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysData[indexOffset],
amoebaGpu->psForce->_pSysData[indexOffset+1],
amoebaGpu->psForce->_pSysData[indexOffset+2] );
// torque
(void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
......@@ -925,10 +902,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psForce->_pSysData[offset],
amoebaGpu->psForce->_pSysData[offset+1],
amoebaGpu->psForce->_pSysData[offset+2],
(void) fprintf( amoebaGpu->log,"%6d T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] );
......@@ -941,9 +915,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
cudaWriteVectorOfDoubleVectorsToFile( "CudaTorque", fileId, outputVector );
}
}
......@@ -956,7 +929,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f/4.184 );
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f/4.184 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
}
......
......@@ -6,7 +6,7 @@
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -30,7 +30,6 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
// reduce psWorkArray_3_1 -> E_Field
// reduce psWorkArray_3_2 -> E_FieldPolar
// reduce psWorkArray_3_3 -> Gk_FieldPolar
......@@ -38,18 +37,19 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields2");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields3");
}
......@@ -330,8 +330,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
......@@ -342,13 +340,11 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// N2 debug array
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
int maxSlots = 10;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
(*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
gpu->psInteractionCount->Upload();
// print intermediate results for the targetAtom
unsigned int targetAtom = 0;
......@@ -356,6 +352,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
......@@ -372,21 +369,16 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
gpu->psBornRadii->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom );
......@@ -396,14 +388,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
} else {
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
gpu->psBornRadii->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom );
......@@ -413,27 +400,13 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR("kCalculateAmoebaFixedEAndGkFieldN2_kernel");
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysData[kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReduceEAndGkFields( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
......@@ -481,15 +454,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
}
}
(void) fflush( amoebaGpu->log );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printEFieldAtomBuffers( amoebaGpu, 100 );
//printEFieldBuffer( amoebaGpu, 0 );
//printEFieldBuffer( amoebaGpu, 1 );
//printEFieldBuffer( amoebaGpu, 37 );
//printEFieldBuffer( amoebaGpu, 38 );
(void) fprintf( amoebaGpu->log, "EFields End\n" );
(void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download();
......@@ -497,7 +461,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
int ii = targetAtom;
(void) fprintf( amoebaGpu->log,"\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
unsigned int count = 0;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ",
......@@ -521,10 +484,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f);
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, outputVector, NULL, 1.0f);
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector );
}
......@@ -532,5 +495,4 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
}
#endif
//exit(0);
}
......@@ -36,13 +36,14 @@ void GetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext amoebaGpu)
static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
LAUNCHERROR("kReduceE_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
LAUNCHERROR("kReduceE_Fields2");
}
......@@ -73,12 +74,6 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
......@@ -104,6 +99,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 2 );
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
......@@ -118,15 +114,14 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n", methodName,
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......@@ -137,7 +132,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
#endif
} else {
kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......@@ -151,7 +146,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
......@@ -170,7 +165,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
amoebaGpu->psWorkArray_3_1->Download();
......@@ -205,14 +200,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
}
(void) fflush( amoebaGpu->log );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printEFieldAtomBuffers( amoebaGpu, 100 );
//printEFieldBuffer( amoebaGpu, 0 );
//printEFieldBuffer( amoebaGpu, 1 );
//printEFieldBuffer( amoebaGpu, 37 );
//printEFieldBuffer( amoebaGpu, 38 );
(void) fprintf( amoebaGpu->log, "EFields End\n" );
(void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download();
......
......@@ -129,7 +129,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
else // bExclusion
{
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
......@@ -182,38 +182,38 @@ if( 0 && atomI == targetAtom ){
/*
pullBackIndex += 2;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : dScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : dScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : dScaleVal*ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : pScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : pScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : pScaleVal*ijField[indexI][2];
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
......@@ -222,12 +222,12 @@ if( 0 && atomI == targetAtom ){
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
......@@ -241,11 +241,11 @@ if( 0 && atomI == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
#endif
......@@ -314,48 +314,48 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
......@@ -378,7 +378,7 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
......@@ -435,38 +435,38 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
pullBackIndex += 2;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dScaleVal*ijField[indexI][0];
debugArray[index].y = dScaleVal*ijField[indexI][1];
debugArray[index].z = dScaleVal*ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pScaleVal*ijField[indexI][0];
debugArray[index].y = pScaleVal*ijField[indexI][1];
debugArray[index].z = pScaleVal*ijField[indexI][2];
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
......@@ -476,12 +476,12 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
......@@ -500,20 +500,20 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].eField, outputEField );
load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
......
......@@ -133,7 +133,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
// torques include i == j contribution
......@@ -172,29 +172,29 @@ if( atomI == targetAtom || atomJ == targetAtom ){
index = debugAccumulate( index, debugArray, force, mask, 1.0f );
mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
index = debugAccumulate( index, debugArray, torque[indexI], mask, 2.0f );
index = debugAccumulate( index, debugArray, torque[indexJ], mask, 3.0f );
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[0].x;
debugArray[index].y = pullBack[0].y;
debugArray[index].z = pullBack[0].z;
debugArray[index].w = pullBack[0].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[1].x;
debugArray[index].y = pullBack[1].y;
debugArray[index].z = pullBack[1].z;
debugArray[index].w = pullBack[1].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[2].x;
debugArray[index].y = pullBack[2].y;
debugArray[index].z = pullBack[2].z;
debugArray[index].w = pullBack[2].w;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dBorn[0];
debugArray[index].y = dBornPolar[0];
debugArray[index].z = dBorn[1];
......@@ -209,7 +209,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum;
......@@ -219,23 +219,17 @@ if( atomI == targetAtom || atomJ == targetAtom ){
of += dBornPolarSum;
cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3;
load3dArrayBufferPerWarp( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
cAmoebaSim.pWorkArray_1_1[offset] = dBornSum;
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
offset *= 3;
load3dArray( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4);
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
#endif
}
......@@ -278,7 +272,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
......@@ -325,25 +319,25 @@ if( atomI == targetAtom || atomJ == targetAtom ){
index = debugAccumulate( index, debugArray, torque[indexI], mask, -2.0f );
index = debugAccumulate( index, debugArray, torque[indexJ], mask, -3.0f );
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[0].x;
debugArray[index].y = pullBack[0].y;
debugArray[index].z = pullBack[0].z;
debugArray[index].w = -1.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[1].x;
debugArray[index].y = pullBack[1].y;
debugArray[index].z = pullBack[1].z;
debugArray[index].w = pullBack[1].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[2].x;
debugArray[index].y = pullBack[2].y;
debugArray[index].z = pullBack[2].z;
debugArray[index].w = pullBack[2].w;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dBorn[0];
debugArray[index].y = dBornPolar[0];
debugArray[index].z = dBorn[1];
......@@ -355,7 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
//#ifdef AMOEBA_DEBUG
#if 0
if( mask || !mask ){
unsigned int index = atomJ + atomI*cAmoebaSim.paddedNumberOfAtoms;
unsigned int index = atomJ + atomI*cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = energy;
......@@ -372,7 +366,7 @@ if( mask || !mask ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum;
......@@ -382,12 +376,10 @@ if( mask || !mask ){
of += dBornPolarSum;
cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3;
load3dArrayBufferPerWarp( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
offset = y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms;
offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum;
......@@ -397,31 +389,24 @@ if( mask || !mask ){
of += dBornPolarSum;
cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3;
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
cAmoebaSim.pWorkArray_1_1[offset] = dBornSum;
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
offset *= 3;
load3dArray( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
offset = y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
cAmoebaSim.pWorkArray_1_1[offset] = sA[threadIdx.x].dBornRadius;
cAmoebaSim.pWorkArray_1_2[offset] = sA[threadIdx.x].dBornRadiusPolar;
offset *= 3;
load3dArray( offset, sA[threadIdx.x].force, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
#endif
lasty = y;
......
......@@ -894,7 +894,7 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
#ifdef AMOEBA_DEBUG
__device__ static int debugAccumulate( unsigned int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
{
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = addMask ? field[0] : 0.0f;
debugArray[index].y = addMask ? field[1] : 0.0f;
debugArray[index].z = addMask ? field[2] : 0.0f;
......@@ -928,79 +928,18 @@ __device__ void zeroKirkwoodEDiffParticleSharedField( struct KirkwoodEDiffPartic
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaKirkwoodEDiff.h"
// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque
// reduce psWorkArray_3_1 -> torque
static void kReduceForceTorque( amoebaGpuContext amoebaGpu )
static void kReduceTorque( amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psKirkwoodEDiffForce->_pDevData );
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff2");
}
#ifdef AMOEBA_DEBUG
//#if 1
static void printKirkwoodEDiffBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "KirkwoodEDiff Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
/*
start = 0;
stop = -146016;
float maxV = -1.0e+99;
for( unsigned int ii = start; ii < stop; ii += 3 ){
if( amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
}
}
*/
}
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
static void printKirkwoodEDiffAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
(void) fprintf( amoebaGpu->log, "KirkwoodEDiff atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff");
}
#endif
/**---------------------------------------------------------------------------------------
......@@ -1016,7 +955,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
static int timestep = 0;
timestep++;
#ifdef AMOEBA_DEBUG
......@@ -1050,6 +988,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 6 );
LAUNCHERROR("kClearFields_3_kCalculateAmoebaCudaKirkwoodEDiff");
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
......@@ -1065,7 +1004,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
if( amoebaGpu->log && timestep == 1 ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
" ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock );
(void) fflush( amoebaGpu->log );
......@@ -1074,7 +1013,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
......@@ -1083,17 +1022,16 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData,
amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
......@@ -1102,160 +1040,23 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData,
amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");
kReduceForceTorque( amoebaGpu );
kReduceTorque( amoebaGpu );
LAUNCHERROR("kReduceForceTorque_kCalculateAmoebaCudaKirkwoodEDiff");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
amoebaGpu->psKirkwoodEDiffForce->Download();
amoebaGpu->psTorque->Download();
debugArray->Download();
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
// torque
(void) fprintf( amoebaGpu->log,"T [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
if( ii == targetAtom ){
(void) fprintf( amoebaGpu->log,"\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d ediff F%T\n", ii, jj );
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
(void) fprintf( amoebaGpu->log,"\n" );
}
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
{
(void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
int maxPrint = 12;
for( int ii = 0; ii < gpu->natoms; ii++ ){
// print cpu & gpu reductions
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+2],
amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] );
if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f);
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
}
}
delete debugArray;
#endif
// map torques to forces
cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce, amoebaGpu->gpuContext->psForce4 );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
cudaComputeAmoebaMapTorques( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce );
amoebaGpu->psKirkwoodEDiffForce->Download();
(void) fprintf( amoebaGpu->log, "Mapped KirkwoodEDiff torques to forces.\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector );
}
}
#endif
if( 0 ){
cudaComputeAmoebaMapTorques( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce );
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector );
}
// ---------------------------------------------------------------------------------------
}
......@@ -43,7 +43,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
float* inducedDipolePolar,
float* inducedDipoleS,
float* inducedDipolePolarS,
float* outputForce,
float* outputTorque
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
......@@ -138,7 +137,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
// torques include i == j contribution
......@@ -166,12 +165,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
debugArray[index].z = mask ? tinker_f*energy : 0.0f;
index = debugAccumulate( index, debugArray, force, mask, 1.0f );
mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f );
index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f );
}
......@@ -181,7 +180,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
else // bExclusion
{
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
......@@ -203,7 +202,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#endif
);
unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI == atomJ) || (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
// torques include i == j contribution
......@@ -233,7 +232,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
index = debugAccumulate( index, debugArray, force, mask, 1.0f );
//mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
//mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f );
index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f );
}
......@@ -249,15 +248,15 @@ if( atomI == targetAtom || atomJ == targetAtom ){
scale3dArray( tinker_f, localParticle.torque );
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
load3dArrayBufferPerWarp( offset, localParticle.force, outputForce );
load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque );
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, localParticle.force, outputForce );
load3dArray( offset, localParticle.torque, outputTorque );
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque );
#endif
......@@ -304,7 +303,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
......@@ -355,7 +354,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
......@@ -378,7 +377,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
......@@ -435,26 +434,26 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
load3dArrayBufferPerWarp( offset, localParticle.force, outputForce );
load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque );
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, outputForce );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, outputTorque );
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, outputTorque );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
load3dArray( offset, localParticle.force, outputForce );
load3dArray( offset, localParticle.torque, outputTorque );
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
load3dArray( offset, sA[threadIdx.x].force, outputForce );
load3dArray( offset, sA[threadIdx.x].torque, outputTorque );
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#endif
lasty = y;
......
......@@ -33,8 +33,8 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#define AMOEBA_DEBUG
//#undef AMOEBA_DEBUG
#define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
......@@ -216,7 +216,7 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced
#ifdef AMOEBA_DEBUG
__device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
{
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = addMask ? field[0] : 0.0f;
debugArray[index].y = addMask ? field[1] : 0.0f;
debugArray[index].z = addMask ? field[2] : 0.0f;
......@@ -256,7 +256,7 @@ void kInitializeMutualInducedAndGkField_kernel(
{
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cAmoebaSim.numberOfAtoms )return;
if( threadId >= 3*cSim.atoms )return;
fixedEField[threadId] *= polarizability[threadId];
inducedDipole[threadId] = fixedEField[threadId];
......@@ -292,7 +292,7 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
// load deltas
while( pos < 3*cAmoebaSim.numberOfAtoms )
while( pos < 3*cSim.atoms )
{
delta[threadIdx.x].x += arrayOfDeltas1[pos];
delta[threadIdx.x].y += arrayOfDeltas2[pos];
......@@ -324,12 +324,12 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
epsilon[0] = epsilon[0] < delta[0].y ? delta[0].y : epsilon[0];
epsilon[0] = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0];
epsilon[0] = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0];
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) cAmoebaSim.numberOfAtoms ) );
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) cSim.atoms ) );
#ifdef AMOEBA_DEBUG
epsilon[1] = 48.033324f*sqrtf( delta[0].x/( (float) cAmoebaSim.numberOfAtoms ) );
epsilon[2] = 48.033324f*sqrtf( delta[0].y/( (float) cAmoebaSim.numberOfAtoms ) );
epsilon[3] = 48.033324f*sqrtf( delta[0].z/( (float) cAmoebaSim.numberOfAtoms ) );
epsilon[4] = 48.033324f*sqrtf( delta[0].w/( (float) cAmoebaSim.numberOfAtoms ) );
epsilon[1] = 48.033324f*sqrtf( delta[0].x/( (float) cSim.atoms ) );
epsilon[2] = 48.033324f*sqrtf( delta[0].y/( (float) cSim.atoms ) );
epsilon[3] = 48.033324f*sqrtf( delta[0].z/( (float) cSim.atoms ) );
epsilon[4] = 48.033324f*sqrtf( delta[0].w/( (float) cSim.atoms ) );
#endif
}
}
......@@ -356,7 +356,7 @@ void kSorUpdateMutualInducedAndGkField_kernel(
float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cAmoebaSim.numberOfAtoms)return;
if( threadId >= 3*cSim.atoms)return;
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
......@@ -390,7 +390,7 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cAmoebaSim.numberOfAtoms)return;
if( threadId >= 3*cSim.atoms)return;
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
......@@ -415,23 +415,24 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray,
CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields2");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields3");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields4");
}
......@@ -441,12 +442,12 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
unsigned int bufferIndex = ii3Index/(gpu->sim.paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
......@@ -461,8 +462,8 @@ static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferI
static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
(void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
......@@ -530,8 +531,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
......@@ -539,7 +540,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
......@@ -552,7 +553,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
#endif
} else {
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
......@@ -745,6 +746,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName ); fflush( amoebaGpu->log );
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
amoebaGpu->psInducedDipole->Download(),
......@@ -753,7 +756,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
amoebaGpu->psInducedDipolePolarS->Download();
amoebaGpu->psPolarizability->Download();
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
int offset = 0;
int maxPrint = 10;
for( int ii = 0; ii < gpu->natoms; ii++ ){
......@@ -921,7 +923,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
}
#ifdef AMOEBA_DEBUG
if( 0 ){
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment