Commit 761d7e17 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Removal of limitation for 'long-range in sequence' covalent bonds

Reduced memory footprint
parent 80c4976e
...@@ -112,8 +112,15 @@ void AmoebaCudaData::initializeGpu( void ) { ...@@ -112,8 +112,15 @@ void AmoebaCudaData::initializeGpu( void ) {
if( getHasAmoebaGeneralizedKirkwood() && !getHasAmoebaMultipole() ){ if( getHasAmoebaGeneralizedKirkwood() && !getHasAmoebaMultipole() ){
throw OpenMMException("GK force requires Multipole force\n"); throw OpenMMException("GK force requires Multipole force\n");
} }
amoebaGpuBuildOutputBuffers( amoebaGpu, getHasAmoebaGeneralizedKirkwood() );
amoebaGpuBuildThreadBlockWorkList( amoebaGpu );
amoebaGpuBuildVdwExclusionList( amoebaGpu );
amoebaGpuBuildScalingList( amoebaGpu );
amoebaGpuSetConstants( amoebaGpu ); amoebaGpuSetConstants( amoebaGpu );
gpuInitialized = true; gpuInitialized = true;
if( log ){ if( log ){
gpuPrintCudaAmoebaGmxSimulation( amoebaGpu, getLog() ); gpuPrintCudaAmoebaGmxSimulation( amoebaGpu, getLog() );
(void) fprintf( log, "Gpu initialized\n" ); (void) fprintf( log, "Gpu initialized\n" );
......
...@@ -93,7 +93,7 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl ...@@ -93,7 +93,7 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
if( mapIterator == contextToAmoebaDataMap.end() ){ if( mapIterator == contextToAmoebaDataMap.end() ){
amoebaCudaData = new AmoebaCudaData( cudaPlatformData ); amoebaCudaData = new AmoebaCudaData( cudaPlatformData );
contextToAmoebaDataMap[&context] = amoebaCudaData; contextToAmoebaDataMap[&context] = amoebaCudaData;
//amoebaCudaData->setLog( stderr ); amoebaCudaData->setLog( stderr );
amoebaCudaData->setContextImpl( static_cast<void*>(&context) ); amoebaCudaData->setContextImpl( static_cast<void*>(&context) );
} else { } else {
amoebaCudaData = mapIterator->second; amoebaCudaData = mapIterator->second;
......
...@@ -99,7 +99,7 @@ extern void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext a ...@@ -99,7 +99,7 @@ extern void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext a
extern void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext gpu); extern void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext gpu);
extern void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ); extern void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu );
extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, char* fname, int timestep, int entriesPerAtom1, CUDAStream<float4>* array1, extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, int timestep, int entriesPerAtom1, CUDAStream<float4>* array1,
int entriesPerAtom2, CUDAStream<float>* array2 ); int entriesPerAtom2, CUDAStream<float>* array2 );
extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu ); extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
...@@ -112,9 +112,7 @@ extern void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu ); ...@@ -112,9 +112,7 @@ extern void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu );
extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu); extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu); extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void cudaComputeAmoebaMapTorques( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce); extern void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque );
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce, CUDAStream<float4>* psOutputForce);
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float4>* psOutputForce);
extern void SetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu ); extern void SetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu ); extern void GetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
...@@ -143,14 +141,14 @@ extern void cudaReduceN2ToN( float *N2Array, int N, float *NArray, int includeD ...@@ -143,14 +141,14 @@ extern void cudaReduceN2ToN( float *N2Array, int N, float *NArray, int includeD
extern float cudaGetSum( int numberOfElements, CUDAStream<float>* array ); extern float cudaGetSum( int numberOfElements, CUDAStream<float>* array );
extern float cudaGetNorm2( int numberOfElements, CUDAStream<float>* array ); extern float cudaGetNorm2( int numberOfElements, CUDAStream<float>* array );
extern int checkForNansAndInfinities( int numberOfElements, CUDAStream<float>* array ); extern int checkForNansAndInfinities( int numberOfElements, CUDAStream<float>* array );
extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, char* fname, std::vector<int>& fileId, int entriesPerAtom1, CUDAStream<float>* array1, extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, std::vector<int>& fileId, int entriesPerAtom1, CUDAStream<float>* array1,
int entriesPerAtom2, CUDAStream<float>* array2 ); int entriesPerAtom2, CUDAStream<float>* array2 );
extern void readFile( std::string fileName, StringVectorVector& fileContents ); extern void readFile( std::string fileName, StringVectorVector& fileContents );
extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion ); extern void cudaLoadCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector, float conversion ); extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion ); extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
extern void cudaWriteVectorOfDoubleVectorsToFile( char* fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector ); extern void cudaWriteVectorOfDoubleVectorsToFile( const std::string& fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
extern void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, float initValue ); extern void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, float initValue );
extern void checkForNans( int numberOfParticles, int entriesPerParticle, extern void checkForNans( int numberOfParticles, int entriesPerParticle,
CUDAStream<float>* array, int* order, int iteration, std::string idString, FILE* log ); CUDAStream<float>* array, int* order, int iteration, std::string idString, FILE* log );
...@@ -166,6 +164,8 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int ...@@ -166,6 +164,8 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
//extern int isNanOrInfinity( double number ); //extern int isNanOrInfinity( double number );
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration); extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy );
extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>* outputStream, float conversion );
// PME // PME
......
...@@ -50,6 +50,7 @@ enum CudaAmoebaNonbondedMethod ...@@ -50,6 +50,7 @@ enum CudaAmoebaNonbondedMethod
static const int AMOEBA_PME_ORDER = 5; static const int AMOEBA_PME_ORDER = 5;
struct cudaAmoebaGmxSimulation { struct cudaAmoebaGmxSimulation {
// Constants // Constants
unsigned int amoebaBonds; // Number of bonds unsigned int amoebaBonds; // Number of bonds
...@@ -132,19 +133,20 @@ struct cudaAmoebaGmxSimulation { ...@@ -132,19 +133,20 @@ struct cudaAmoebaGmxSimulation {
float amoebaUreyBradleyQuarticicParameter; // quartic parameter float amoebaUreyBradleyQuarticicParameter; // quartic parameter
unsigned int amoebaUreyBradley_offset; // Offset to end of bonds unsigned int amoebaUreyBradley_offset; // Offset to end of bonds
unsigned int numberOfAtoms; // number of atoms
unsigned int paddedNumberOfAtoms; // padded number of atoms
//float cutoffDistance2; // cutoff distance squared for PME
float sqrtPi; // sqrt(PI) float sqrtPi; // sqrt(PI)
float scalingDistanceCutoff; // scaling cutoff float scalingDistanceCutoff; // scaling cutoff
float2* pDampingFactorAndThole; // Thole & damping factors float2* pDampingFactorAndThole; // Thole & damping factors
int4* pMultipoleParticlesIdsAndAxisType; int4* pMultipoleParticlesIdsAndAxisType;
int* pMultipoleAxisOffset; int4* pMultipoleParticlesTorqueBufferIndices;
float4* pTorqueMapForce4;
float* pMolecularDipole; float* pMolecularDipole;
float* pMolecularQuadrupole; float* pMolecularQuadrupole;
float* pLabFrameDipole; float* pLabFrameDipole;
float* pLabFrameQuadrupole; float* pLabFrameQuadrupole;
float* pInducedDipole; float* pInducedDipole;
float* pInducedDipolePolar; float* pInducedDipolePolar;
...@@ -171,6 +173,7 @@ struct cudaAmoebaGmxSimulation { ...@@ -171,6 +173,7 @@ struct cudaAmoebaGmxSimulation {
int* pVdwExclusionIndices; int* pVdwExclusionIndices;
// WCA constants // WCA constants
float epso; float epso;
float epsh; float epsh;
float rmino; float rmino;
...@@ -180,7 +183,8 @@ struct cudaAmoebaGmxSimulation { ...@@ -180,7 +183,8 @@ struct cudaAmoebaGmxSimulation {
float dispoff; float dispoff;
float totalMaxWcaDispersionEnergy; float totalMaxWcaDispersionEnergy;
// scaling indices
// scaling indices
int* pScaleIndicesIndex; int* pScaleIndicesIndex;
int* pD_ScaleIndices; int* pD_ScaleIndices;
int2* pP_ScaleIndices; int2* pP_ScaleIndices;
......
...@@ -33,33 +33,31 @@ ...@@ -33,33 +33,31 @@
#define THREADS_PER_BLOCK 256 #define THREADS_PER_BLOCK 256
#include <map> #include <map>
//using namespace std;
typedef std::map<int,float> MapIntFloat; typedef std::map<int,float> MapIntFloat;
//typedef MapIntFloat::iterator MapIntFloatI;
typedef MapIntFloat::const_iterator MapIntFloatCI; typedef MapIntFloat::const_iterator MapIntFloatCI;
/* Pointer to this structure will be given /*
* to gromacs functions*/ * Remove
* pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
* allocation of torqueMapForce psCovalentDegree psPolarizationDegree
*
THREADS_PER_BLOCK
*/
struct _amoebaGpuContext { struct _amoebaGpuContext {
_gpuContext* gpuContext; _gpuContext* gpuContext;
FILE* log; FILE* log;
// diagnostic arrays //bool bOutputBufferPerWarp;
//unsigned int paddedNumberOfAtoms;
MapIntFloat** pMapArray; //unsigned int nonbondBlocks;
MapIntFloat** dMapArray; //unsigned int nonbondThreadsPerBlock;
//unsigned int nonbondOutputBuffers;
bool bOutputBufferPerWarp; //unsigned int threadsPerBlock;
unsigned int paddedNumberOfAtoms; //unsigned int fieldReduceThreadsPerBlock;
unsigned int nonbondBlocks; //unsigned int outputBuffers;
unsigned int nonbondThreadsPerBlock;
unsigned int nonbondOutputBuffers;
unsigned int threadsPerBlock;
unsigned int fieldReduceThreadsPerBlock;
unsigned int outputBuffers;
unsigned int workUnits; unsigned int workUnits;
// workspace arrays // workspace arrays
...@@ -120,14 +118,19 @@ struct _amoebaGpuContext { ...@@ -120,14 +118,19 @@ struct _amoebaGpuContext {
float solventDielectric; float solventDielectric;
// rotation matrix
CUDAStream<float>* psRotationMatrix;
// multipole parameters // multipole parameters
CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType; CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
CUDAStream<int>* psMultipoleAxisOffset; CUDAStream<int>* psMultipoleAxisOffset;
// buffer indices used for mapping torques onto forces
int maxTorqueBufferIndex;
int useNewTorqueMapScheme;
int torqueMapForce4Delete;
CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
CUDAStream<float4>* psTorqueMapForce4;
CUDAStream<float>* psMolecularDipole; CUDAStream<float>* psMolecularDipole;
CUDAStream<float>* psMolecularQuadrupole; CUDAStream<float>* psMolecularQuadrupole;
...@@ -175,11 +178,7 @@ struct _amoebaGpuContext { ...@@ -175,11 +178,7 @@ struct _amoebaGpuContext {
// electrostatic // electrostatic
CUDAStream<float>* psForce;
CUDAStream<float>* psTorque; CUDAStream<float>* psTorque;
CUDAStream<float>* torqueMapForce;
int maxMapTorqueDifference;
int maxMapTorqueDifferencePow2;
// Kirkwood fields // Kirkwood fields
...@@ -188,8 +187,6 @@ struct _amoebaGpuContext { ...@@ -188,8 +187,6 @@ struct _amoebaGpuContext {
CUDAStream<float>* psInducedDipolePolarS; CUDAStream<float>* psInducedDipolePolarS;
CUDAStream<float>* psBorn; CUDAStream<float>* psBorn;
CUDAStream<float>* psBornPolar; CUDAStream<float>* psBornPolar;
CUDAStream<float>* psKirkwoodForce;
CUDAStream<float>* psKirkwoodEDiffForce;
int includeObcCavityTerm; int includeObcCavityTerm;
...@@ -208,6 +205,7 @@ struct _amoebaGpuContext { ...@@ -208,6 +205,7 @@ struct _amoebaGpuContext {
int vdwSigmaCombiningRule; int vdwSigmaCombiningRule;
int vdwEpsilonCombiningRule; int vdwEpsilonCombiningRule;
std::vector< std::vector<int> > vdwExclusions;
// Wca dispersion fields // Wca dispersion fields
...@@ -239,7 +237,7 @@ extern "C" ...@@ -239,7 +237,7 @@ extern "C"
void amoebaGpuShutDown(amoebaGpuContext gpu); void amoebaGpuShutDown(amoebaGpuContext gpu);
extern "C" extern "C"
void amoebaGpuBuildOutputBuffers( amoebaGpuContext gpu ); void amoebaGpuBuildOutputBuffers( amoebaGpuContext gpu, int hasKirkwood );
extern "C" extern "C"
int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext gpu ); int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext gpu );
...@@ -327,7 +325,7 @@ extern "C" ...@@ -327,7 +325,7 @@ extern "C"
void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ); void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);
extern "C" extern "C"
void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu, const std::vector< std::vector<int> >& exclusions ); void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu );
extern "C" extern "C"
void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu, void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
......
...@@ -101,6 +101,30 @@ __device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* for ...@@ -101,6 +101,30 @@ __device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* for
} }
__device__ static void add3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
{
float4 of;
of = outputForce[offset];
of.x += forceSum[0];
of.y += forceSum[1];
of.z += forceSum[2];
outputForce[offset] = of;
}
__device__ static void load3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
{
float4 of;
of.x = forceSum[0];
of.y = forceSum[1];
of.z = forceSum[2];
of.w = 0.0f;
outputForce[offset] = of;
}
__device__ static void load3dArray( unsigned int offset, float* forceSum, float* outputForce ) __device__ static void load3dArray( unsigned int offset, float* forceSum, float* outputForce )
{ {
...@@ -110,6 +134,15 @@ __device__ static void load3dArray( unsigned int offset, float* forceSum, float* ...@@ -110,6 +134,15 @@ __device__ static void load3dArray( unsigned int offset, float* forceSum, float*
} }
__device__ static void add3dArray( unsigned int offset, float* forceSum, float* outputForce )
{
outputForce[offset] += forceSum[0];
outputForce[offset+1] += forceSum[1];
outputForce[offset+2] += forceSum[2];
}
__device__ static void scale3dArray( float scaleFactor, float* force ) __device__ static void scale3dArray( float scaleFactor, float* force )
{ {
......
...@@ -698,22 +698,17 @@ __device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsig ...@@ -698,22 +698,17 @@ __device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsig
#define METHOD_NAME(a, b) a##N2ByWarp##b #define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaElectrostatic.h" #include "kCalculateAmoebaCudaElectrostatic.h"
// reduce psWorkArray_3_1 -> force // reduce psWorkArray_3_1 -> torque
// reduce psWorkArray_3_2 -> torque
static void kReduceForceTorque(amoebaGpuContext amoebaGpu ) static void kReduceTorque(amoebaGpuContext amoebaGpu )
{ {
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( gpuContext gpu = amoebaGpu->gpuContext;
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData ); gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
LAUNCHERROR("kReduceElectrostaticForce"); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceElectrostaticTorque"); LAUNCHERROR("kReduceElectrostaticTorque");
} }
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Compute Amoeba electrostatic force & torque Compute Amoeba electrostatic force & torque
...@@ -728,7 +723,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -728,7 +723,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaElectrostatic"; static const char* methodName = "cudaComputeAmoebaElectrostatic";
...@@ -752,7 +746,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -752,7 +746,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz ); methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
} }
static const int maxSlots =20; static const int maxSlots =20;
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = gpu->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray"); CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms); memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload(); debugArray->Upload();
...@@ -761,6 +755,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -761,6 +755,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block // on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
if (gpu->sm_version >= SM_20) if (gpu->sm_version >= SM_20)
...@@ -772,70 +767,59 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -772,70 +767,59 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
} }
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 1 );
LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic"); LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic");
if (gpu->bOutputBufferPerWarp){
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n", (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log ); sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log );
} }
#endif #endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData, gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData, amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData, amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom ); debugArray->_pDevData, targetAtom );
#else #else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif #endif
} else { } else {
#ifdef AMOEBA_DEBUG kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u xnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData, gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData, amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData, amoebaGpu->psLabFrameQuadrupole->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipole->_pDevData,
amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom ); debugArray->_pDevData, targetAtom );
#else #else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif #endif
} }
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
kReduceForceTorque( amoebaGpu ); kReduceTorque( amoebaGpu );
LAUNCHERROR("kReduceForceTorque"); LAUNCHERROR("kReduceForceTorque");
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
amoebaGpu->psForce->Download();
amoebaGpu->psTorque->Download(); amoebaGpu->psTorque->Download();
debugArray->Download(); debugArray->Download();
...@@ -847,13 +831,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -847,13 +831,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
int indexOffset = ii*3; int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"ElectrostaticF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysData[indexOffset],
amoebaGpu->psForce->_pSysData[indexOffset+1],
amoebaGpu->psForce->_pSysData[indexOffset+2] );
// torque // torque
(void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ", (void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
...@@ -925,10 +902,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -925,10 +902,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
int offset = 3*ii; int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii, (void) fprintf( amoebaGpu->log,"%6d T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psForce->_pSysData[offset],
amoebaGpu->psForce->_pSysData[offset+1],
amoebaGpu->psForce->_pSysData[offset+2],
amoebaGpu->psTorque->_pSysData[offset], amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1], amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] ); amoebaGpu->psTorque->_pSysData[offset+2] );
...@@ -941,9 +915,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -941,9 +915,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f ); cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaTorque", fileId, outputVector );
} }
} }
...@@ -956,7 +929,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -956,7 +929,7 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f ); //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f/4.184 ); //cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, NULL, 1.0f/4.184 );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
} }
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h" #include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG #define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
...@@ -30,7 +30,6 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu) ...@@ -30,7 +30,6 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation)); status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); RTERROR(status, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
} }
// reduce psWorkArray_3_1 -> E_Field // reduce psWorkArray_3_1 -> E_Field
// reduce psWorkArray_3_2 -> E_FieldPolar // reduce psWorkArray_3_2 -> E_FieldPolar
// reduce psWorkArray_3_3 -> Gk_FieldPolar // reduce psWorkArray_3_3 -> Gk_FieldPolar
...@@ -38,18 +37,19 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu) ...@@ -38,18 +37,19 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu ) static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
{ {
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( gpuContext gpu = amoebaGpu->gpuContext;
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields1"); LAUNCHERROR("kReduceEAndGK_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields2"); LAUNCHERROR("kReduceEAndGK_Fields2");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData ); amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData );
LAUNCHERROR("kReduceEAndGK_Fields3"); LAUNCHERROR("kReduceEAndGK_Fields3");
} }
...@@ -330,8 +330,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -330,8 +330,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -341,14 +339,12 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -341,14 +339,12 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
// N2 debug array // N2 debug array
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray"); int maxSlots = 10;
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms); CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload(); debugArray->Upload();
(*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
gpu->psInteractionCount->Upload();
// print intermediate results for the targetAtom // print intermediate results for the targetAtom
unsigned int targetAtom = 0; unsigned int targetAtom = 0;
...@@ -356,6 +352,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -356,6 +352,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// on first pass, set threads/block // on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
if (gpu->sm_version >= SM_20) if (gpu->sm_version >= SM_20)
...@@ -372,21 +369,16 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -372,21 +369,16 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName, (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#endif #endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
gpu->psBornRadii->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom ); debugArray->_pDevData, targetAtom );
...@@ -396,14 +388,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -396,14 +388,9 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
} else { } else {
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData,
amoebaGpu->psLabFrameQuadrupole->_pDevData,
gpu->psBornRadii->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom ); debugArray->_pDevData, targetAtom );
...@@ -413,27 +400,13 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -413,27 +400,13 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
} }
LAUNCHERROR("kCalculateAmoebaFixedEAndGkFieldN2_kernel"); LAUNCHERROR("kCalculateAmoebaFixedEAndGkFieldN2_kernel");
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysData[kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReduceEAndGkFields( amoebaGpu ); kReduceEAndGkFields( amoebaGpu );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
gpu->psInteractionCount->Download(); gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -481,15 +454,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -481,15 +454,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
} }
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printEFieldAtomBuffers( amoebaGpu, 100 );
//printEFieldBuffer( amoebaGpu, 0 );
//printEFieldBuffer( amoebaGpu, 1 );
//printEFieldBuffer( amoebaGpu, 37 );
//printEFieldBuffer( amoebaGpu, 38 );
(void) fprintf( amoebaGpu->log, "EFields End\n" ); (void) fprintf( amoebaGpu->log, "EFields End\n" );
(void) fprintf( amoebaGpu->log, "DebugQ\n" ); (void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download(); debugArray->Download();
...@@ -497,7 +461,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -497,7 +461,6 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
int ii = targetAtom; int ii = targetAtom;
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
unsigned int count = 0;
for( int jj = 0; jj < gpu->natoms; jj++ ){ for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj; int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ", (void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ",
...@@ -521,10 +484,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -521,10 +484,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, NULL, 1.0f ); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f);
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, outputVector, NULL, 1.0f); cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, outputVector, NULL, 1.0f);
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector ); cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector );
} }
...@@ -532,5 +495,4 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -532,5 +495,4 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
} }
#endif #endif
//exit(0);
} }
...@@ -36,13 +36,14 @@ void GetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext amoebaGpu) ...@@ -36,13 +36,14 @@ void GetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext amoebaGpu)
static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu ) static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
{ {
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( gpuContext gpu = amoebaGpu->gpuContext;
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
LAUNCHERROR("kReduceE_Fields1"); LAUNCHERROR("kReduceE_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
LAUNCHERROR("kReduceE_Fields2"); LAUNCHERROR("kReduceE_Fields2");
} }
...@@ -73,12 +74,6 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu ) ...@@ -73,12 +74,6 @@ static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
{ {
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -104,6 +99,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -104,6 +99,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
if (gpu->sm_version >= SM_20) if (gpu->sm_version >= SM_20)
...@@ -118,15 +114,14 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -118,15 +114,14 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n", methodName, (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n", methodName,
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#endif #endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -137,7 +132,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -137,7 +132,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
#endif #endif
} else { } else {
kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -151,7 +146,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -151,7 +146,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel"); LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
#if 0 #if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){ for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
//float index = 1.0f; //float index = 1.0f;
float index = (float) ii; float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){ for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
...@@ -170,7 +165,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -170,7 +165,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
if( amoebaGpu->log ){ if( amoebaGpu->log ){
gpu->psInteractionCount->Download(); gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n", (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
amoebaGpu->psWorkArray_3_1->Download(); amoebaGpu->psWorkArray_3_1->Download();
...@@ -205,14 +200,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -205,14 +200,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printEFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printEFieldAtomBuffers( amoebaGpu, 100 );
//printEFieldBuffer( amoebaGpu, 0 );
//printEFieldBuffer( amoebaGpu, 1 );
//printEFieldBuffer( amoebaGpu, 37 );
//printEFieldBuffer( amoebaGpu, 38 );
(void) fprintf( amoebaGpu->log, "EFields End\n" ); (void) fprintf( amoebaGpu->log, "EFields End\n" );
(void) fprintf( amoebaGpu->log, "DebugQ\n" ); (void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download(); debugArray->Download();
......
...@@ -129,7 +129,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)( ...@@ -129,7 +129,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
else // bExclusion else // bExclusion
{ {
unsigned int xi = x >> GRIDBITS; unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
...@@ -182,38 +182,38 @@ if( 0 && atomI == targetAtom ){ ...@@ -182,38 +182,38 @@ if( 0 && atomI == targetAtom ){
/* /*
pullBackIndex += 2; pullBackIndex += 2;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++]; debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++]; debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++]; debugArray[index].z = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++]; debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++]; debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++]; debugArray[index].z = pullBack[pullBackIndex++];
*/ */
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0]; debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1]; debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2]; debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0]; debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1]; debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2]; debugArray[index].z = ijField[indexJ][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : dScaleVal*ijField[indexI][0]; debugArray[index].x = match ? 0.0f : dScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : dScaleVal*ijField[indexI][1]; debugArray[index].y = match ? 0.0f : dScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : dScaleVal*ijField[indexI][2]; debugArray[index].z = match ? 0.0f : dScaleVal*ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : pScaleVal*ijField[indexI][0]; debugArray[index].x = match ? 0.0f : pScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : pScaleVal*ijField[indexI][1]; debugArray[index].y = match ? 0.0f : pScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : pScaleVal*ijField[indexI][2]; debugArray[index].z = match ? 0.0f : pScaleVal*ijField[indexI][2];
/* /*
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j; unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0; unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0; pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
...@@ -222,12 +222,12 @@ if( 0 && atomI == targetAtom ){ ...@@ -222,12 +222,12 @@ if( 0 && atomI == targetAtom ){
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f; debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f; debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f; debugArray[index].w = pScaleVal + 10.0f;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x; debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y; debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z; debugArray[index].z = jCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x; debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y; debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z; debugArray[index].z = iCoord.z;
...@@ -241,11 +241,11 @@ if( 0 && atomI == targetAtom ){ ...@@ -241,11 +241,11 @@ if( 0 && atomI == targetAtom ){
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField ); load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar ); load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
#else #else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField ); load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar ); load3dArray( offset, fieldPolarSum, outputEFieldPolar );
#endif #endif
...@@ -314,48 +314,48 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){ ...@@ -314,48 +314,48 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
debugArray[index].z = pullBack[pullBackIndex++]; debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++]; debugArray[index].w = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++]; debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++]; debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++]; debugArray[index].z = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++]; debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++]; debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++]; debugArray[index].z = pullBack[pullBackIndex++];
*/ */
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0]; debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1]; debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2]; debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0]; debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1]; debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2]; debugArray[index].z = ijField[indexJ][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0]; debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1]; debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2]; debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0]; debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1]; debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2]; debugArray[index].z = ijField[indexI][2];
#if 0 #if 0
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x; debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y; debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z; debugArray[index].z = jCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x; debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y; debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z; debugArray[index].z = iCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j; unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0; unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0; pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
...@@ -378,7 +378,7 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){ ...@@ -378,7 +378,7 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int xi = x >> GRIDBITS; unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS; unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2; unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
...@@ -435,38 +435,38 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){ ...@@ -435,38 +435,38 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
pullBackIndex += 2; pullBackIndex += 2;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++]; debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++]; debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++]; debugArray[index].z = pullBack[pullBackIndex++];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++]; debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++]; debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++]; debugArray[index].z = pullBack[pullBackIndex++];
*/ */
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0]; debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1]; debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2]; debugArray[index].z = ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0]; debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1]; debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2]; debugArray[index].z = ijField[indexJ][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dScaleVal*ijField[indexI][0]; debugArray[index].x = dScaleVal*ijField[indexI][0];
debugArray[index].y = dScaleVal*ijField[indexI][1]; debugArray[index].y = dScaleVal*ijField[indexI][1];
debugArray[index].z = dScaleVal*ijField[indexI][2]; debugArray[index].z = dScaleVal*ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pScaleVal*ijField[indexI][0]; debugArray[index].x = pScaleVal*ijField[indexI][0];
debugArray[index].y = pScaleVal*ijField[indexI][1]; debugArray[index].y = pScaleVal*ijField[indexI][1];
debugArray[index].z = pScaleVal*ijField[indexI][2]; debugArray[index].z = pScaleVal*ijField[indexI][2];
/* /*
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j; unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0; unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0; pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
...@@ -476,12 +476,12 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){ ...@@ -476,12 +476,12 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f; debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f; debugArray[index].w = pScaleVal + 10.0f;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x; debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y; debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z; debugArray[index].z = jCoord.z;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x; debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y; debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z; debugArray[index].z = iCoord.z;
...@@ -500,20 +500,20 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){ ...@@ -500,20 +500,20 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField ); load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar ); load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField ); load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar ); load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#else #else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField ); load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar ); load3dArray( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].eField, outputEField ); load3dArray( offset, sA[threadIdx.x].eField, outputEField );
load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar ); load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
......
...@@ -133,7 +133,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)( ...@@ -133,7 +133,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
#endif #endif
); );
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
// torques include i == j contribution // torques include i == j contribution
...@@ -172,29 +172,29 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -172,29 +172,29 @@ if( atomI == targetAtom || atomJ == targetAtom ){
index = debugAccumulate( index, debugArray, force, mask, 1.0f ); index = debugAccumulate( index, debugArray, force, mask, 1.0f );
mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
index = debugAccumulate( index, debugArray, torque[indexI], mask, 2.0f ); index = debugAccumulate( index, debugArray, torque[indexI], mask, 2.0f );
index = debugAccumulate( index, debugArray, torque[indexJ], mask, 3.0f ); index = debugAccumulate( index, debugArray, torque[indexJ], mask, 3.0f );
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[0].x; debugArray[index].x = pullBack[0].x;
debugArray[index].y = pullBack[0].y; debugArray[index].y = pullBack[0].y;
debugArray[index].z = pullBack[0].z; debugArray[index].z = pullBack[0].z;
debugArray[index].w = pullBack[0].w; debugArray[index].w = pullBack[0].w;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[1].x; debugArray[index].x = pullBack[1].x;
debugArray[index].y = pullBack[1].y; debugArray[index].y = pullBack[1].y;
debugArray[index].z = pullBack[1].z; debugArray[index].z = pullBack[1].z;
debugArray[index].w = pullBack[1].w; debugArray[index].w = pullBack[1].w;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[2].x; debugArray[index].x = pullBack[2].x;
debugArray[index].y = pullBack[2].y; debugArray[index].y = pullBack[2].y;
debugArray[index].z = pullBack[2].z; debugArray[index].z = pullBack[2].z;
debugArray[index].w = pullBack[2].w; debugArray[index].w = pullBack[2].w;
/* /*
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dBorn[0]; debugArray[index].x = dBorn[0];
debugArray[index].y = dBornPolar[0]; debugArray[index].y = dBornPolar[0];
debugArray[index].z = dBorn[1]; debugArray[index].z = dBorn[1];
...@@ -209,7 +209,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -209,7 +209,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
float of; float of;
unsigned int offset = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
of = cAmoebaSim.pWorkArray_1_1[offset]; of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum; of += dBornSum;
...@@ -219,23 +219,17 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -219,23 +219,17 @@ if( atomI == targetAtom || atomJ == targetAtom ){
of += dBornPolarSum; of += dBornPolarSum;
cAmoebaSim.pWorkArray_1_2[offset] = of; cAmoebaSim.pWorkArray_1_2[offset] = of;
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
offset *= 3; load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
#else #else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
cAmoebaSim.pWorkArray_1_1[offset] = dBornSum; cAmoebaSim.pWorkArray_1_1[offset] = dBornSum;
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum; cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
offset *= 3; add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4);
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
#endif #endif
} }
...@@ -278,7 +272,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -278,7 +272,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#endif #endif
); );
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;
// add force and torque to atom I due atom J // add force and torque to atom I due atom J
...@@ -325,25 +319,25 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -325,25 +319,25 @@ if( atomI == targetAtom || atomJ == targetAtom ){
index = debugAccumulate( index, debugArray, torque[indexI], mask, -2.0f ); index = debugAccumulate( index, debugArray, torque[indexI], mask, -2.0f );
index = debugAccumulate( index, debugArray, torque[indexJ], mask, -3.0f ); index = debugAccumulate( index, debugArray, torque[indexJ], mask, -3.0f );
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[0].x; debugArray[index].x = pullBack[0].x;
debugArray[index].y = pullBack[0].y; debugArray[index].y = pullBack[0].y;
debugArray[index].z = pullBack[0].z; debugArray[index].z = pullBack[0].z;
debugArray[index].w = -1.0f; debugArray[index].w = -1.0f;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[1].x; debugArray[index].x = pullBack[1].x;
debugArray[index].y = pullBack[1].y; debugArray[index].y = pullBack[1].y;
debugArray[index].z = pullBack[1].z; debugArray[index].z = pullBack[1].z;
debugArray[index].w = pullBack[1].w; debugArray[index].w = pullBack[1].w;
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[2].x; debugArray[index].x = pullBack[2].x;
debugArray[index].y = pullBack[2].y; debugArray[index].y = pullBack[2].y;
debugArray[index].z = pullBack[2].z; debugArray[index].z = pullBack[2].z;
debugArray[index].w = pullBack[2].w; debugArray[index].w = pullBack[2].w;
/* /*
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dBorn[0]; debugArray[index].x = dBorn[0];
debugArray[index].y = dBornPolar[0]; debugArray[index].y = dBornPolar[0];
debugArray[index].z = dBorn[1]; debugArray[index].z = dBorn[1];
...@@ -355,7 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -355,7 +349,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
//#ifdef AMOEBA_DEBUG //#ifdef AMOEBA_DEBUG
#if 0 #if 0
if( mask || !mask ){ if( mask || !mask ){
unsigned int index = atomJ + atomI*cAmoebaSim.paddedNumberOfAtoms; unsigned int index = atomJ + atomI*cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ; debugArray[index].y = (float) atomJ;
debugArray[index].z = energy; debugArray[index].z = energy;
...@@ -372,7 +366,7 @@ if( mask || !mask ){ ...@@ -372,7 +366,7 @@ if( mask || !mask ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
float of; float of;
unsigned int offset = x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
of = cAmoebaSim.pWorkArray_1_1[offset]; of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum; of += dBornSum;
...@@ -382,12 +376,10 @@ if( mask || !mask ){ ...@@ -382,12 +376,10 @@ if( mask || !mask ){
of += dBornPolarSum; of += dBornPolarSum;
cAmoebaSim.pWorkArray_1_2[offset] = of; cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3; add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
offset = y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms; offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
of = cAmoebaSim.pWorkArray_1_1[offset]; of = cAmoebaSim.pWorkArray_1_1[offset];
of += dBornSum; of += dBornSum;
...@@ -397,31 +389,24 @@ if( mask || !mask ){ ...@@ -397,31 +389,24 @@ if( mask || !mask ){
of += dBornPolarSum; of += dBornPolarSum;
cAmoebaSim.pWorkArray_1_2[offset] = of; cAmoebaSim.pWorkArray_1_2[offset] = of;
offset *= 3; add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, cAmoebaSim.pWorkArray_3_1 );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
#else #else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms; unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
cAmoebaSim.pWorkArray_1_1[offset] = dBornSum; cAmoebaSim.pWorkArray_1_1[offset] = dBornSum;
cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum; cAmoebaSim.pWorkArray_1_2[offset] = dBornPolarSum;
offset *= 3; add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, localParticle.force, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, localParticle.torque, cAmoebaSim.pWorkArray_3_2 );
offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
offset = y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms;
cAmoebaSim.pWorkArray_1_1[offset] = sA[threadIdx.x].dBornRadius; cAmoebaSim.pWorkArray_1_1[offset] = sA[threadIdx.x].dBornRadius;
cAmoebaSim.pWorkArray_1_2[offset] = sA[threadIdx.x].dBornRadiusPolar; cAmoebaSim.pWorkArray_1_2[offset] = sA[threadIdx.x].dBornRadiusPolar;
offset *= 3; add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, sA[threadIdx.x].force, cAmoebaSim.pWorkArray_3_1 );
load3dArray( offset, sA[threadIdx.x].torque, cAmoebaSim.pWorkArray_3_2 );
#endif #endif
lasty = y; lasty = y;
......
...@@ -894,7 +894,7 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato ...@@ -894,7 +894,7 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
__device__ static int debugAccumulate( unsigned int index, float4* debugArray, float* field, unsigned int addMask, float idLabel ) __device__ static int debugAccumulate( unsigned int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
{ {
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = addMask ? field[0] : 0.0f; debugArray[index].x = addMask ? field[0] : 0.0f;
debugArray[index].y = addMask ? field[1] : 0.0f; debugArray[index].y = addMask ? field[1] : 0.0f;
debugArray[index].z = addMask ? field[2] : 0.0f; debugArray[index].z = addMask ? field[2] : 0.0f;
...@@ -928,79 +928,18 @@ __device__ void zeroKirkwoodEDiffParticleSharedField( struct KirkwoodEDiffPartic ...@@ -928,79 +928,18 @@ __device__ void zeroKirkwoodEDiffParticleSharedField( struct KirkwoodEDiffPartic
#define METHOD_NAME(a, b) a##N2ByWarp##b #define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaKirkwoodEDiff.h" #include "kCalculateAmoebaCudaKirkwoodEDiff.h"
// reduce psWorkArray_3_1 -> force // reduce psWorkArray_3_1 -> torque
// reduce psWorkArray_3_2 -> torque
static void kReduceForceTorque( amoebaGpuContext amoebaGpu ) static void kReduceTorque( amoebaGpuContext amoebaGpu )
{ {
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( gpuContext gpu = amoebaGpu->gpuContext;
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psKirkwoodEDiffForce->_pDevData ); gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff1"); amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff2");
}
#ifdef AMOEBA_DEBUG
//#if 1
static void printKirkwoodEDiffBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "KirkwoodEDiff Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
/*
start = 0;
stop = -146016;
float maxV = -1.0e+99;
for( unsigned int ii = start; ii < stop; ii += 3 ){
if( amoebaGpu->psWorkArray_3_1->_pSysData[ii] > maxV ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii] );
maxV = amoebaGpu->psWorkArray_3_1->_pSysData[ii];
}
}
*/
}
static void printKirkwoodEDiffAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom ) LAUNCHERROR("kReduceForceTorqueKirkwoodEDiff");
{
(void) fprintf( amoebaGpu->log, "KirkwoodEDiff atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
} }
#endif
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -1016,7 +955,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1016,7 +955,6 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
static int timestep = 0; static int timestep = 0;
timestep++; timestep++;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -1050,6 +988,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1050,6 +988,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 6 ); kClearFields_3( amoebaGpu, 6 );
LAUNCHERROR("kClearFields_3_kCalculateAmoebaCudaKirkwoodEDiff"); LAUNCHERROR("kClearFields_3_kCalculateAmoebaCudaKirkwoodEDiff");
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){ if( threadsPerBlock == 0 ){
unsigned int maxThreads; unsigned int maxThreads;
if (gpu->sm_version >= SM_20) if (gpu->sm_version >= SM_20)
...@@ -1065,7 +1004,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1065,7 +1004,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
if( amoebaGpu->log && timestep == 1 ){ if( amoebaGpu->log && timestep == 1 ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu" (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
" ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n", " ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock, sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -1074,7 +1013,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1074,7 +1013,7 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData, gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData, amoebaGpu->psLabFrameDipole->_pDevData,
...@@ -1083,17 +1022,16 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1083,17 +1022,16 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData, amoebaGpu->psInducedDipoleS->_pDevData,
amoebaGpu->psInducedDipolePolarS->_pDevData, amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom ); debugArray->_pDevData, targetAtom );
#else #else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif #endif
} else { } else {
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData, gpu->psPosq4->_pDevData,
amoebaGpu->psLabFrameDipole->_pDevData, amoebaGpu->psLabFrameDipole->_pDevData,
...@@ -1102,160 +1040,23 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -1102,160 +1040,23 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData, amoebaGpu->psInducedDipoleS->_pDevData,
amoebaGpu->psInducedDipolePolarS->_pDevData, amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom ); debugArray->_pDevData, targetAtom );
#else #else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif #endif
} }
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");
kReduceForceTorque( amoebaGpu ); kReduceTorque( amoebaGpu );
LAUNCHERROR("kReduceForceTorque_kCalculateAmoebaCudaKirkwoodEDiff"); LAUNCHERROR("kReduceForceTorque_kCalculateAmoebaCudaKirkwoodEDiff");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
amoebaGpu->psKirkwoodEDiffForce->Download();
amoebaGpu->psTorque->Download();
debugArray->Download();
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
// torque
(void) fprintf( amoebaGpu->log,"T [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysData[indexOffset],
amoebaGpu->psTorque->_pSysData[indexOffset+1],
amoebaGpu->psTorque->_pSysData[indexOffset+2] );
if( ii == targetAtom ){
(void) fprintf( amoebaGpu->log,"\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d ediff F%T\n", ii, jj );
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
(void) fprintf( amoebaGpu->log,"\n" );
}
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
{
(void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
int maxPrint = 12;
for( int ii = 0; ii < gpu->natoms; ii++ ){
// print cpu & gpu reductions
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[offset+2],
amoebaGpu->psTorque->_pSysData[offset],
amoebaGpu->psTorque->_pSysData[offset+1],
amoebaGpu->psTorque->_pSysData[offset+2] );
if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, NULL, 1.0f);
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
}
}
delete debugArray;
#endif
// map torques to forces // map torques to forces
cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce, amoebaGpu->gpuContext->psForce4 ); cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
cudaComputeAmoebaMapTorques( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce );
amoebaGpu->psKirkwoodEDiffForce->Download();
(void) fprintf( amoebaGpu->log, "Mapped KirkwoodEDiff torques to forces.\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"KirkwoodEDiffF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+1],
amoebaGpu->psKirkwoodEDiffForce->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector );
}
}
#endif
if( 0 ){
cudaComputeAmoebaMapTorques( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psKirkwoodEDiffForce );
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psKirkwoodEDiffForce, outputVector, NULL, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaKirkwoodEDiffForce", fileId, outputVector );
}
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
} }
...@@ -43,7 +43,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -43,7 +43,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
float* inducedDipolePolar, float* inducedDipolePolar,
float* inducedDipoleS, float* inducedDipoleS,
float* inducedDipolePolarS, float* inducedDipolePolarS,
float* outputForce,
float* outputTorque float* outputTorque
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom , float4* debugArray, unsigned int targetAtom
...@@ -138,7 +137,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -138,7 +137,7 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
#endif #endif
); );
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
// torques include i == j contribution // torques include i == j contribution
...@@ -166,12 +165,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -166,12 +165,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray[index].x = (float) atomI; debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ; debugArray[index].y = (float) atomJ;
mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
debugArray[index].z = mask ? tinker_f*energy : 0.0f; debugArray[index].z = mask ? tinker_f*energy : 0.0f;
index = debugAccumulate( index, debugArray, force, mask, 1.0f ); index = debugAccumulate( index, debugArray, force, mask, 1.0f );
mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f ); index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f );
index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f ); index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f );
} }
...@@ -181,7 +180,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -181,7 +180,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
else // bExclusion else // bExclusion
{ {
unsigned int xi = x >> GRIDBITS; unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
...@@ -203,7 +202,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -203,7 +202,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#endif #endif
); );
unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI == atomJ) || (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
// torques include i == j contribution // torques include i == j contribution
...@@ -233,7 +232,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -233,7 +232,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
index = debugAccumulate( index, debugArray, force, mask, 1.0f ); index = debugAccumulate( index, debugArray, force, mask, 1.0f );
//mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; //mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f ); index = debugAccumulate( index, debugArray, torqueIPtr, mask, 2.0f );
index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f ); index = debugAccumulate( index, debugArray, torqueJPtr, mask, 3.0f );
} }
...@@ -249,15 +248,15 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -249,15 +248,15 @@ if( atomI == targetAtom || atomJ == targetAtom ){
scale3dArray( tinker_f, localParticle.torque ); scale3dArray( tinker_f, localParticle.torque );
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
load3dArrayBufferPerWarp( offset, localParticle.force, outputForce ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque ); load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
#else #else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
load3dArray( offset, localParticle.force, outputForce ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( offset, localParticle.torque, outputTorque ); load3dArray( 3*offset, localParticle.torque, outputTorque );
#endif #endif
...@@ -304,7 +303,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -304,7 +303,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
); );
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;
// add force and torque to atom I due atom J // add force and torque to atom I due atom J
...@@ -355,7 +354,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -355,7 +354,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int xi = x >> GRIDBITS; unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS; unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2; unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx]; int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
...@@ -378,7 +377,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -378,7 +377,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
); );
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cSim.atoms) || ( atomJ >= cSim.atoms) ) ? 0 : 1;
// add force and torque to atom I due atom J // add force and torque to atom I due atom J
...@@ -435,26 +434,26 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -435,26 +434,26 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = x + tgx + warp*cSim.paddedNumberOfAtoms;
load3dArrayBufferPerWarp( offset, localParticle.force, outputForce ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque ); load3dArrayBufferPerWarp( 3*offset, localParticle.torque, outputTorque );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); offset = y + tgx + warp*cSim.paddedNumberOfAtoms;
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, outputForce ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].torque, outputTorque ); load3dArrayBufferPerWarp( 3*offset, sA[threadIdx.x].torque, outputTorque );
#else #else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms;
load3dArray( offset, localParticle.force, outputForce ); add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( offset, localParticle.torque, outputTorque ); load3dArray( 3*offset, localParticle.torque, outputTorque );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); offset = y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms;
load3dArray( offset, sA[threadIdx.x].force, outputForce ); add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( offset, sA[threadIdx.x].torque, outputTorque ); load3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#endif #endif
lasty = y; lasty = y;
......
...@@ -1626,7 +1626,7 @@ void kCalculateAmoebaLocalForces_kernel() ...@@ -1626,7 +1626,7 @@ void kCalculateAmoebaLocalForces_kernel()
void kCalculateAmoebaLocalForces(amoebaGpuContext gpu) void kCalculateAmoebaLocalForces(amoebaGpuContext gpu)
{ {
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( gpu->log ){ if( gpu->log ){
static int call = 1; static int call = 1;
if( call == 0 ){ if( call == 0 ){
......
...@@ -33,8 +33,8 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp ...@@ -33,8 +33,8 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
} }
//#define AMOEBA_DEBUG #define AMOEBA_DEBUG
#undef AMOEBA_DEBUG //#undef AMOEBA_DEBUG
#define GK #define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h" #include "kCalculateAmoebaCudaMutualInducedParticle.h"
...@@ -216,7 +216,7 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced ...@@ -216,7 +216,7 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
__device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel ) __device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
{ {
index += cAmoebaSim.paddedNumberOfAtoms; index += cSim.paddedNumberOfAtoms;
debugArray[index].x = addMask ? field[0] : 0.0f; debugArray[index].x = addMask ? field[0] : 0.0f;
debugArray[index].y = addMask ? field[1] : 0.0f; debugArray[index].y = addMask ? field[1] : 0.0f;
debugArray[index].z = addMask ? field[2] : 0.0f; debugArray[index].z = addMask ? field[2] : 0.0f;
...@@ -256,7 +256,7 @@ void kInitializeMutualInducedAndGkField_kernel( ...@@ -256,7 +256,7 @@ void kInitializeMutualInducedAndGkField_kernel(
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cAmoebaSim.numberOfAtoms )return; if( threadId >= 3*cSim.atoms )return;
fixedEField[threadId] *= polarizability[threadId]; fixedEField[threadId] *= polarizability[threadId];
inducedDipole[threadId] = fixedEField[threadId]; inducedDipole[threadId] = fixedEField[threadId];
...@@ -292,7 +292,7 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a ...@@ -292,7 +292,7 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
// load deltas // load deltas
while( pos < 3*cAmoebaSim.numberOfAtoms ) while( pos < 3*cSim.atoms )
{ {
delta[threadIdx.x].x += arrayOfDeltas1[pos]; delta[threadIdx.x].x += arrayOfDeltas1[pos];
delta[threadIdx.x].y += arrayOfDeltas2[pos]; delta[threadIdx.x].y += arrayOfDeltas2[pos];
...@@ -324,12 +324,12 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a ...@@ -324,12 +324,12 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
epsilon[0] = epsilon[0] < delta[0].y ? delta[0].y : epsilon[0]; epsilon[0] = epsilon[0] < delta[0].y ? delta[0].y : epsilon[0];
epsilon[0] = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0]; epsilon[0] = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0];
epsilon[0] = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0]; epsilon[0] = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0];
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) cAmoebaSim.numberOfAtoms ) ); epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) cSim.atoms ) );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
epsilon[1] = 48.033324f*sqrtf( delta[0].x/( (float) cAmoebaSim.numberOfAtoms ) ); epsilon[1] = 48.033324f*sqrtf( delta[0].x/( (float) cSim.atoms ) );
epsilon[2] = 48.033324f*sqrtf( delta[0].y/( (float) cAmoebaSim.numberOfAtoms ) ); epsilon[2] = 48.033324f*sqrtf( delta[0].y/( (float) cSim.atoms ) );
epsilon[3] = 48.033324f*sqrtf( delta[0].z/( (float) cAmoebaSim.numberOfAtoms ) ); epsilon[3] = 48.033324f*sqrtf( delta[0].z/( (float) cSim.atoms ) );
epsilon[4] = 48.033324f*sqrtf( delta[0].w/( (float) cAmoebaSim.numberOfAtoms ) ); epsilon[4] = 48.033324f*sqrtf( delta[0].w/( (float) cSim.atoms ) );
#endif #endif
} }
} }
...@@ -356,7 +356,7 @@ void kSorUpdateMutualInducedAndGkField_kernel( ...@@ -356,7 +356,7 @@ void kSorUpdateMutualInducedAndGkField_kernel(
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cAmoebaSim.numberOfAtoms)return; if( threadId >= 3*cSim.atoms)return;
float previousDipole = inducedDipole[threadId]; float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId]; float previousDipoleP = inducedDipoleP[threadId];
...@@ -390,7 +390,7 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel( ...@@ -390,7 +390,7 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cAmoebaSim.numberOfAtoms)return; if( threadId >= 3*cSim.atoms)return;
float previousDipole = inducedDipole[threadId]; float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId]; float previousDipoleP = inducedDipoleP[threadId];
...@@ -415,23 +415,24 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu, ...@@ -415,23 +415,24 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray,
CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS ) CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS )
{ {
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( gpuContext gpu = amoebaGpu->gpuContext;
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields1"); LAUNCHERROR("kReduceMutualInducedAndGkFields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields2"); LAUNCHERROR("kReduceMutualInducedAndGkFields2");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData ); amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields3"); LAUNCHERROR("kReduceMutualInducedAndGkFields3");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>( kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers, gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData ); amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData );
LAUNCHERROR("kReduceMutualInducedAndGkFields4"); LAUNCHERROR("kReduceMutualInducedAndGkFields4");
} }
...@@ -441,12 +442,12 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu, ...@@ -441,12 +442,12 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex ) static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{ {
(void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex ); (void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms; unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms; unsigned int stop = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){ for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3; unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms); unsigned int bufferIndex = ii3Index/(gpu->sim.paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms); unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", (void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex, ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii], amoebaGpu->psWorkArray_3_1->_pSysData[ii],
...@@ -461,8 +462,8 @@ static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferI ...@@ -461,8 +462,8 @@ static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferI
static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom ) static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{ {
(void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom ); (void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){ for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms); unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex, ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex], amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
...@@ -530,8 +531,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -530,8 +531,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
...@@ -539,7 +540,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -539,7 +540,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
#endif #endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
...@@ -552,7 +553,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -552,7 +553,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
#endif #endif
} else { } else {
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
...@@ -709,7 +710,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -709,7 +710,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
int done; int done;
int iteration; int iteration;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3; int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems ); int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads; int numBlocks = numOfElems/numThreads;
...@@ -745,6 +746,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -745,6 +746,8 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName ); fflush( amoebaGpu->log );
amoebaGpu->psE_Field->Download(); amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download(); amoebaGpu->psE_FieldPolar->Download();
amoebaGpu->psInducedDipole->Download(), amoebaGpu->psInducedDipole->Download(),
...@@ -753,7 +756,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -753,7 +756,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
amoebaGpu->psInducedDipolePolarS->Download(); amoebaGpu->psInducedDipolePolarS->Download();
amoebaGpu->psPolarizability->Download(); amoebaGpu->psPolarizability->Download();
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
int offset = 0; int offset = 0;
int maxPrint = 10; int maxPrint = 10;
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->natoms; ii++ ){
...@@ -921,7 +923,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -921,7 +923,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
} }
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( 0 ){ if( 1 ){
std::vector<int> fileId; std::vector<int> fileId;
//fileId.push_back( 0 ); //fileId.push_back( 0 );
VectorOfDoubleVectors outputVector; VectorOfDoubleVectors outputVector;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment