Commit 548efdfb authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Removed amoebaGpu.psWorkUnit stream

Fixed problem w/ AmoebaVdwForce.areParticlesIdentical() that was preventing particles from being reordered
parent cd93e5e0
......@@ -55,6 +55,7 @@ void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, boo
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
if (data.cudaPlatformData.nonbondedMethod != NO_CUTOFF && data.cudaPlatformData.computeForceCount%100 == 0){
//fprintf( stderr, "In CalcAmoebaForcesAndEnergyKernel::beginComputation reordering atoms\n" ); fflush( stderr );
gpuReorderAtoms(gpu);
}
......@@ -72,7 +73,6 @@ void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, boo
}
double CalcAmoebaForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
//fprintf( stderr, "IN CalcAmoebaForcesAndEnergyKernel::finishComputation\n" ); fflush( stderr );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
......@@ -865,14 +865,19 @@ public:
vector<double> dipole1, dipole2, quadrupole1, quadrupole2;
force.getMultipoleParameters(particle1, charge1, dipole1, quadrupole1, axis1, multipole11, multipole21, multipole31, thole1, damping1, polarity1);
force.getMultipoleParameters(particle2, charge2, dipole2, quadrupole2, axis2, multipole12, multipole22, multipole32, thole2, damping2, polarity2);
if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2)
if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2){
return false;
for (int i = 0; i < (int) dipole1.size(); ++i)
if (dipole1[i] != dipole2[i])
}
for (int i = 0; i < (int) dipole1.size(); ++i){
if (dipole1[i] != dipole2[i]){
return false;
for (int i = 0; i < (int) quadrupole1.size(); ++i)
if (quadrupole1[i] != quadrupole2[i])
}
}
for (int i = 0; i < (int) quadrupole1.size(); ++i){
if (quadrupole1[i] != quadrupole2[i]){
return false;
}
}
return true;
}
private:
......@@ -1142,7 +1147,7 @@ public:
double sigma1, sigma2, epsilon1, epsilon2, reduction1, reduction2;
force.getParticleParameters(particle1, iv1, class1, sigma1, epsilon1, reduction1);
force.getParticleParameters(particle2, iv2, class2, sigma2, epsilon2, reduction2);
return (iv1 == iv2 && class1 == class2 && sigma1 == sigma2 && epsilon1 == epsilon2 && reduction1 == reduction2);
return (class1 == class2 && sigma1 == sigma2 && epsilon1 == epsilon2 && reduction1 == reduction2);
}
private:
const AmoebaVdwForce& force;
......
......@@ -45,9 +45,11 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
#include <limits>
#include <cstring>
#include <vector>
#include <stdio.h>
#ifdef WIN32
// #include <windows.h>
#include <windows.h>
#else
#include <sys/time.h>
#endif
#define DUMP_PARAMETERS 0
......@@ -235,7 +237,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, "\n\n" );
totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->psWorkUnit, log );
totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->gpuContext->psWorkUnit, log );
totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psScalingIndicesIndex, log );
totalMemory += gpuPrintCudaStreamInt( amoebaGpu->ps_D_ScaleIndices, log );
totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->ps_P_ScaleIndices, log );
......@@ -2709,7 +2711,6 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
delete gpu->psWorkArray_1_1;
delete gpu->psWorkArray_1_2;
delete gpu->psWorkUnit;
delete gpu->psScalingIndicesIndex;
delete gpu->ps_D_ScaleIndices;
delete gpu->ps_P_ScaleIndices;
......@@ -2852,7 +2853,7 @@ extern "C"
int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext amoebaGpu )
{
if( amoebaGpu->psWorkUnit != NULL ){
if( amoebaGpu->psVdwWorkUnit != NULL ){
return 0;
}
const unsigned int atoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
......@@ -2860,10 +2861,21 @@ int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext amoebaGpu )
const unsigned int dim = (atoms + (grid - 1)) / grid;
const unsigned int cells = dim * (dim + 1) / 2;
CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
CUDAStream<unsigned int>* psWorkUnit;
if( amoebaGpu->gpuContext->psWorkUnit == NULL ){
psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
amoebaGpu->gpuContext->psWorkUnit = psWorkUnit;
} else {
psWorkUnit = amoebaGpu->gpuContext->psWorkUnit;
if( psWorkUnit->_length < cells ){
delete psWorkUnit;
psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
amoebaGpu->gpuContext->psWorkUnit = psWorkUnit;
}
}
unsigned int* pWorkList = psWorkUnit->_pSysData;
amoebaGpu->psWorkUnit = psWorkUnit;
memset( amoebaGpu->psWorkUnit->_pSysData, 0, cells*sizeof( unsigned int) );
memset( psWorkUnit->_pSysData, 0, cells*sizeof( unsigned int) );
CUDAStream<unsigned int>* psVdwWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "VdwWorkUnit");
unsigned int* pVdwWorkList = psVdwWorkUnit->_pSysData;
......@@ -2909,7 +2921,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
const unsigned int grid = amoebaGpu->gpuContext->grid;
const unsigned int dim = paddedAtoms/grid;
const unsigned int cells = dim * (dim + 1) / 2;
unsigned int* pWorkList = amoebaGpu->psWorkUnit->_pSysData;
unsigned int* pWorkList = amoebaGpu->gpuContext->psWorkUnit->_pSysData;
// minCellIndex & maxCellIndex track min/max atom index for each cell
......@@ -2981,7 +2993,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
int xAtomMin = x*grid;
int xAtomMax = xAtomMin + gridOffset;
if( (maxCellIndex[y] >= xAtomMin && minCellIndex[y] <= xAtomMax) || (x == lastBlock || y == lastBlock) ){
pWorkList[ii] = encodeCellExclusion( pWorkList[ii] );
pWorkList[ii] = encodeCellExclusion( pWorkList[ii] );
psScalingIndicesIndex->_pSysData[ii] = numWithScalingIndices*grid;
numWithScalingIndices++;
//(void) fprintf( amoebaGpu->log, "%5d [%6d %6d] [%6d %6d] [%6d %6d] num=%5d last=%5d\n",
......@@ -3446,7 +3458,7 @@ tgx = 0;
amoebaGpu->ps_P_ScaleIndices->Upload();
amoebaGpu->ps_M_ScaleIndices->Upload();
amoebaGpu->psScalingIndicesIndex->Upload();
amoebaGpu->psWorkUnit->Upload();
amoebaGpu->gpuContext->psWorkUnit->Upload();
}
/**---------------------------------------------------------------------------------------
......@@ -4501,3 +4513,40 @@ if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %d %15.7e
outputStream->Upload();
}
/**
* Get time of day (implementation different for Linux/Windows
*
* @return time
*
*/
double getTimeOfDay( void ){
#ifdef WIN32
static double cycles_per_usec = 0;
LARGE_INTEGER counter;
if (cycles_per_usec == 0) {
static LARGE_INTEGER lFreq;
if (!QueryPerformanceFrequency(&lFreq)) {
fprintf(stderr, "Unable to read the performance counter frquency!\n");
return 0;
}
cycles_per_usec = 1000000 / ((double) lFreq.QuadPart);
}
if (!QueryPerformanceCounter(&counter)) {
fprintf(stderr,"Unable to read the performance counter!\n");
return 0;
}
double time = ((((double) counter.QuadPart) * cycles_per_usec));
return time*1.0e-06;
#else
struct timeval tv;
gettimeofday(&tv,NULL);
return static_cast<double>(tv.tv_sec) + 1.0e-06*static_cast<double>(tv.tv_usec);
#endif
}
......@@ -175,7 +175,8 @@ extern void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu);
extern void kCalculateAmoebaPMEInducedDipoleField(amoebaGpuContext amoebaGpu);
extern void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu);
extern void SetCalculateAmoebaCudaUtilitiesSim( amoebaGpuContext amoebaGpu );
double getTimeOfDay( void );
#endif //__AMOEBA_GPU_TYPES_H__
......@@ -90,7 +90,6 @@ struct _amoebaGpuContext {
CUDAStream<float>* psWorkArray_1_1;
CUDAStream<float>* psWorkArray_1_2;
CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<int>* psScalingIndicesIndex;
CUDAStream<int>* ps_D_ScaleIndices;
CUDAStream<int2>* ps_P_ScaleIndices;
......
......@@ -679,10 +679,10 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
} else {
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
}
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
......
......@@ -378,7 +378,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData,
......@@ -390,7 +390,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
} else {
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData,
......
......@@ -119,7 +119,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
......@@ -130,7 +130,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
} else {
kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
......
......@@ -1946,7 +1946,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData
gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
, debugArray->_pDevData, targetAtom );
#else
......@@ -1955,7 +1955,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
} else {
kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData
gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
, debugArray->_pDevData, targetAtom );
#else
......
......@@ -973,12 +973,12 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
} else {
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
}
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");
......
......@@ -505,7 +505,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_3->_pDevData,
......@@ -518,7 +518,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
} else {
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_3->_pDevData,
......
......@@ -289,7 +289,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
......@@ -301,7 +301,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
} else {
kCalculateAmoebaMutualInducedFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
......
......@@ -37,7 +37,7 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
//#define INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#ifdef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
......@@ -141,12 +141,7 @@ __device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float indu
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float uscale, float4 fields[3]
#ifdef AMOEBA_DEBUG
, float4* pullBack
#endif
){
float uscale, float4 fields[3] ){
// compute the real space portion of the Ewald summation
......@@ -254,27 +249,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
fields[2].z = 0.0f;
fields[2].w = 0.0f;
}
/*
#ifdef AMOEBA_DEBUG
pullBack[0].x = xr;
pullBack[0].y = yr;
pullBack[0].z = zr;
pullBack[0].w = r2;
pullBack[1].x = alsq2;
pullBack[1].y = bn0;
pullBack[1].z = bn2;
pullBack[1].w = exp2a;
pullBack[1].x = atomJ.x - atomI.x;
pullBack[1].y = atomJ.y - atomI.y;
pullBack[1].z = atomJ.z - atomI.z;
pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
pullBack[1].x = scale3;
pullBack[1].y = scale5;
pullBack[1].z = scale7;
#endif
*/
}
// Include versions of the kernels for N^2 calculations.
......@@ -453,11 +427,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
(void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int maxSlots = 10;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
#endif
kClearFields_3( amoebaGpu, 2 );
......@@ -477,6 +446,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
......@@ -490,25 +460,14 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
......@@ -546,26 +505,10 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
}
}
/*
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d PmeMIMult\n", jj );
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
*/
(void) fflush( amoebaGpu->log );
iteration++;
}
delete debugArray;
#endif
}
......@@ -647,28 +590,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
while( !done ){
// matrix multiply
cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
/*
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
*/
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[0], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[1], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPrePostPmeDirectMI", fileId, outputVector );
}
#endif
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
......@@ -678,20 +602,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("kSorUpdatePmeMutualInducedField");
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
}
#endif
// get total epsilon -- performing sums on gpu
kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
......@@ -785,12 +695,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
(void) fflush( amoebaGpu->log );
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
(void) fflush( amoebaGpu->log );
}
#endif
// exit if nan
......
......@@ -90,23 +90,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
fieldPolarSum[1] = 0.0f;
fieldPolarSum[2] = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
if (x == y ){
// load shared data
loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
for (unsigned int j = 0; j < GRID; j++)
{
// load coords, charge, ...
float4 delta;
float prefactor2;
for (unsigned int j = 0; j < GRID; j++) {
if( ( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ) ){
float4 delta;
float prefactor2;
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 );
//delta.w = prefactor2 = 0.0f;
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
}
......@@ -123,19 +117,15 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
} else {
if (lasty != y)
{
if( lasty != y ){
unsigned int atomJ = y + tgx;
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
}
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// No interactions in this block.
} else {
if( flags != 0 ){
#ifndef INCLUDE_MI_FIELD_BUFFERS
flags = 0xFFFFFFFF;
......@@ -194,7 +184,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
}
}
tj = (tj + 1) & (GRID - 1);
tj = (tj + 1) & (GRID - 1);
} // end of j-loop
......
......@@ -470,8 +470,15 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
//compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkUnit->_pDevData, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
//compactStream( gpu->compactPlan,
// gpu->sim.pInteractingWorkUnit, unsigned int* dOut
// amoebaGpu->psWorkUnit->_pDevData, const unsigned int* dIn
// gpu->sim.pInteractionFlag, const unsigned int* dValid
// gpu->sim.workUnits, gpu
// gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
......
......@@ -10,6 +10,7 @@
#include "amoebaScaleFactors.h"
#include <stdio.h>
extern int isNanOrInfinity( double number );
using namespace std;
......@@ -594,6 +595,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
gpu->psInteractionFlag->Download();
amoebaGpu->psVdwWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysData[ii];
......@@ -609,6 +611,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions );
}
(void) fflush( amoebaGpu->log );
}
#endif
......
......@@ -398,12 +398,12 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData );
gpu->psWorkUnit->_pDevData );
} else {
kCalculateAmoebaWcaDispersionN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData );
gpu->psWorkUnit->_pDevData );
}
LAUNCHERROR("kCalculateAmoebaWcaDispersion");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment