Commit 548efdfb authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Removed amoebaGpu.psWorkUnit stream

Fixed problem w/ AmoebaVdwForce.areParticlesIdentical() that was preventing particles from being reordered
parent cd93e5e0
...@@ -55,6 +55,7 @@ void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, boo ...@@ -55,6 +55,7 @@ void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, boo
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext; _gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
if (data.cudaPlatformData.nonbondedMethod != NO_CUTOFF && data.cudaPlatformData.computeForceCount%100 == 0){ if (data.cudaPlatformData.nonbondedMethod != NO_CUTOFF && data.cudaPlatformData.computeForceCount%100 == 0){
//fprintf( stderr, "In CalcAmoebaForcesAndEnergyKernel::beginComputation reordering atoms\n" ); fflush( stderr );
gpuReorderAtoms(gpu); gpuReorderAtoms(gpu);
} }
...@@ -72,7 +73,6 @@ void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, boo ...@@ -72,7 +73,6 @@ void CalcAmoebaForcesAndEnergyKernel::beginComputation(ContextImpl& context, boo
} }
double CalcAmoebaForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy) { double CalcAmoebaForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy) {
//fprintf( stderr, "IN CalcAmoebaForcesAndEnergyKernel::finishComputation\n" ); fflush( stderr );
amoebaGpuContext amoebaGpu = data.getAmoebaGpu(); amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
_gpuContext* gpu = data.getAmoebaGpu()->gpuContext; _gpuContext* gpu = data.getAmoebaGpu()->gpuContext;
...@@ -865,14 +865,19 @@ public: ...@@ -865,14 +865,19 @@ public:
vector<double> dipole1, dipole2, quadrupole1, quadrupole2; vector<double> dipole1, dipole2, quadrupole1, quadrupole2;
force.getMultipoleParameters(particle1, charge1, dipole1, quadrupole1, axis1, multipole11, multipole21, multipole31, thole1, damping1, polarity1); force.getMultipoleParameters(particle1, charge1, dipole1, quadrupole1, axis1, multipole11, multipole21, multipole31, thole1, damping1, polarity1);
force.getMultipoleParameters(particle2, charge2, dipole2, quadrupole2, axis2, multipole12, multipole22, multipole32, thole2, damping2, polarity2); force.getMultipoleParameters(particle2, charge2, dipole2, quadrupole2, axis2, multipole12, multipole22, multipole32, thole2, damping2, polarity2);
if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2) if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2){
return false; return false;
for (int i = 0; i < (int) dipole1.size(); ++i) }
if (dipole1[i] != dipole2[i]) for (int i = 0; i < (int) dipole1.size(); ++i){
if (dipole1[i] != dipole2[i]){
return false; return false;
for (int i = 0; i < (int) quadrupole1.size(); ++i) }
if (quadrupole1[i] != quadrupole2[i]) }
for (int i = 0; i < (int) quadrupole1.size(); ++i){
if (quadrupole1[i] != quadrupole2[i]){
return false; return false;
}
}
return true; return true;
} }
private: private:
...@@ -1142,7 +1147,7 @@ public: ...@@ -1142,7 +1147,7 @@ public:
double sigma1, sigma2, epsilon1, epsilon2, reduction1, reduction2; double sigma1, sigma2, epsilon1, epsilon2, reduction1, reduction2;
force.getParticleParameters(particle1, iv1, class1, sigma1, epsilon1, reduction1); force.getParticleParameters(particle1, iv1, class1, sigma1, epsilon1, reduction1);
force.getParticleParameters(particle2, iv2, class2, sigma2, epsilon2, reduction2); force.getParticleParameters(particle2, iv2, class2, sigma2, epsilon2, reduction2);
return (iv1 == iv2 && class1 == class2 && sigma1 == sigma2 && epsilon1 == epsilon2 && reduction1 == reduction2); return (class1 == class2 && sigma1 == sigma2 && epsilon1 == epsilon2 && reduction1 == reduction2);
} }
private: private:
const AmoebaVdwForce& force; const AmoebaVdwForce& force;
......
...@@ -45,9 +45,11 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu); ...@@ -45,9 +45,11 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
#include <limits> #include <limits>
#include <cstring> #include <cstring>
#include <vector> #include <vector>
#include <stdio.h>
#ifdef WIN32 #ifdef WIN32
// #include <windows.h> #include <windows.h>
#else
#include <sys/time.h>
#endif #endif
#define DUMP_PARAMETERS 0 #define DUMP_PARAMETERS 0
...@@ -235,7 +237,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log ) ...@@ -235,7 +237,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, "\n\n" ); (void) fprintf( log, "\n\n" );
totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->psWorkUnit, log ); totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->gpuContext->psWorkUnit, log );
totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psScalingIndicesIndex, log ); totalMemory += gpuPrintCudaStreamInt( amoebaGpu->psScalingIndicesIndex, log );
totalMemory += gpuPrintCudaStreamInt( amoebaGpu->ps_D_ScaleIndices, log ); totalMemory += gpuPrintCudaStreamInt( amoebaGpu->ps_D_ScaleIndices, log );
totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->ps_P_ScaleIndices, log ); totalMemory += gpuPrintCudaStreamInt2( amoebaGpu->ps_P_ScaleIndices, log );
...@@ -2709,7 +2711,6 @@ void amoebaGpuShutDown(amoebaGpuContext gpu) ...@@ -2709,7 +2711,6 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
delete gpu->psWorkArray_1_1; delete gpu->psWorkArray_1_1;
delete gpu->psWorkArray_1_2; delete gpu->psWorkArray_1_2;
delete gpu->psWorkUnit;
delete gpu->psScalingIndicesIndex; delete gpu->psScalingIndicesIndex;
delete gpu->ps_D_ScaleIndices; delete gpu->ps_D_ScaleIndices;
delete gpu->ps_P_ScaleIndices; delete gpu->ps_P_ScaleIndices;
...@@ -2852,7 +2853,7 @@ extern "C" ...@@ -2852,7 +2853,7 @@ extern "C"
int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext amoebaGpu ) int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext amoebaGpu )
{ {
if( amoebaGpu->psWorkUnit != NULL ){ if( amoebaGpu->psVdwWorkUnit != NULL ){
return 0; return 0;
} }
const unsigned int atoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; const unsigned int atoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
...@@ -2860,10 +2861,21 @@ int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext amoebaGpu ) ...@@ -2860,10 +2861,21 @@ int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext amoebaGpu )
const unsigned int dim = (atoms + (grid - 1)) / grid; const unsigned int dim = (atoms + (grid - 1)) / grid;
const unsigned int cells = dim * (dim + 1) / 2; const unsigned int cells = dim * (dim + 1) / 2;
CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit"); CUDAStream<unsigned int>* psWorkUnit;
if( amoebaGpu->gpuContext->psWorkUnit == NULL ){
psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
amoebaGpu->gpuContext->psWorkUnit = psWorkUnit;
} else {
psWorkUnit = amoebaGpu->gpuContext->psWorkUnit;
if( psWorkUnit->_length < cells ){
delete psWorkUnit;
psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
amoebaGpu->gpuContext->psWorkUnit = psWorkUnit;
}
}
unsigned int* pWorkList = psWorkUnit->_pSysData; unsigned int* pWorkList = psWorkUnit->_pSysData;
amoebaGpu->psWorkUnit = psWorkUnit; memset( psWorkUnit->_pSysData, 0, cells*sizeof( unsigned int) );
memset( amoebaGpu->psWorkUnit->_pSysData, 0, cells*sizeof( unsigned int) );
CUDAStream<unsigned int>* psVdwWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "VdwWorkUnit"); CUDAStream<unsigned int>* psVdwWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "VdwWorkUnit");
unsigned int* pVdwWorkList = psVdwWorkUnit->_pSysData; unsigned int* pVdwWorkList = psVdwWorkUnit->_pSysData;
...@@ -2909,7 +2921,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu ) ...@@ -2909,7 +2921,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
const unsigned int grid = amoebaGpu->gpuContext->grid; const unsigned int grid = amoebaGpu->gpuContext->grid;
const unsigned int dim = paddedAtoms/grid; const unsigned int dim = paddedAtoms/grid;
const unsigned int cells = dim * (dim + 1) / 2; const unsigned int cells = dim * (dim + 1) / 2;
unsigned int* pWorkList = amoebaGpu->psWorkUnit->_pSysData; unsigned int* pWorkList = amoebaGpu->gpuContext->psWorkUnit->_pSysData;
// minCellIndex & maxCellIndex track min/max atom index for each cell // minCellIndex & maxCellIndex track min/max atom index for each cell
...@@ -2981,7 +2993,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu ) ...@@ -2981,7 +2993,7 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
int xAtomMin = x*grid; int xAtomMin = x*grid;
int xAtomMax = xAtomMin + gridOffset; int xAtomMax = xAtomMin + gridOffset;
if( (maxCellIndex[y] >= xAtomMin && minCellIndex[y] <= xAtomMax) || (x == lastBlock || y == lastBlock) ){ if( (maxCellIndex[y] >= xAtomMin && minCellIndex[y] <= xAtomMax) || (x == lastBlock || y == lastBlock) ){
pWorkList[ii] = encodeCellExclusion( pWorkList[ii] ); pWorkList[ii] = encodeCellExclusion( pWorkList[ii] );
psScalingIndicesIndex->_pSysData[ii] = numWithScalingIndices*grid; psScalingIndicesIndex->_pSysData[ii] = numWithScalingIndices*grid;
numWithScalingIndices++; numWithScalingIndices++;
//(void) fprintf( amoebaGpu->log, "%5d [%6d %6d] [%6d %6d] [%6d %6d] num=%5d last=%5d\n", //(void) fprintf( amoebaGpu->log, "%5d [%6d %6d] [%6d %6d] [%6d %6d] num=%5d last=%5d\n",
...@@ -3446,7 +3458,7 @@ tgx = 0; ...@@ -3446,7 +3458,7 @@ tgx = 0;
amoebaGpu->ps_P_ScaleIndices->Upload(); amoebaGpu->ps_P_ScaleIndices->Upload();
amoebaGpu->ps_M_ScaleIndices->Upload(); amoebaGpu->ps_M_ScaleIndices->Upload();
amoebaGpu->psScalingIndicesIndex->Upload(); amoebaGpu->psScalingIndicesIndex->Upload();
amoebaGpu->psWorkUnit->Upload(); amoebaGpu->gpuContext->psWorkUnit->Upload();
} }
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -4501,3 +4513,40 @@ if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %d %15.7e ...@@ -4501,3 +4513,40 @@ if( ii == 0 )(void) fprintf( stderr, "reduceAndCopyCUDAStreamFloat:%u %d %15.7e
outputStream->Upload(); outputStream->Upload();
} }
/**
* Get time of day (implementation different for Linux/Windows
*
* @return time
*
*/
double getTimeOfDay( void ){
#ifdef WIN32
static double cycles_per_usec = 0;
LARGE_INTEGER counter;
if (cycles_per_usec == 0) {
static LARGE_INTEGER lFreq;
if (!QueryPerformanceFrequency(&lFreq)) {
fprintf(stderr, "Unable to read the performance counter frquency!\n");
return 0;
}
cycles_per_usec = 1000000 / ((double) lFreq.QuadPart);
}
if (!QueryPerformanceCounter(&counter)) {
fprintf(stderr,"Unable to read the performance counter!\n");
return 0;
}
double time = ((((double) counter.QuadPart) * cycles_per_usec));
return time*1.0e-06;
#else
struct timeval tv;
gettimeofday(&tv,NULL);
return static_cast<double>(tv.tv_sec) + 1.0e-06*static_cast<double>(tv.tv_usec);
#endif
}
...@@ -175,7 +175,8 @@ extern void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu); ...@@ -175,7 +175,8 @@ extern void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu);
extern void kCalculateAmoebaPMEInducedDipoleField(amoebaGpuContext amoebaGpu); extern void kCalculateAmoebaPMEInducedDipoleField(amoebaGpuContext amoebaGpu);
extern void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu); extern void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu);
extern void SetCalculateAmoebaCudaUtilitiesSim( amoebaGpuContext amoebaGpu ); extern void SetCalculateAmoebaCudaUtilitiesSim( amoebaGpuContext amoebaGpu );
double getTimeOfDay( void );
#endif //__AMOEBA_GPU_TYPES_H__ #endif //__AMOEBA_GPU_TYPES_H__
...@@ -90,7 +90,6 @@ struct _amoebaGpuContext { ...@@ -90,7 +90,6 @@ struct _amoebaGpuContext {
CUDAStream<float>* psWorkArray_1_1; CUDAStream<float>* psWorkArray_1_1;
CUDAStream<float>* psWorkArray_1_2; CUDAStream<float>* psWorkArray_1_2;
CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<int>* psScalingIndicesIndex; CUDAStream<int>* psScalingIndicesIndex;
CUDAStream<int>* ps_D_ScaleIndices; CUDAStream<int>* ps_D_ScaleIndices;
CUDAStream<int2>* ps_P_ScaleIndices; CUDAStream<int2>* ps_P_ScaleIndices;
......
...@@ -679,10 +679,10 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo ...@@ -679,10 +679,10 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData ); gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
} else { } else {
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData ); gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
} }
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
......
...@@ -378,7 +378,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -378,7 +378,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psWorkArray_3_3->_pDevData,
...@@ -390,7 +390,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu ) ...@@ -390,7 +390,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
} else { } else {
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psWorkArray_3_3->_pDevData,
......
...@@ -119,7 +119,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -119,7 +119,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
...@@ -130,7 +130,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -130,7 +130,7 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
} else { } else {
kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
......
...@@ -1946,7 +1946,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1946,7 +1946,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray->_pDevData, targetAtom ); , debugArray->_pDevData, targetAtom );
#else #else
...@@ -1955,7 +1955,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu ) ...@@ -1955,7 +1955,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
} else { } else {
kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray->_pDevData, targetAtom ); , debugArray->_pDevData, targetAtom );
#else #else
......
...@@ -973,12 +973,12 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu ) ...@@ -973,12 +973,12 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData ); gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
} else { } else {
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>( kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData ); gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
} }
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces"); LAUNCHERROR("kCalculateAmoebaCudaKirkwoodEDiffN2Forces");
......
...@@ -505,7 +505,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -505,7 +505,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psWorkArray_3_3->_pDevData,
...@@ -518,7 +518,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -518,7 +518,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
} else { } else {
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psWorkArray_3_3->_pDevData,
......
...@@ -289,7 +289,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext ...@@ -289,7 +289,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
...@@ -301,7 +301,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext ...@@ -301,7 +301,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
} else { } else {
kCalculateAmoebaMutualInducedFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaMutualInducedFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData, gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
......
...@@ -37,7 +37,7 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu) ...@@ -37,7 +37,7 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS #undef INCLUDE_MI_FIELD_BUFFERS
//#define INCLUDE_MI_FIELD_BUFFERS #define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h" #include "kCalculateAmoebaCudaMutualInducedParticle.h"
#ifdef INCLUDE_MI_FIELD_BUFFERS #ifdef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){ __device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
...@@ -141,12 +141,7 @@ __device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float indu ...@@ -141,12 +141,7 @@ __device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float indu
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field // file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ, __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float uscale, float4 fields[3] float uscale, float4 fields[3] ){
#ifdef AMOEBA_DEBUG
, float4* pullBack
#endif
){
// compute the real space portion of the Ewald summation // compute the real space portion of the Ewald summation
...@@ -254,27 +249,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce ...@@ -254,27 +249,6 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
fields[2].z = 0.0f; fields[2].z = 0.0f;
fields[2].w = 0.0f; fields[2].w = 0.0f;
} }
/*
#ifdef AMOEBA_DEBUG
pullBack[0].x = xr;
pullBack[0].y = yr;
pullBack[0].z = zr;
pullBack[0].w = r2;
pullBack[1].x = alsq2;
pullBack[1].y = bn0;
pullBack[1].z = bn2;
pullBack[1].w = exp2a;
pullBack[1].x = atomJ.x - atomI.x;
pullBack[1].y = atomJ.y - atomI.y;
pullBack[1].z = atomJ.z - atomI.z;
pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
pullBack[1].x = scale3;
pullBack[1].y = scale5;
pullBack[1].z = scale7;
#endif
*/
} }
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
...@@ -453,11 +427,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -453,11 +427,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
(void) fprintf( amoebaGpu->log, "%s\n", methodName ); (void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int maxSlots = 10;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
#endif #endif
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
...@@ -477,6 +446,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -477,6 +446,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){ if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n", (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock, sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
...@@ -490,25 +460,14 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -490,25 +460,14 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else { } else {
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField"); LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
...@@ -546,26 +505,10 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -546,26 +505,10 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
} }
} }
/*
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d PmeMIMult\n", jj );
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
*/
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
iteration++; iteration++;
} }
delete debugArray;
#endif #endif
} }
...@@ -647,28 +590,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -647,28 +590,9 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
while( !done ){ while( !done ){
// matrix multiply // matrix multiply
cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] ); cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
kCalculateAmoebaPMEInducedDipoleField( amoebaGpu ); kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
/*
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
*/
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[0], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psWorkVector[1], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPrePostPmeDirectMI", fileId, outputVector );
}
#endif
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>( kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
...@@ -678,20 +602,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -678,20 +602,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData ); amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("kSorUpdatePmeMutualInducedField"); LAUNCHERROR("kSorUpdatePmeMutualInducedField");
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
}
#endif
// get total epsilon -- performing sums on gpu // get total epsilon -- performing sums on gpu
kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>( kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
...@@ -785,12 +695,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -785,12 +695,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psCurrentEpsilon->_pSysData[2], done ); amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
(void) fflush( amoebaGpu->log );
}
#endif #endif
// exit if nan // exit if nan
......
...@@ -90,23 +90,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -90,23 +90,17 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
fieldPolarSum[1] = 0.0f; fieldPolarSum[1] = 0.0f;
fieldPolarSum[2] = 0.0f; fieldPolarSum[2] = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y ){
{
// load shared data // load shared data
loadMutualInducedShared( &(sA[threadIdx.x]), atomI ); loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++) {
{
// load coords, charge, ...
float4 delta;
float prefactor2;
if( ( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ) ){ if( ( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ) ){
float4 delta;
float prefactor2;
setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 ); setupMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, &delta, &prefactor2 );
//delta.w = prefactor2 = 0.0f;
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipole, delta, prefactor2, fieldSum ); calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipole, delta, prefactor2, fieldSum );
calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum ); calculateMutualInducedFieldPairIxn_kernel( psA[j].inducedDipolePolar, delta, prefactor2, fieldPolarSum );
} }
...@@ -123,19 +117,15 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -123,19 +117,15 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
load3dArray( offset, fieldSum, outputField ); load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar); load3dArray( offset, fieldPolarSum, outputFieldPolar);
} else { } else {
if (lasty != y) if( lasty != y ){
{
unsigned int atomJ = y + tgx; unsigned int atomJ = y + tgx;
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ ); loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
} }
unsigned int flags = cSim.pInteractionFlag[pos]; unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) { if( flags != 0 ){
// No interactions in this block.
} else {
#ifndef INCLUDE_MI_FIELD_BUFFERS #ifndef INCLUDE_MI_FIELD_BUFFERS
flags = 0xFFFFFFFF; flags = 0xFFFFFFFF;
...@@ -194,7 +184,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -194,7 +184,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
} }
} }
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
} // end of j-loop } // end of j-loop
......
...@@ -470,8 +470,15 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -470,8 +470,15 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
LAUNCHERROR("kFindBlockBoundsPeriodic"); LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>(); kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic"); LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
//compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkUnit->_pDevData, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount); compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
//compactStream( gpu->compactPlan,
// gpu->sim.pInteractingWorkUnit, unsigned int* dOut
// amoebaGpu->psWorkUnit->_pDevData, const unsigned int* dIn
// gpu->sim.pInteractionFlag, const unsigned int* dValid
// gpu->sim.workUnits, gpu
// gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic"); LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "amoebaScaleFactors.h" #include "amoebaScaleFactors.h"
#include <stdio.h> #include <stdio.h>
extern int isNanOrInfinity( double number ); extern int isNanOrInfinity( double number );
using namespace std; using namespace std;
...@@ -594,6 +595,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -594,6 +595,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
gpu->psInteractionFlag->Download(); gpu->psInteractionFlag->Download();
amoebaGpu->psVdwWorkUnit->Download(); amoebaGpu->psVdwWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] ); (void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){ for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysData[ii]; unsigned int x = gpu->psInteractingWorkUnit->_pSysData[ii];
...@@ -609,6 +611,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -609,6 +611,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions ); (void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions );
} }
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
} }
#endif #endif
......
...@@ -398,12 +398,12 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu ) ...@@ -398,12 +398,12 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>( kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData ); gpu->psWorkUnit->_pDevData );
} else { } else {
kCalculateAmoebaWcaDispersionN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>( kCalculateAmoebaWcaDispersionN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData ); gpu->psWorkUnit->_pDevData );
} }
LAUNCHERROR("kCalculateAmoebaWcaDispersion"); LAUNCHERROR("kCalculateAmoebaWcaDispersion");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment