Commit 2b508482 authored by Mark Friedrichs
Browse files

Added copyright

Removed debugging code
parent 36762962
...@@ -36,11 +36,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -36,11 +36,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
float* outputField, float* outputFieldPolar float* outputField, float* outputFieldPolar){
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
extern __shared__ MutualInducedParticle sA[]; extern __shared__ MutualInducedParticle sA[];
...@@ -99,11 +95,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( ...@@ -99,11 +95,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
// load coords, charge, ... // load coords, charge, ...
calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField);
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1; unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
...@@ -117,34 +109,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( ...@@ -117,34 +109,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
fieldPolarSum[1] += mask ? ijField[1][1] : 0.0f; fieldPolarSum[1] += mask ? ijField[1][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[1][2] : 0.0f; fieldPolarSum[2] += mask ? ijField[1][2] : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom ){
unsigned int index = y + j;
unsigned int indexI = 0;
//unsigned int indexJ = 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
//debugArray[index].z = cAmoebaSim.pDampingFactorAndThole[atomI].x;
debugArray[index].z = (float) cSim.atoms;
debugArray[index].w = (float) (mask + 1);
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI+1][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI+1][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI+1][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) 1.0f;
}
#endif
} }
// Write results // Write results
...@@ -161,9 +125,8 @@ if( atomI == targetAtom ){ ...@@ -161,9 +125,8 @@ if( atomI == targetAtom ){
#endif #endif
} } else {
else // 100% utilization
{
// Read fixed atom data into registers and GRF // Read fixed atom data into registers and GRF
if (lasty != y) if (lasty != y)
{ {
...@@ -185,11 +148,7 @@ if( atomI == targetAtom ){ ...@@ -185,11 +148,7 @@ if( atomI == targetAtom ){
// load coords, charge, ... // load coords, charge, ...
calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField);
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1; unsigned int mask = ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1;
...@@ -217,34 +176,6 @@ if( atomI == targetAtom ){ ...@@ -217,34 +176,6 @@ if( atomI == targetAtom ){
psA[tj].fieldPolar[1] += mask ? ijField[3][1] : 0.0f; psA[tj].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[tj].fieldPolar[2] += mask ? ijField[3][2] : 0.0f; psA[tj].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
#ifdef AMOEBA_DEBUG
//#if 0
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
//unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = cAmoebaSim.pDampingFactorAndThole[atomI].x;
debugArray[index].w = (float) (mask+1);
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI+1][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI+1][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI+1][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) -1.0f;
}
#endif
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
......
//----------------------------------------------------------------------------------------- /* -------------------------------------------------------------------------- *
* OpenMM *
//----------------------------------------------------------------------------------------- * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h" #include "amoebaGpuTypes.h"
#include "cudaKernels.h" #include "cudaKernels.h"
......
///----------------------------------------------------------------------------------------- /* -------------------------------------------------------------------------- *
* OpenMM *
//----------------------------------------------------------------------------------------- * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h" #include "amoebaGpuTypes.h"
#include "cudaKernels.h" #include "cudaKernels.h"
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h" #include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
...@@ -213,11 +233,7 @@ __device__ void calculateBn_kernel( float r, float4* bn, float* bn0, float *bn5 ...@@ -213,11 +233,7 @@ __device__ void calculateBn_kernel( float r, float4* bn, float* bn0, float *bn5
#undef SUB_METHOD_NAME #undef SUB_METHOD_NAME
__device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ, __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float* scalingFactors, float4 forceTorqueEnergy[3] const float* scalingFactors, float4 forceTorqueEnergy[3]){
#ifdef AMOEBA_DEBUG
,float4* debugArray
#endif
){
float xr = atomJ.x - atomI.x; float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y; float yr = atomJ.y - atomI.y;
...@@ -909,132 +925,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire ...@@ -909,132 +925,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire
forceTorqueEnergy[2].y = (ttm32 + ttm3i2); forceTorqueEnergy[2].y = (ttm32 + ttm3i2);
forceTorqueEnergy[2].z = (ttm33 + ttm3i3); forceTorqueEnergy[2].z = (ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG
int debugIndex = 0;
float idTracker = 1.0f;
/*
debugArray[debugIndex].x = atomI.labFrameDipole[0];
debugArray[debugIndex].y = atomI.labFrameDipole[1];
debugArray[debugIndex].z = atomI.labFrameDipole[2];
debugArray[debugIndex].w = r2;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomJ.labFrameDipole[0];
debugArray[debugIndex].y = atomJ.labFrameDipole[1];
debugArray[debugIndex].z = atomJ.labFrameDipole[2];
debugArray[debugIndex].w = cSim.alphaEwald;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomI.inducedDipole[0];
debugArray[debugIndex].y = atomI.inducedDipole[1];
debugArray[debugIndex].z = atomI.inducedDipole[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomJ.inducedDipole[0];
debugArray[debugIndex].y = atomJ.inducedDipole[1];
debugArray[debugIndex].z = atomJ.inducedDipole[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomI.inducedDipoleP[0];
debugArray[debugIndex].y = atomI.inducedDipoleP[1];
debugArray[debugIndex].z = atomI.inducedDipoleP[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomJ.inducedDipoleP[0];
debugArray[debugIndex].y = atomJ.inducedDipoleP[1];
debugArray[debugIndex].z = atomJ.inducedDipoleP[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = conversionFactor*ftm21;
debugArray[debugIndex].y = conversionFactor*ftm22;
debugArray[debugIndex].z = conversionFactor*ftm23;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = e;
debugArray[debugIndex].y = ei;
debugArray[debugIndex].z = erl;
debugArray[debugIndex].w = erli;
debugIndex++;
*/
idTracker += 100.0;
debugArray[debugIndex].x = r2;
debugArray[debugIndex].y = cSim.alphaEwald;
debugArray[debugIndex].z = conversionFactor;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ftm21;
debugArray[debugIndex].y = conversionFactor*ftm22;
debugArray[debugIndex].z = conversionFactor*ftm23;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ftm2i1;
debugArray[debugIndex].y = conversionFactor*ftm2i2;
debugArray[debugIndex].z = conversionFactor*ftm2i3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
/*
idTracker += 100.0;
debugArray[debugIndex].x = fridmp1;
debugArray[debugIndex].y = fridmp2;
debugArray[debugIndex].z = fridmp3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = findmp1;
debugArray[debugIndex].y = findmp2;
debugArray[debugIndex].z = findmp3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
*/
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm21;
debugArray[debugIndex].y = conversionFactor*ttm22;
debugArray[debugIndex].z = conversionFactor*ttm23;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm2i1;
debugArray[debugIndex].y = conversionFactor*ttm2i2;
debugArray[debugIndex].z = conversionFactor*ttm2i3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm31;
debugArray[debugIndex].y = conversionFactor*ttm32;
debugArray[debugIndex].z = conversionFactor*ttm33;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm3i1;
debugArray[debugIndex].y = conversionFactor*ttm3i2;
debugArray[debugIndex].z = conversionFactor*ttm3i3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
#endif
} else { } else {
forceTorqueEnergy[0].x = 0.0f; forceTorqueEnergy[0].x = 0.0f;
...@@ -1051,15 +941,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire ...@@ -1051,15 +941,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire
forceTorqueEnergy[0].w = 0.0f; forceTorqueEnergy[0].w = 0.0f;
#ifdef AMOEBA_DEBUG
for( int ii = 0; ii < 12; ii++ ){
debugArray[ii].x = 0.0f;
debugArray[ii].y = 0.0f;
debugArray[ii].z = 0.0f;
debugArray[ii].w = (float) (-ii);
}
#endif
} }
return; return;
...@@ -1351,35 +1232,11 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu ) ...@@ -1351,35 +1232,11 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
{ {
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaPmeDirectElectrostatic";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers // apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int maxOffset = 20;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxOffset*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxOffset*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 49;
#endif
// on first pass, set threads/block // on first pass, set threads/block
static unsigned int threadsPerBlock = 0; static unsigned int threadsPerBlock = 0;
...@@ -1403,37 +1260,16 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu ) ...@@ -1403,37 +1260,16 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 1 ); kClearFields_3( amoebaGpu, 1 );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u maxL1=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, maxL1 );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else { } else {
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces"); LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
......
//----------------------------------------------------------------------------------------- /* -------------------------------------------------------------------------- *
* OpenMM *
//----------------------------------------------------------------------------------------- * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h" #include "cudaKernels.h"
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h" #include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
...@@ -171,12 +191,7 @@ __device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& at ...@@ -171,12 +191,7 @@ __device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& at
} }
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ, __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float dscale, float pscale, float4 fields[3] float dscale, float pscale, float4 fields[3]){
#ifdef AMOEBA_DEBUG
, float4* pullBack
#endif
){
// compute the real space portion of the Ewald summation // compute the real space portion of the Ewald summation
...@@ -329,18 +344,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -329,18 +344,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
fields[2].w = 0.0f; fields[2].w = 0.0f;
} }
#ifdef AMOEBA_DEBUG
pullBack[0].x = xr;
pullBack[0].y = yr;
pullBack[0].z = zr;
pullBack[0].w = r2;
pullBack[1].x = atomJ.x - atomI.x;
pullBack[1].y = atomJ.y - atomI.y;
pullBack[1].z = atomJ.z - atomI.z;
pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
#endif
} }
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
...@@ -361,40 +364,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& ...@@ -361,40 +364,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
#ifdef AMOEBA_DEBUG
/**
 * Report whether a value is not a finite number.
 *
 * @param number  value to classify
 * @return 1 if number is NaN or positive/negative infinity, 0 otherwise
 */
static int isNanOrInfinity( double number ){
    const double positiveInfinity = std::numeric_limits<double>::infinity();

    // NaN is the only value that compares unequal to itself
    if( number != number ){
        return 1;
    }
    if( number == positiveInfinity || number == -positiveInfinity ){
        return 1;
    }
    return 0;
}
/**
 * Sort the first `length` entries of `array` into ascending order using
 * bubble sort, applying every swap to `track` as well so that the two
 * vectors stay paired element-for-element.
 *
 * Only strictly out-of-order neighbors are exchanged, so entries with equal
 * keys keep their relative order.
 *
 * @param array   keys to sort (modified in place)
 * @param track   companion values permuted together with array (modified in place)
 * @param length  number of leading entries to sort; both vectors must hold
 *                at least this many elements
 */
static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int length)
{
    int  last    = length - 1;   // index of the final element of the unsorted prefix
    bool swapped = true;         // stays true until a pass makes no exchange

    // each pass moves the largest remaining key to position `last`;
    // stop as soon as a full pass leaves the prefix untouched
    while( last > 0 && swapped ){
        swapped = false;
        for( int k = 0; k < last; ++k ){
            if( array[k] > array[k+1] ){

                // exchange the neighboring keys
                const int heldKey = array[k];
                array[k]          = array[k+1];
                array[k+1]        = heldKey;

                // mirror the exchange in the companion vector
                const int heldTrack = track[k];
                track[k]            = track[k+1];
                track[k+1]          = heldTrack;

                swapped = true;
            }
        }
        --last;
    }
}
#endif
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Compute fixed electric field using PME Compute fixed electric field using PME
...@@ -409,22 +378,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -409,22 +378,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
static unsigned int threadsPerBlock = 0; static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
static const char* methodName = "computeCudaAmoebaPmeFixedEField";
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int slots = 15;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*slots, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*slots);
debugArray->Upload();
// print intermediate results for the targetAtom
unsigned int targetAtom = 0;
#endif
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block // on first pass, set threads/block
...@@ -444,175 +397,17 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -444,175 +397,17 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else { } else {
kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData ); amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel"); LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
kReducePmeDirectE_Fields( amoebaGpu ); kReducePmeDirectE_Fields( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3), gpu->sharedMemoryPerBlock),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
(void) fflush( amoebaGpu->log );
/*
(void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2] paddedNumberOfAtoms=%d\n", gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers );
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// buffer 1
(void) fprintf( amoebaGpu->log,"WArry1[%16.9e %16.9e %16.9e] ",
amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset],
amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+1],
amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+2] );
// buffer 2
(void) fprintf( amoebaGpu->log,"WArry2[%16.9e %16.9e %16.9e] ",
amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset],
amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+1],
amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
*/
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
(void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
int maxPrint = 3002;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// E_Field
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_Field->_pSysData[indexOffset],
amoebaGpu->psE_Field->_pSysData[indexOffset+1],
amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
// E_Field polar
(void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "EFields End\n" );
(void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download();
std::vector<int> indices;
std::vector<int> track;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
int orderIndex = gpu->psAtomIndex->_pSysData[jj];
indices.push_back( orderIndex );
track.push_back( jj );
}
}
bubbleSort( indices, track, static_cast<int>(track.size()) );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
amoebaGpu->gpuContext->psPosq4->Download();
unsigned int count = 0;
float sum0[3] = { 0.0f, 0.0f, 0.0f };
float sum1[3] = { 0.0f, 0.0f, 0.0f };
int offset0 = 1;
int offset1 = 2;
/*
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
int orderIndex = gpu->psAtomIndex->_pSysData[jj];
count++;
*/
for( unsigned int ii = 0; ii < track.size(); ii++ ){
int jj = track[ii];
int debugIndex = jj;
int orderIndex = indices[ii];
if( orderIndex > 31 && offset0 == 1 ){
offset0 = 2;
offset1 = 2;
}
count++;
sum0[0] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].x;
sum0[1] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].y;
sum0[2] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].z;
sum1[0] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].x;
sum1[1] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].y;
sum1[2] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].z;
(void) fprintf( amoebaGpu->log,"%5d %5d %u PmeFixedEField\n", orderIndex, jj, count );
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"%6d %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %6d %6d cum sumsOp\n",
orderIndex, sum0[0], sum0[1], sum0[2], sum1[0], sum1[1], sum1[2], jj, count );
(void) fprintf( amoebaGpu->log,"\n" );
}
// write results to file
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
}
delete debugArray;
}
#endif
} }
void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
...@@ -621,21 +416,4 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu ) ...@@ -621,21 +416,4 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
kCalculateAmoebaPMEFixedMultipoles( amoebaGpu ); kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu ); cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
kReduceForces( gpu );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceTorqueFixed", fileId, outputVector );
//cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
exit(0);
}
#endif
} }
...@@ -37,16 +37,7 @@ __launch_bounds__(64, 1) ...@@ -37,16 +37,7 @@ __launch_bounds__(64, 1)
void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)( void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
float* outputEField, float* outputEField,
float* outputEFieldPolar float* outputEFieldPolar){
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
int maxPullIndex = 1;
float4 pullBack[12];
#endif
extern __shared__ FixedFieldParticle sA[]; extern __shared__ FixedFieldParticle sA[];
...@@ -118,11 +109,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)( ...@@ -118,11 +109,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
} }
float4 ijField[3]; float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField);
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag // by setting match flag
...@@ -139,66 +126,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)( ...@@ -139,66 +126,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
fieldPolarSum[1] += match ? 0.0f : ijField[1].z; fieldPolarSum[1] += match ? 0.0f : ijField[1].z;
fieldPolarSum[2] += match ? 0.0f : ijField[2].z; fieldPolarSum[2] += match ? 0.0f : ijField[2].z;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || targetAtom == (y+j) ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
float flag = 7.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = flag;
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].z;
debugArray[index].y = match ? 0.0f : ijField[1].z;
debugArray[index].z = match ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].y;
debugArray[index].y = match ? 0.0f : ijField[1].y;
debugArray[index].z = match ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 3.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].w;
debugArray[index].y = match ? 0.0f : ijField[1].w;
debugArray[index].z = match ? 0.0f : ijField[2].w;
debugArray[index].w = flag + 4.0f;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
} }
// Write results // Write results
...@@ -252,11 +179,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){ ...@@ -252,11 +179,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
} }
float4 ijField[3]; float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField);
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int outOfBounds = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0; unsigned int outOfBounds = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0;
...@@ -317,65 +240,6 @@ if( atomI == targetAtom || targetAtom == (y+j) ){ ...@@ -317,65 +240,6 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
} }
} }
#ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + jIdx) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
float flag = 9.0f;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = jIdx;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = (float) flags;
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].x;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].x;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].y;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].y;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].z;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].z;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 3.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].w;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].w;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].w;
debugArray[index].w = flag + 4.0f;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
} }
tj = (tj + 1) & (GRID - 1); tj = (tj + 1) & (GRID - 1);
......
//----------------------------------------------------------------------------------------- /* -------------------------------------------------------------------------- *
* OpenMM *
//----------------------------------------------------------------------------------------- * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h" #include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
...@@ -33,9 +55,6 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu) ...@@ -33,9 +55,6 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
} }
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS #undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS #define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h" #include "kCalculateAmoebaCudaMutualInducedParticle.h"
...@@ -331,10 +350,6 @@ static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* ar ...@@ -331,10 +350,6 @@ static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* ar
{ {
epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y; epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) ); epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
#ifdef AMOEBA_DEBUG
epsilon[1] = 48.033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
epsilon[2] = 48.033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
#endif
} }
} }
...@@ -419,16 +434,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -419,16 +434,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
static unsigned int threadsPerBlock = 0; static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
int targetAtom = 546;
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
static int iteration = 1;
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log );
}
#endif
kClearFields_3( amoebaGpu, 2 ); kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block // on first pass, set threads/block
...@@ -444,17 +449,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -444,17 +449,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
} }
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
...@@ -474,43 +468,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte ...@@ -474,43 +468,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray ); kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d -- Direct only -- self added in kSorUpdateMutualInducedField_kernel\n",
iteration ); (void) fflush( amoebaGpu->log );
outputArray->Download();
outputPolarArray->Download();
//debugArray->Download();
int maxPrint = 5;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// MI
(void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
outputArray->_pSysData[indexOffset],
outputArray->_pSysData[indexOffset+1],
outputArray->_pSysData[indexOffset+2] );
// MI polar
(void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e]\n",
outputPolarArray->_pSysData[indexOffset],
outputPolarArray->_pSysData[indexOffset+1],
outputPolarArray->_pSysData[indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
iteration++;
}
#endif
} }
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -526,19 +483,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -526,19 +483,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldBySOR";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
int done; int done;
int iteration; int iteration;
...@@ -559,19 +503,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -559,19 +503,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ); cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ); cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
std::vector<int> fileId;
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeEFieldPolarity", fileId, outputVector );
}
#endif
// if polarization type is direct, set flags signalling done and return // if polarization type is direct, set flags signalling done and return
if( amoebaGpu->amoebaSim.polarizationType ) if( amoebaGpu->amoebaSim.polarizationType )
...@@ -609,12 +540,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -609,12 +540,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psCurrentEpsilon->_pDevData ); amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReducePmeMutualInducedFieldDelta"); LAUNCHERROR("kReducePmeMutualInducedFieldDelta");
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
trackMutualInducedIterations( amoebaGpu, iteration);
}
#endif
// Debye=48.033324f // Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download(); amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0]; float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0];
...@@ -624,79 +549,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -624,79 +549,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
done = 1; done = 1;
} }
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download();
#if 1
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldBySOR iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
#else
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[0],
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
#endif
(void) fflush( amoebaGpu->log );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
/*
int offset = 0;
int maxPrint = 10;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d ", ii );
(void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipole->_pSysData[offset],
amoebaGpu->psInducedDipole->_pSysData[offset+1],
amoebaGpu->psInducedDipole->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psInducedDipolePolar->_pSysData[offset],
amoebaGpu->psInducedDipolePolar->_pSysData[offset+1],
amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
ii = (gpu->natoms - maxPrint);
offset = 3*(ii+1);
} else {
offset += 3;
}
}
(void) fflush( amoebaGpu->log );
*/
if( 0 ){
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
}
(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
(void) fflush( amoebaGpu->log );
#endif
// exit if nan // exit if nan
if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){ if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
...@@ -710,25 +562,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -710,25 +562,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->mutualInducedDone = done; amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1; amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
#ifdef AMOEBA_DEBUG
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
if( 0 ){
static int iteration = 0;
checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipole, gpu->psAtomIndex->_pSysData, ++iteration, "CudaPmeMI", stderr );
checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, gpu->psAtomIndex->_pSysData, iteration, "CudaPmeMIPolar", stderr );
}
#endif
// ---------------------------------------------------------------------------------------
} }
void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
......
...@@ -37,9 +37,6 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -37,9 +37,6 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
float* outputField, float* outputFieldPolar float* outputField, float* outputFieldPolar
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){ ){
extern __shared__ MutualInducedParticle sA[]; extern __shared__ MutualInducedParticle sA[];
...@@ -52,10 +49,6 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)( ...@@ -52,10 +49,6 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
const float uscale = 1.0f; const float uscale = 1.0f;
#ifdef AMOEBA_DEBUG
float4 pullBack[4];
#endif
while (pos < end) while (pos < end)
{ {
......
//----------------------------------------------------------------------------------------- /* -------------------------------------------------------------------------- *
* OpenMM *
//----------------------------------------------------------------------------------------- * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h" #include "cudaKernels.h"
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
...@@ -51,8 +73,6 @@ __device__ static float normVector3( float* vector ) ...@@ -51,8 +73,6 @@ __device__ static float normVector3( float* vector )
return returnNorm; return returnNorm;
} }
#undef AMOEBA_DEBUG
// ZThenX == 0 // ZThenX == 0
// Bisector == 1 // Bisector == 1
// ZBisect == 2 // ZBisect == 2
...@@ -379,10 +399,6 @@ void kCudaComputeLabFrameMoments_kernel( void ) ...@@ -379,10 +399,6 @@ void kCudaComputeLabFrameMoments_kernel( void )
void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
{ {
// ---------------------------------------------------------------------------------------
static const char* methodName = "computeCudaAmoebaLabFrameMoments";
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
...@@ -390,33 +406,6 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -390,33 +406,6 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
int numBlocks = gpu->sim.blocks; int numBlocks = gpu->sim.blocks;
int numThreads = gpu->sim.threads_per_block; int numThreads = gpu->sim.threads_per_block;
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log );
amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download();
amoebaGpu->psMolecularDipole->Download();
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Download();
gpu->psPosq4->Download();
for( int ii = 0; ii < gpu->natoms; ii++ ){
int mIndex = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d %6d] x[%16.9e %16.9e %16.9e] %s [%6d %6d %6d %6d]\n", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z, (amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w > 1 ? " XXX" : ""),
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].w );
//if( ii == 30 )ii = gpu->natoms - 30;
}
}
#endif
// copy molecular moments to lab frame moment arrays // copy molecular moments to lab frame moment arrays
// check if chiral center requires moments to have sign flipped // check if chiral center requires moments to have sign flipped
// compute lab frame moments // compute lab frame moments
...@@ -428,7 +417,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu ) ...@@ -428,7 +417,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
LAUNCHERROR("kCudaComputeCheckChiral"); LAUNCHERROR("kCudaComputeCheckChiral");
kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( ); kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( );
LAUNCHERROR(methodName); LAUNCHERROR("kCudaComputeLabFrameMoments");
} }
...@@ -505,5 +494,3 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -505,5 +494,3 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
} }
} }
#undef AMOEBA_DEBUG
/* -------------------------------------------------------------------------- *
//----------------------------------------------------------------------------------------- * OpenMM *
* -------------------------------------------------------------------------- *
//----------------------------------------------------------------------------------------- * This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......
//----------------------------------------------------------------------------------------- /* -------------------------------------------------------------------------- *
* OpenMM *
//----------------------------------------------------------------------------------------- * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h" #include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
...@@ -38,12 +60,6 @@ void GetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext amoebaGpu) ...@@ -38,12 +60,6 @@ void GetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext amoebaGpu)
RTERROR(status, "GetCalculateAmoebaCudaVdw14_7Sim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed"); RTERROR(status, "GetCalculateAmoebaCudaVdw14_7Sim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
} }
//#define AMOEBA_DEBUG_PRINT
#undef AMOEBA_DEBUG_PRINT
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
__device__ void zeroVdw14_7SharedForce( struct Vdw14_7Particle* sA ) __device__ void zeroVdw14_7SharedForce( struct Vdw14_7Particle* sA )
{ {
// zero shared fields // zero shared fields
...@@ -101,11 +117,7 @@ __device__ void getVdw14_7CombindedSigmaEpsilon_kernel( int sigmaCombiningRule, ...@@ -101,11 +117,7 @@ __device__ void getVdw14_7CombindedSigmaEpsilon_kernel( int sigmaCombiningRule,
} }
__device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float combindedEpsilon, __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float combindedEpsilon,
float force[3], float* energy float force[3], float* energy)
#ifdef AMOEBA_DEBUG
, float4* debugArray
#endif
)
{ {
const float deltaHalM1 = 0.07f; const float deltaHalM1 = 0.07f;
...@@ -120,14 +132,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float c ...@@ -120,14 +132,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float c
float r2 = force[0]*force[0] + force[1]*force[1] + force[2]*force[2]; float r2 = force[0]*force[0] + force[1]*force[1] + force[2]*force[2];
if( r2 > cAmoebaSim.vdwCutoff2 ){ if( r2 > cAmoebaSim.vdwCutoff2 ){
*energy = force[0] = force[1] = force[2] = 0.0f; *energy = force[0] = force[1] = force[2] = 0.0f;
#ifdef AMOEBA_DEBUG
float rI = rsqrtf( r2 );
float r = 1.0f/rI;
debugArray[0].x = r;
debugArray[0].y = debugArray[0].z = debugArray[0].w = 0.0f;
debugArray[1].x = debugArray[1].y = debugArray[1].z = 0.0f;
debugArray[1].w = r;
#endif
return; return;
} }
float rI = rsqrtf( r2 ); float rI = rsqrtf( r2 );
...@@ -156,17 +160,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float c ...@@ -156,17 +160,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float c
force[1] *= deltaE; force[1] *= deltaE;
force[2] *= deltaE; force[2] *= deltaE;
#ifdef AMOEBA_DEBUG
debugArray[0].x = r;
debugArray[0].y = deltaE;
debugArray[0].z = combindedSigma;
debugArray[0].w = combindedEpsilon;
debugArray[1].x = tau;
debugArray[1].y = rho;
debugArray[1].z = gTau;
debugArray[1].w = r;
#endif
} }
// perform reduction of force on H's and add to heavy atom partner // perform reduction of force on H's and add to heavy atom partner
...@@ -504,22 +497,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -504,22 +497,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG_PRINT
static const char* methodName = "kCalculateAmoebaVdw14_7Forces";
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: \n", methodName );
(void) fflush( amoebaGpu->log );
}
#ifdef AMOEBA_DEBUG
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int maxSlots = 10;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
int targetAtom = 1;
#endif
#endif
// set threads/block first time through // set threads/block first time through
// on first pass, set threads/block // on first pass, set threads/block
...@@ -535,44 +512,9 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -535,44 +512,9 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads); threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads);
} }
#ifdef AMOEBA_DEBUG_PRINT
if( 0 ){
static int iteration = 0;
checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, ++iteration, "\n\nzCoordPreCopyVdw", stderr );
}
#endif
kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates ); kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates );
kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates ); kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates );
#ifdef AMOEBA_DEBUG_PRINT
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
(void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
if( 0 ){
gpu->psInteractionCount->Download();
amoebaGpu->psVdwWorkUnit->Download();
unsigned int totalWarps = (gpu->sim.nonbond_blocks*threadsPerBlock)/GRID;
float ratiof = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
(void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
unsigned int x = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float warp = (float)(ii)*ratiof;
(void) fprintf( amoebaGpu->log, "GpuCell %8u [%5u %5u %1u] %10u warp=%15.6f\n", ii, x,y,exclusions, warp );
}
}
(void) fflush( amoebaGpu->log );
}
#endif
// clear output arrays // clear output arrays
kClearFields_3( amoebaGpu, 1 ); kClearFields_3( amoebaGpu, 1 );
...@@ -588,33 +530,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -588,33 +530,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksVdwPeriodic"); LAUNCHERROR("kFindInteractionsWithinBlocksVdwPeriodic");
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){
gpu->psInteractionCount->Download();
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
amoebaGpu->psVdwWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysData[ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, "GpuCell %8u %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysData[ii], x,y,exclusions, gpu->psInteractionFlag->_pSysData[ii] );
x = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions );
}
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>( kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractingWorkUnit,
...@@ -622,12 +537,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -622,12 +537,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData, amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule, amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule, amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else { } else {
kCalculateAmoebaVdw14_7Cutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>( kCalculateAmoebaVdw14_7Cutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
...@@ -636,13 +546,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -636,13 +546,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData, amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule, amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule, amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaVdw14_7Cutoff"); LAUNCHERROR("kCalculateAmoebaVdw14_7Cutoff");
...@@ -656,12 +560,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -656,12 +560,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData, amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule, amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule, amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else { } else {
kCalculateAmoebaVdw14_7N2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>( kCalculateAmoebaVdw14_7N2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
...@@ -670,112 +569,15 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff ...@@ -670,112 +569,15 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData, amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule, amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule, amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData ); amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} }
LAUNCHERROR("kCalculateAmoebaVdw14_7N2"); LAUNCHERROR("kCalculateAmoebaVdw14_7N2");
} }
#ifdef AMOEBA_DEBUG_PRINT
if( amoebaGpu->log ){
static int iteration = 0;
(void) fprintf( amoebaGpu->log, "Finished 14-7 kernel execution step=%d\n", ++iteration );
(void) fflush( amoebaGpu->log );
#ifdef AMOEBA_DEBUG
debugArray->Download();
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
double cutOff = 1.0e+03;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d DebugVdw\n", targetAtom, jj );
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
if( kk == 4 && ( fabs( debugArray->_pSysData[debugIndex].x ) > cutOff ||
fabs( debugArray->_pSysData[debugIndex].y ) > cutOff ||
fabs( debugArray->_pSysData[debugIndex].z ) > cutOff ) ){
(void) fprintf( amoebaGpu->log," XXXX\n" );
}
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
#endif
/*
amoebaGpu->psWorkArray_3_2->Download();
amoebaGpu->psWorkArray_3_1->Download();
//for( int jj = 0; jj < 3*gpu->natoms; jj += 3 )
for( int jj = 0; jj < 3*gpu->natoms; jj += 3 ){
for( int kk = 0; kk < gpu->sim.outputBuffers; kk++ ){
float delta = fabs(amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] + 1.0f);
if( delta < 5.0e-06 || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj] ) || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] ) )
(void) fprintf( amoebaGpu->log,"%6d %6d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", jj, kk,
amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj],
amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj],
amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+2] );
}
}
*/
}
#endif
#ifdef AMOEBA_DEBUG
if( 0 ){
static int iteration = 0;
checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "PreVdw", stderr );
checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, iteration, "zCoordPreVdw", stderr );
}
#endif
kReduceVdw14_7( amoebaGpu, amoebaGpu->psWorkArray_3_2 ); kReduceVdw14_7( amoebaGpu, amoebaGpu->psWorkArray_3_2 );
#ifdef AMOEBA_DEBUG
if( 0 ){
static int iteration = 0;
checkForNans( gpu->natoms, 3, amoebaGpu->psWorkArray_3_2, gpu->psAtomIndex->_pSysData, ++iteration, "Vdw32", stderr );
}
#endif
kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 ); kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );
kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 ); kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );
#ifdef AMOEBA_DEBUG
if( 0 ){
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* psTempForce = new CUDAStream<float4>(paddedNumberOfAtoms, 1, "psTempForce");
kClearFloat4( amoebaGpu, paddedNumberOfAtoms, psTempForce );
//kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloat4Array( gpu->natoms, 3, psTempForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaVdw", fileId, outputVector );
delete psTempForce;
//exit(0);
}
if( 0 ){
static int iteration = 0;
checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "VdwForce", stderr );
}
#endif
#ifdef AMOEBA_DEBUG
delete debugArray;
#endif
// ---------------------------------------------------------------------------------------
} }
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment