"docs-source/vscode:/vscode.git/clone" did not exist on "3a0e748cdb5f73e6ba2e0fbf66dd230f41692bf0"
Commit 2b508482 authored by Mark Friedrichs
Browse files

Added copyright

Removed debugging code
parent 36762962
......@@ -36,11 +36,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
unsigned int* workUnit,
float* outputField, float* outputFieldPolar
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
float* outputField, float* outputFieldPolar){
extern __shared__ MutualInducedParticle sA[];
......@@ -99,11 +95,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
// load coords, charge, ...
calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
......@@ -117,34 +109,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
fieldPolarSum[1] += mask ? ijField[1][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[1][2] : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom ){
unsigned int index = y + j;
unsigned int indexI = 0;
//unsigned int indexJ = 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
//debugArray[index].z = cAmoebaSim.pDampingFactorAndThole[atomI].x;
debugArray[index].z = (float) cSim.atoms;
debugArray[index].w = (float) (mask + 1);
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI+1][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI+1][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI+1][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) 1.0f;
}
#endif
}
// Write results
......@@ -161,9 +125,8 @@ if( atomI == targetAtom ){
#endif
}
else // 100% utilization
{
} else {
// Read fixed atom data into registers and GRF
if (lasty != y)
{
......@@ -185,13 +148,9 @@ if( atomI == targetAtom ){
// load coords, charge, ...
calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField);
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
......@@ -217,36 +176,8 @@ if( atomI == targetAtom ){
psA[tj].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[tj].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
#ifdef AMOEBA_DEBUG
//#if 0
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
//unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = cAmoebaSim.pDampingFactorAndThole[atomI].x;
debugArray[index].w = (float) (mask+1);
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI+1][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI+1][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI+1][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) -1.0f;
}
#endif
tj = (tj + 1) & (GRID - 1);
tj = (tj + 1) & (GRID - 1);
}
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "cudaKernels.h"
......
///-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -213,11 +233,7 @@ __device__ void calculateBn_kernel( float r, float4* bn, float* bn0, float *bn5
#undef SUB_METHOD_NAME
__device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDirectElectrostaticParticle& atomI, const PmeDirectElectrostaticParticle& atomJ,
const float* scalingFactors, float4 forceTorqueEnergy[3]
#ifdef AMOEBA_DEBUG
,float4* debugArray
#endif
){
const float* scalingFactors, float4 forceTorqueEnergy[3]){
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
......@@ -909,132 +925,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire
forceTorqueEnergy[2].y = (ttm32 + ttm3i2);
forceTorqueEnergy[2].z = (ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG
int debugIndex = 0;
float idTracker = 1.0f;
/*
debugArray[debugIndex].x = atomI.labFrameDipole[0];
debugArray[debugIndex].y = atomI.labFrameDipole[1];
debugArray[debugIndex].z = atomI.labFrameDipole[2];
debugArray[debugIndex].w = r2;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomJ.labFrameDipole[0];
debugArray[debugIndex].y = atomJ.labFrameDipole[1];
debugArray[debugIndex].z = atomJ.labFrameDipole[2];
debugArray[debugIndex].w = cSim.alphaEwald;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomI.inducedDipole[0];
debugArray[debugIndex].y = atomI.inducedDipole[1];
debugArray[debugIndex].z = atomI.inducedDipole[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomJ.inducedDipole[0];
debugArray[debugIndex].y = atomJ.inducedDipole[1];
debugArray[debugIndex].z = atomJ.inducedDipole[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomI.inducedDipoleP[0];
debugArray[debugIndex].y = atomI.inducedDipoleP[1];
debugArray[debugIndex].z = atomI.inducedDipoleP[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = atomJ.inducedDipoleP[0];
debugArray[debugIndex].y = atomJ.inducedDipoleP[1];
debugArray[debugIndex].z = atomJ.inducedDipoleP[2];
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = conversionFactor*ftm21;
debugArray[debugIndex].y = conversionFactor*ftm22;
debugArray[debugIndex].z = conversionFactor*ftm23;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 1.0;
debugArray[debugIndex].x = e;
debugArray[debugIndex].y = ei;
debugArray[debugIndex].z = erl;
debugArray[debugIndex].w = erli;
debugIndex++;
*/
idTracker += 100.0;
debugArray[debugIndex].x = r2;
debugArray[debugIndex].y = cSim.alphaEwald;
debugArray[debugIndex].z = conversionFactor;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ftm21;
debugArray[debugIndex].y = conversionFactor*ftm22;
debugArray[debugIndex].z = conversionFactor*ftm23;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ftm2i1;
debugArray[debugIndex].y = conversionFactor*ftm2i2;
debugArray[debugIndex].z = conversionFactor*ftm2i3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
/*
idTracker += 100.0;
debugArray[debugIndex].x = fridmp1;
debugArray[debugIndex].y = fridmp2;
debugArray[debugIndex].z = fridmp3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = findmp1;
debugArray[debugIndex].y = findmp2;
debugArray[debugIndex].z = findmp3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
*/
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm21;
debugArray[debugIndex].y = conversionFactor*ttm22;
debugArray[debugIndex].z = conversionFactor*ttm23;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm2i1;
debugArray[debugIndex].y = conversionFactor*ttm2i2;
debugArray[debugIndex].z = conversionFactor*ttm2i3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm31;
debugArray[debugIndex].y = conversionFactor*ttm32;
debugArray[debugIndex].z = conversionFactor*ttm33;
debugArray[debugIndex].w = idTracker;
debugIndex++;
idTracker += 100.0;
debugArray[debugIndex].x = conversionFactor*ttm3i1;
debugArray[debugIndex].y = conversionFactor*ttm3i2;
debugArray[debugIndex].z = conversionFactor*ttm3i3;
debugArray[debugIndex].w = idTracker;
debugIndex++;
#endif
} else {
forceTorqueEnergy[0].x = 0.0f;
......@@ -1051,15 +941,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire
forceTorqueEnergy[0].w = 0.0f;
#ifdef AMOEBA_DEBUG
for( int ii = 0; ii < 12; ii++ ){
debugArray[ii].x = 0.0f;
debugArray[ii].y = 0.0f;
debugArray[ii].z = 0.0f;
debugArray[ii].w = (float) (-ii);
}
#endif
}
return;
......@@ -1351,35 +1232,11 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
{
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaPmeDirectElectrostatic";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int maxOffset = 20;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxOffset*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxOffset*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 49;
#endif
// on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
......@@ -1403,37 +1260,16 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 1 );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u maxL1=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, maxL1 );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -171,12 +191,7 @@ __device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& at
}
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float dscale, float pscale, float4 fields[3]
#ifdef AMOEBA_DEBUG
, float4* pullBack
#endif
){
float dscale, float pscale, float4 fields[3]){
// compute the real space portion of the Ewald summation
......@@ -329,18 +344,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
fields[2].w = 0.0f;
}
#ifdef AMOEBA_DEBUG
pullBack[0].x = xr;
pullBack[0].y = yr;
pullBack[0].z = zr;
pullBack[0].w = r2;
pullBack[1].x = atomJ.x - atomI.x;
pullBack[1].y = atomJ.y - atomI.y;
pullBack[1].z = atomJ.z - atomI.z;
pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
#endif
}
// Include versions of the kernels for N^2 calculations.
......@@ -361,40 +364,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
--------------------------------------------------------------------------------------- */
#ifdef AMOEBA_DEBUG
/**
 * Report whether a double is NaN or +/- infinity.
 *
 * @param number  value to classify
 * @return 1 if number is NaN, +infinity or -infinity; 0 otherwise
 */
static int isNanOrInfinity( double number ){

    // NaN is the only value that compares unequal to itself
    if( number != number ){
        return 1;
    }

    const double positiveInfinity = std::numeric_limits<double>::infinity();
    if( number == positiveInfinity || number == -positiveInfinity ){
        return 1;
    }

    return 0;
}
/**
 * Bubble-sort the first `length` entries of `array` into ascending order,
 * applying every swap to `track` as well so the two vectors stay paired.
 *
 * @param array   keys to sort (modified in place)
 * @param track   companion values permuted identically to array (modified in place)
 * @param length  number of leading entries to sort; both vectors are indexed
 *                up to length-1, so they must hold at least that many entries
 */
static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int length)
{
    // 'last' is the highest index still unsorted; each pass moves the
    // largest remaining key up to that position.
    int last        = length - 1;
    bool swappedAny = true;

    // stop as soon as a full pass makes no swap -- the range is then sorted
    while( last > 0 && swappedAny ){

        swappedAny = false;
        for( int upper = 1; upper <= last; upper++ ){

            const int lower = upper - 1;
            if( array[lower] > array[upper] ){

                // exchange the neighboring keys
                const int heldKey = array[lower];
                array[lower]      = array[upper];
                array[upper]      = heldKey;

                // exchange the matching companion entries
                const int heldTrack = track[lower];
                track[lower]        = track[upper];
                track[upper]        = heldTrack;

                swappedAny = true;
            }
        }
        last--;
    }
}
#endif
/**---------------------------------------------------------------------------------------
Compute fixed electric field using PME
......@@ -409,22 +378,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
static const char* methodName = "computeCudaAmoebaPmeFixedEField";
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int slots = 15;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*slots, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*slots);
debugArray->Upload();
// print intermediate results for the targetAtom
unsigned int targetAtom = 0;
#endif
kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
......@@ -444,175 +397,17 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
kReducePmeDirectE_Fields( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3), gpu->sharedMemoryPerBlock),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
(void) fflush( amoebaGpu->log );
/*
(void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2] paddedNumberOfAtoms=%d\n", gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers );
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// buffer 1
(void) fprintf( amoebaGpu->log,"WArry1[%16.9e %16.9e %16.9e] ",
amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset],
amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+1],
amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+2] );
// buffer 2
(void) fprintf( amoebaGpu->log,"WArry2[%16.9e %16.9e %16.9e] ",
amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset],
amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+1],
amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
*/
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
(void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
int maxPrint = 3002;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// E_Field
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_Field->_pSysData[indexOffset],
amoebaGpu->psE_Field->_pSysData[indexOffset+1],
amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
// E_Field polar
(void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "EFields End\n" );
(void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download();
std::vector<int> indices;
std::vector<int> track;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
int orderIndex = gpu->psAtomIndex->_pSysData[jj];
indices.push_back( orderIndex );
track.push_back( jj );
}
}
bubbleSort( indices, track, static_cast<int>(track.size()) );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
amoebaGpu->gpuContext->psPosq4->Download();
unsigned int count = 0;
float sum0[3] = { 0.0f, 0.0f, 0.0f };
float sum1[3] = { 0.0f, 0.0f, 0.0f };
int offset0 = 1;
int offset1 = 2;
/*
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
int orderIndex = gpu->psAtomIndex->_pSysData[jj];
count++;
*/
for( unsigned int ii = 0; ii < track.size(); ii++ ){
int jj = track[ii];
int debugIndex = jj;
int orderIndex = indices[ii];
if( orderIndex > 31 && offset0 == 1 ){
offset0 = 2;
offset1 = 2;
}
count++;
sum0[0] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].x;
sum0[1] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].y;
sum0[2] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].z;
sum1[0] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].x;
sum1[1] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].y;
sum1[2] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].z;
(void) fprintf( amoebaGpu->log,"%5d %5d %u PmeFixedEField\n", orderIndex, jj, count );
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"%6d %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %6d %6d cum sumsOp\n",
orderIndex, sum0[0], sum0[1], sum0[2], sum1[0], sum1[1], sum1[2], jj, count );
(void) fprintf( amoebaGpu->log,"\n" );
}
// write results to file
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
}
delete debugArray;
}
#endif
}
void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
......@@ -621,21 +416,4 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
kReduceForces( gpu );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceTorqueFixed", fileId, outputVector );
//cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
exit(0);
}
#endif
}
......@@ -37,16 +37,7 @@ __launch_bounds__(64, 1)
void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned int* workUnit,
float* outputEField,
float* outputEFieldPolar
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
int maxPullIndex = 1;
float4 pullBack[12];
#endif
float* outputEFieldPolar){
extern __shared__ FixedFieldParticle sA[];
......@@ -118,11 +109,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
}
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
......@@ -139,66 +126,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
fieldPolarSum[1] += match ? 0.0f : ijField[1].z;
fieldPolarSum[2] += match ? 0.0f : ijField[2].z;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || targetAtom == (y+j) ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
float flag = 7.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = flag;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = flag;
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].z;
debugArray[index].y = match ? 0.0f : ijField[1].z;
debugArray[index].z = match ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].y;
debugArray[index].y = match ? 0.0f : ijField[1].y;
debugArray[index].z = match ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 3.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].w;
debugArray[index].y = match ? 0.0f : ijField[1].w;
debugArray[index].z = match ? 0.0f : ijField[2].w;
debugArray[index].w = flag + 4.0f;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
}
// Write results
......@@ -252,11 +179,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
}
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField);
unsigned int outOfBounds = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0;
......@@ -317,67 +240,8 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
}
}
#ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + jIdx) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
float flag = 9.0f;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = jIdx;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = (float) flags;
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].x;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].x;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].y;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].y;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].z;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].z;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 3.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].w;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].w;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].w;
debugArray[index].w = flag + 4.0f;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
}
tj = (tj + 1) & (GRID - 1);
tj = (tj + 1) & (GRID - 1);
} // j-loop block
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
......@@ -33,9 +55,6 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
......@@ -331,10 +350,6 @@ static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* ar
{
epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
#ifdef AMOEBA_DEBUG
epsilon[1] = 48.033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
epsilon[2] = 48.033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
#endif
}
}
......@@ -416,18 +431,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
int targetAtom = 546;
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
static int iteration = 1;
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log );
}
#endif
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_3( amoebaGpu, 2 );
......@@ -444,17 +449,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
......@@ -474,43 +468,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d -- Direct only -- self added in kSorUpdateMutualInducedField_kernel\n",
iteration ); (void) fflush( amoebaGpu->log );
outputArray->Download();
outputPolarArray->Download();
//debugArray->Download();
int maxPrint = 5;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// MI
(void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
outputArray->_pSysData[indexOffset],
outputArray->_pSysData[indexOffset+1],
outputArray->_pSysData[indexOffset+2] );
// MI polar
(void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e]\n",
outputPolarArray->_pSysData[indexOffset],
outputPolarArray->_pSysData[indexOffset+1],
outputPolarArray->_pSysData[indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
iteration++;
}
#endif
}
/**---------------------------------------------------------------------------------------
......@@ -526,25 +483,12 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// ---------------------------------------------------------------------------------------
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldBySOR";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
int done;
int iteration;
gpuContext gpu = amoebaGpu->gpuContext;
gpuContext gpu = amoebaGpu->gpuContext;
// ---------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------
// set [ E_Field & E_FieldPolar ] to [ E_Field & E_FieldPolar ]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
......@@ -559,19 +503,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
std::vector<int> fileId;
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeEFieldPolarity", fileId, outputVector );
}
#endif
// if polarization type is direct, set flags signalling done and return
if( amoebaGpu->amoebaSim.polarizationType )
......@@ -609,12 +540,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReducePmeMutualInducedFieldDelta");
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
trackMutualInducedIterations( amoebaGpu, iteration);
}
#endif
// Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0];
......@@ -624,79 +549,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
done = 1;
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download();
#if 1
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldBySOR iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
#else
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[0],
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
#endif
(void) fflush( amoebaGpu->log );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
/*
int offset = 0;
int maxPrint = 10;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d ", ii );
(void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipole->_pSysData[offset],
amoebaGpu->psInducedDipole->_pSysData[offset+1],
amoebaGpu->psInducedDipole->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psInducedDipolePolar->_pSysData[offset],
amoebaGpu->psInducedDipolePolar->_pSysData[offset+1],
amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
ii = (gpu->natoms - maxPrint);
offset = 3*(ii+1);
} else {
offset += 3;
}
}
(void) fflush( amoebaGpu->log );
*/
if( 0 ){
std::vector<int> fileId;
fileId.push_back( iteration );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
}
(void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
(void) fflush( amoebaGpu->log );
#endif
// exit if nan
if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
......@@ -710,25 +562,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
#ifdef AMOEBA_DEBUG
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
if( 0 ){
static int iteration = 0;
checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipole, gpu->psAtomIndex->_pSysData, ++iteration, "CudaPmeMI", stderr );
checkForNans( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, gpu->psAtomIndex->_pSysData, iteration, "CudaPmeMIPolar", stderr );
}
#endif
// ---------------------------------------------------------------------------------------
}
void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
......
......@@ -37,9 +37,6 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
unsigned int* workUnit,
float* outputField, float* outputFieldPolar
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
extern __shared__ MutualInducedParticle sA[];
......@@ -52,10 +49,6 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
unsigned int lasty = 0xFFFFFFFF;
const float uscale = 1.0f;
#ifdef AMOEBA_DEBUG
float4 pullBack[4];
#endif
while (pos < end)
{
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
......@@ -51,8 +73,6 @@ __device__ static float normVector3( float* vector )
return returnNorm;
}
#undef AMOEBA_DEBUG
// ZThenX == 0
// Bisector == 1
// ZBisect == 2
......@@ -379,10 +399,6 @@ void kCudaComputeLabFrameMoments_kernel( void )
void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
static const char* methodName = "computeCudaAmoebaLabFrameMoments";
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
......@@ -390,33 +406,6 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
int numBlocks = gpu->sim.blocks;
int numThreads = gpu->sim.threads_per_block;
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log );
amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download();
amoebaGpu->psMolecularDipole->Download();
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Download();
gpu->psPosq4->Download();
for( int ii = 0; ii < gpu->natoms; ii++ ){
int mIndex = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d %6d] x[%16.9e %16.9e %16.9e] %s [%6d %6d %6d %6d]\n", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z, (amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w > 1 ? " XXX" : ""),
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].z,
amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].w );
//if( ii == 30 )ii = gpu->natoms - 30;
}
}
#endif
// copy molecular moments to lab frame moment arrays
// check if chiral center requires moments to have sign flipped
// compute lab frame moments
......@@ -428,7 +417,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
LAUNCHERROR("kCudaComputeCheckChiral");
kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( );
LAUNCHERROR(methodName);
LAUNCHERROR("kCudaComputeLabFrameMoments");
}
......@@ -505,5 +494,3 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
}
}
#undef AMOEBA_DEBUG
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
......@@ -38,12 +60,6 @@ void GetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext amoebaGpu)
RTERROR(status, "GetCalculateAmoebaCudaVdw14_7Sim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
//#define AMOEBA_DEBUG_PRINT
#undef AMOEBA_DEBUG_PRINT
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
__device__ void zeroVdw14_7SharedForce( struct Vdw14_7Particle* sA )
{
// zero shared fields
......@@ -101,11 +117,7 @@ __device__ void getVdw14_7CombindedSigmaEpsilon_kernel( int sigmaCombiningRule,
}
__device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float combindedEpsilon,
float force[3], float* energy
#ifdef AMOEBA_DEBUG
, float4* debugArray
#endif
)
float force[3], float* energy)
{
const float deltaHalM1 = 0.07f;
......@@ -120,14 +132,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float c
float r2 = force[0]*force[0] + force[1]*force[1] + force[2]*force[2];
if( r2 > cAmoebaSim.vdwCutoff2 ){
*energy = force[0] = force[1] = force[2] = 0.0f;
#ifdef AMOEBA_DEBUG
float rI = rsqrtf( r2 );
float r = 1.0f/rI;
debugArray[0].x = r;
debugArray[0].y = debugArray[0].z = debugArray[0].w = 0.0f;
debugArray[1].x = debugArray[1].y = debugArray[1].z = 0.0f;
debugArray[1].w = r;
#endif
return;
}
float rI = rsqrtf( r2 );
......@@ -156,17 +160,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma, float c
force[1] *= deltaE;
force[2] *= deltaE;
#ifdef AMOEBA_DEBUG
debugArray[0].x = r;
debugArray[0].y = deltaE;
debugArray[0].z = combindedSigma;
debugArray[0].w = combindedEpsilon;
debugArray[1].x = tau;
debugArray[1].y = rho;
debugArray[1].z = gTau;
debugArray[1].w = r;
#endif
}
// perform reduction of force on H's and add to heavy atom partner
......@@ -504,22 +497,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG_PRINT
static const char* methodName = "kCalculateAmoebaVdw14_7Forces";
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: \n", methodName );
(void) fflush( amoebaGpu->log );
}
#ifdef AMOEBA_DEBUG
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
int maxSlots = 10;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
int targetAtom = 1;
#endif
#endif
// set threads/block first time through
// on first pass, set threads/block
......@@ -535,44 +512,9 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG_PRINT
if( 0 ){
static int iteration = 0;
checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, ++iteration, "\n\nzCoordPreCopyVdw", stderr );
}
#endif
kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates );
kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates );
#ifdef AMOEBA_DEBUG_PRINT
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
(void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
if( 0 ){
gpu->psInteractionCount->Download();
amoebaGpu->psVdwWorkUnit->Download();
unsigned int totalWarps = (gpu->sim.nonbond_blocks*threadsPerBlock)/GRID;
float ratiof = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
(void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
unsigned int x = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float warp = (float)(ii)*ratiof;
(void) fprintf( amoebaGpu->log, "GpuCell %8u [%5u %5u %1u] %10u warp=%15.6f\n", ii, x,y,exclusions, warp );
}
}
(void) fflush( amoebaGpu->log );
}
#endif
// clear output arrays
kClearFields_3( amoebaGpu, 1 );
......@@ -588,33 +530,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksVdwPeriodic");
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){
gpu->psInteractionCount->Download();
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
amoebaGpu->psVdwWorkUnit->Download();
(void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
unsigned int x = gpu->psInteractingWorkUnit->_pSysData[ii];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
unsigned int exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, "GpuCell %8u %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysData[ii], x,y,exclusions, gpu->psInteractionFlag->_pSysData[ii] );
x = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
y = ((x >> 2) & 0x7fff) << GRIDBITS;
exclusions = (x & 0x1);
x = (x >> 17) << GRIDBITS;
(void) fprintf( amoebaGpu->log, " AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions );
}
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
......@@ -622,12 +537,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
kCalculateAmoebaVdw14_7Cutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
......@@ -636,13 +546,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaVdw14_7Cutoff");
......@@ -656,12 +560,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
kCalculateAmoebaVdw14_7N2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
......@@ -670,112 +569,15 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
amoebaGpu->vdwSigmaCombiningRule,
amoebaGpu->vdwEpsilonCombiningRule,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaVdw14_7N2");
}
#ifdef AMOEBA_DEBUG_PRINT
if( amoebaGpu->log ){
static int iteration = 0;
(void) fprintf( amoebaGpu->log, "Finished 14-7 kernel execution step=%d\n", ++iteration );
(void) fflush( amoebaGpu->log );
#ifdef AMOEBA_DEBUG
debugArray->Download();
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
double cutOff = 1.0e+03;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d DebugVdw\n", targetAtom, jj );
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
if( kk == 4 && ( fabs( debugArray->_pSysData[debugIndex].x ) > cutOff ||
fabs( debugArray->_pSysData[debugIndex].y ) > cutOff ||
fabs( debugArray->_pSysData[debugIndex].z ) > cutOff ) ){
(void) fprintf( amoebaGpu->log," XXXX\n" );
}
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
#endif
/*
amoebaGpu->psWorkArray_3_2->Download();
amoebaGpu->psWorkArray_3_1->Download();
//for( int jj = 0; jj < 3*gpu->natoms; jj += 3 )
for( int jj = 0; jj < 3*gpu->natoms; jj += 3 ){
for( int kk = 0; kk < gpu->sim.outputBuffers; kk++ ){
float delta = fabs(amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] + 1.0f);
if( delta < 5.0e-06 || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj] ) || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] ) )
(void) fprintf( amoebaGpu->log,"%6d %6d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", jj, kk,
amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj],
amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj],
amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+2] );
}
}
*/
}
#endif
#ifdef AMOEBA_DEBUG
if( 0 ){
static int iteration = 0;
checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "PreVdw", stderr );
checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, iteration, "zCoordPreVdw", stderr );
}
#endif
kReduceVdw14_7( amoebaGpu, amoebaGpu->psWorkArray_3_2 );
#ifdef AMOEBA_DEBUG
if( 0 ){
static int iteration = 0;
checkForNans( gpu->natoms, 3, amoebaGpu->psWorkArray_3_2, gpu->psAtomIndex->_pSysData, ++iteration, "Vdw32", stderr );
}
#endif
kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );
kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );
#ifdef AMOEBA_DEBUG
if( 0 ){
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* psTempForce = new CUDAStream<float4>(paddedNumberOfAtoms, 1, "psTempForce");
kClearFloat4( amoebaGpu, paddedNumberOfAtoms, psTempForce );
//kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloat4Array( gpu->natoms, 3, psTempForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaVdw", fileId, outputVector );
delete psTempForce;
//exit(0);
}
if( 0 ){
static int iteration = 0;
checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "VdwForce", stderr );
}
#endif
#ifdef AMOEBA_DEBUG
delete debugArray;
#endif
// ---------------------------------------------------------------------------------------
}
......@@ -39,9 +39,6 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
int sigmaCombiningRule,
int epsilonCombiningRule,
float* outputForce
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
extern __shared__ Vdw14_7Particle sA[];
......@@ -57,9 +54,6 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
int exclusionMask;
float totalEnergy = 0.0f;
#ifdef AMOEBA_DEBUG
float4 pullDebug[5];
#endif
while (pos < end)
{
......@@ -129,11 +123,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
}
float energy;
calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy);
// mask out excluded ixns
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
......@@ -148,41 +138,6 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
forceSum[1] += mask ? ijForce[1] : 0.0f;
forceSum[2] += mask ? ijForce[2] : 0.0f;
totalEnergy += mask ? 0.5f*energy : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + j) : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = -1.0f;
debugArray[index].w = (float) (mask + 1);
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = energy;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijForce[0] : 0.0f;
debugArray[index].y = mask ? ijForce[1] : 0.0f;
debugArray[index].z = mask ? ijForce[2] : 0.0f;
}
#endif
}
// Write results
......@@ -260,11 +215,7 @@ flags = 0xFFFFFFFF;
ijForce[1] -= floor(ijForce[1]*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
ijForce[2] -= floor(ijForce[2]*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
}
calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy);
// mask out excluded ixns
......@@ -328,40 +279,6 @@ flags = 0xFFFFFFFF;
#endif
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+jIdx) == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + jIdx) : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = -3.0;
debugArray[index].w = (float) (mask + 1);
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = energy;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijForce[0] : 0.0f;
debugArray[index].y = mask ? ijForce[1] : 0.0f;
debugArray[index].z = mask ? ijForce[2] : 0.0f;
}
#endif
#ifdef USE_CUTOFF
}
#endif
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
......@@ -35,9 +57,6 @@ void GetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext amoebaGpu)
RTERROR(status, "GetCalculateAmoebaCudaWcaDispersionSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
__device__ void zeroWcaDispersionSharedForce( struct WcaDispersionParticle* sA )
{
// zero shared fields
......@@ -105,14 +124,7 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f
float radiusI, float radiusJ,
float rmixo, float rmixh,
float emixo, float emixh,
float force[3], float* energy
#ifdef AMOEBA_DEBUG
, float4* debugArray
#endif
)
{
float force[3], float* energy ) {
const float pi = 3.1415926535897f;
const float shctd = cAmoebaSim.shctd;
......@@ -318,29 +330,6 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f
force[1] *= de;
force[2] *= de;
#ifdef AMOEBA_DEBUG
debugArray[0].x = sum;
debugArray[0].y = sum;
debugArray[0].z = sum;
debugArray[0].w = sum;
#if 0
debugArray[0].x = r;
debugArray[0].y = -r*de/awater;
debugArray[0].z = emixo;
debugArray[0].w = mask2;
debugArray[1].x = dl;
debugArray[1].y = du;
debugArray[1].z = lik;
debugArray[1].w = uik;
debugArray[2].x = du1;
debugArray[2].y = du2;
debugArray[2].z = term;
debugArray[2].w = sk;
#endif
#endif
}
// Include versions of the kernels for N^2 calculations.
......@@ -385,16 +374,6 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
methodName, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(WcaDispersionParticle), sizeof(WcaDispersionParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
......
......@@ -24,56 +24,6 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for identifying interacting blocks. It is included
* several times in kCalculateCDLJForces.cu with different #defines to generate
* different versions of the kernels.
*/
/**
* Find a bounding box for the atoms in each block.
*/
/*
__global__ void METHOD_NAME(kFindBlockBounds, _kernel)()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int base = pos << GRIDBITS;
if (base < cSim.atoms)
{
float4 apos = cSim.pPosq[base];
#ifdef USE_PERIODIC
apos.x -= floor(apos.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
apos.y -= floor(apos.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
apos.z -= floor(apos.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
float4 firstPoint = apos;
#endif
float minx = apos.x;
float maxx = apos.x;
float miny = apos.y;
float maxy = apos.y;
float minz = apos.z;
float maxz = apos.z;
for (unsigned int i = 1; i < GRID; i++)
{
apos = cSim.pPosq[base+i];
#ifdef USE_PERIODIC
apos.x -= floor((apos.x-firstPoint.x)*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
apos.y -= floor((apos.y-firstPoint.y)*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
apos.z -= floor((apos.z-firstPoint.z)*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
minx = min(minx, apos.x);
maxx = max(maxx, apos.x);
miny = min(miny, apos.y);
maxy = max(maxy, apos.y);
minz = min(minz, apos.z);
maxz = max(maxz, apos.z);
}
cSim.pGridBoundingBox[pos] = make_float4(0.5f*(maxx-minx), 0.5f*(maxy-miny), 0.5f*(maxz-minz), 0);
cSim.pGridCenter[pos] = make_float4(0.5f*(maxx+minx), 0.5f*(maxy+miny), 0.5f*(maxz+minz), 0);
}
}
*/
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment