Commit 2b508482 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Added copyright

Removed debugging code
parent 36762962
......@@ -61,6 +61,17 @@
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "OpenMM.h"
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
#include "ReferencePlatform.h"
#include "OpenCLPlatform.h"
#endif
#if TEST_PLATFORM == TEST_CUDA_PLATFORM
#include "ReferencePlatform.h"
#include "CudaPlatform.h"
#endif
#ifdef USE_SOFTCORE
#include "OpenMMFreeEnergy.h"
#include "openmm/freeEnergyKernels.h"
......@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
throw OpenMMException( msg.str() );
}
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
ReferencePlatform platform1;
OpenCLPlatform platform2;
#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
ReferencePlatform platform1;
CudaPlatform platform2;
#else
Platform& platform1 = Platform::getPlatformByName( platformName1 );
if( deviceId1 ){
setDeviceId( platform1, deviceId1, log );
}
setDeviceIdUsingEnvVariable( platform1, log );
Context context1( system1, integrator1, platform1 );
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
Platform& platform2 = Platform::getPlatformByName( platformName2 );
if( deviceId2 ){
setDeviceId( platform2, deviceId2, log );
}
setDeviceIdUsingEnvVariable( platform2, log );
#endif
Context context1( system1, integrator1, platform1 );
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
Context context2( system2, integrator2, platform2 );
context2.setPositions(positions);
State state2 = context2.getState(State::Forces | State::Energy);
double energyDiff = 0.0;
......@@ -3419,10 +3439,19 @@ int main() {
}
}
// if TEST_PLATFORM is not set, then check that required libs are available
#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
envVariableIsSet = checkRequiredLibsAreAvailable( requiredLibs, loadedLibs, log );
if( envVariableIsSet == 0 && log ){
(void) fprintf( log, "Aborting tests due to missing libs.\n" );
}
#else
// unit test path: force tests to run
envVariableIsSet = 1;
#endif
if( platformId2s.size() > 0 ){
generativeArgumentMaps["platformId2"] = platformId2s;
......
......@@ -61,6 +61,17 @@
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "OpenMM.h"
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
#include "ReferencePlatform.h"
#include "OpenCLPlatform.h"
#endif
#if TEST_PLATFORM == TEST_CUDA_PLATFORM
#include "ReferencePlatform.h"
#include "CudaPlatform.h"
#endif
#ifdef USE_SOFTCORE
#include "OpenMMFreeEnergy.h"
#include "openmm/freeEnergyKernels.h"
......@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
throw OpenMMException( msg.str() );
}
#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
ReferencePlatform platform1;
OpenCLPlatform platform2;
#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
ReferencePlatform platform1;
CudaPlatform platform2;
#else
Platform& platform1 = Platform::getPlatformByName( platformName1 );
if( deviceId1 ){
setDeviceId( platform1, deviceId1, log );
}
setDeviceIdUsingEnvVariable( platform1, log );
Context context1( system1, integrator1, platform1 );
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
Platform& platform2 = Platform::getPlatformByName( platformName2 );
if( deviceId2 ){
setDeviceId( platform2, deviceId2, log );
}
setDeviceIdUsingEnvVariable( platform2, log );
#endif
Context context1( system1, integrator1, platform1 );
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
Context context2( system2, integrator2, platform2 );
context2.setPositions(positions);
State state2 = context2.getState(State::Forces | State::Energy);
double energyDiff = 0.0;
......@@ -3419,10 +3439,19 @@ int main() {
}
}
// if TEST_PLATFORM is not set, then check that required libs are available
#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
envVariableIsSet = checkRequiredLibsAreAvailable( requiredLibs, loadedLibs, log );
if( envVariableIsSet == 0 && log ){
(void) fprintf( log, "Aborting tests due to missing libs.\n" );
}
#else
// unit test path: force tests to run
envVariableIsSet = 1;
#endif
if( platformId2s.size() > 0 ){
generativeArgumentMaps["platformId2"] = platformId2s;
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -591,8 +611,7 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato
// reduce psWorkArray_3_1 -> torque
static void kReduceTorque(amoebaGpuContext amoebaGpu )
{
static void kReduceTorque(amoebaGpuContext amoebaGpu ){
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
......@@ -609,21 +628,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
--------------------------------------------------------------------------------------- */
void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce )
{
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaElectrostatic";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce ){
// ---------------------------------------------------------------------------------------
......@@ -631,20 +636,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
}
static const int maxSlots =20;
int paddedNumberOfAtoms = gpu->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
//unsigned int targetAtom = 1137;
unsigned int targetAtom = 1;
#endif
// on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
......@@ -662,20 +653,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
kClearFields_3( amoebaGpu, 1 );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log );
}
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, debugArray->_pDevData, targetAtom );
} else {
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, debugArray->_pDevData, targetAtom );
}
#else
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
......@@ -683,66 +660,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
}
#endif
LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
#ifdef AMOEBA_DEBUG
if( 0 ){
debugArray->Download();
std::vector<double> conversions;
conversions.push_back( 0.1f/4.184f );
conversions.push_back( 0.1f/4.184f );
unsigned int kkBlocks = 4;
(void) fprintf( stderr, "\nTarget atom output %5u\n", targetAtom );
for( unsigned int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
double sum = 0.0;
for( unsigned int kk = 0; kk < kkBlocks && sum == 0.0; kk++ ){
unsigned int index = ii + kk*amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
sum += debugArray->_pSysData[index].x + debugArray->_pSysData[index].y + debugArray->_pSysData[index].z + debugArray->_pSysData[index].w;
}
if( sum > 0.0 ){
(void) fprintf( stderr, "%5u", ii );
for( unsigned int kk = 0; kk < kkBlocks; kk++ ){
unsigned int index = ii + kk*amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
(void) fprintf( stderr, " %15.7e %15.7e %15.7e %5.1f",
conversions[kk]*debugArray->_pSysData[index].x, conversions[kk]*debugArray->_pSysData[index].y, conversions[kk]*debugArray->_pSysData[index].z,
debugArray->_pSysData[index].w );
if( ((kk+1) % 2) == 0 && (kk != (kkBlocks-1) ) ){
(void) fprintf( stderr, "\n%5u", ii );
}
}
(void) fprintf( stderr, "\n" );
if( kkBlocks > 2 ){
(void) fprintf( stderr, "\n" );
}
}
}
}
#endif
#ifdef AMOEBA_DEBUG
if( 0 ){
VectorOfDoubleVectors outputVector;
std::vector<int> fileId;
static int call = 0;
fileId.push_back( call++ );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* temp = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "ElectrostaticTemp");
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
reduceAndCopyCUDAStreamFloat( amoebaGpu->psWorkArray_3_1, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostaticTorque", fileId, outputVector );
delete temp;
}
#endif
if( addTorqueToForce ){
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
......
......@@ -35,12 +35,7 @@ __launch_bounds__(128, 1)
__launch_bounds__(64, 1)
#endif
void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
unsigned int* workUnit, float* outputTorque
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
unsigned int* workUnit, float* outputTorque){
extern __shared__ volatile ElectrostaticParticle sA[];
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -31,6 +49,7 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
// reduce psWorkArray_3_1 -> E_Field
// reduce psWorkArray_3_2 -> E_FieldPolar
// reduce psWorkArray_3_3 -> Gk_FieldPolar
......@@ -66,10 +85,6 @@ __device__ void calculateFixedGkFieldPairIxn_kernel( float4 atomCoordinatesI,
float* labFrameQuadrupoleI, float* labFrameQuadrupoleJ,
float rb2,
float outputField[2][3]
#ifdef AMOEBA_DEBUG
, float4* debugArray
#endif
){
float xi,yi,zi;
......@@ -325,32 +340,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static const char* methodName = "computeCudaAmoebaFixedEAndGKFields";
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
// N2 debug array
int maxSlots = 10;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
debugArray->Upload();
// print intermediate results for the targetAtom
unsigned int targetAtom = 0;
#endif
// on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
......@@ -367,133 +360,21 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
kClearFields_3( amoebaGpu, 3 );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_3->_pDevData );
#endif
} else {
kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_3->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_3->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaFixedEAndGkFieldN2_kernel");
kReduceEAndGkFields( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
amoebaGpu->psWorkArray_3_3->Download();
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
amoebaGpu->psGk_Field->Download();
gpu->psBornRadii->Download();
(void) fprintf( amoebaGpu->log, "OutE & Gk Fields\n" );
int maxPrint = 32;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// E_Field
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_Field->_pSysData[indexOffset],
amoebaGpu->psE_Field->_pSysData[indexOffset+1],
amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
// E_Field polar
(void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
// Gk_Field polar
(void) fprintf( amoebaGpu->log,"Gk[%16.9e %16.9e %16.9e] ",
amoebaGpu->psGk_Field->_pSysData[indexOffset],
amoebaGpu->psGk_Field->_pSysData[indexOffset+1],
amoebaGpu->psGk_Field->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "EFields End\n" );
(void) fprintf( amoebaGpu->log, "DebugQ\n" );
debugArray->Download();
if( 0 ){
int ii = targetAtom;
(void) fprintf( amoebaGpu->log,"\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ",
ii, jj,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w,
gpu->psBornRadii->_pSysData[jj] );
for( int kk = 0; kk < 2; kk++ ){
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
// write results to file
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f);
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psGk_Field, outputVector, NULL, 1.0f);
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector );
}
delete debugArray;
}
#endif
}
......@@ -38,15 +38,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
unsigned int* workUnit,
float* outputEField,
float* outputEFieldPolar,
float* outputGkField
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4 pullBack[12];
#endif
float* outputGkField){
extern __shared__ FixedFieldParticle sA[];
......@@ -127,11 +119,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
unsigned int match = (atomI == (y + j)) ? 1 : 0;
......@@ -150,11 +138,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
calculateFixedGkFieldPairIxn_kernel( iCoord, jCoord,
&(labFrameDipole[atomI*3]), jDipole,
&(labFrameQuadrupole[atomI*9]), jQuadrupole,
bornRadii[atomI]*jBornRadius, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
bornRadii[atomI]*jBornRadius, ijField);
gkFieldSum[0] += match ? 0.0f : ijField[0][0];
gkFieldSum[1] += match ? 0.0f : ijField[0][1];
......@@ -179,11 +163,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
float dScaleVal;
......@@ -206,85 +186,12 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
eFieldPolarSum[1] += match ? 0.0f : pScaleVal*ijField[0][1];
eFieldPolarSum[2] += match ? 0.0f : pScaleVal*ijField[0][2];
//#ifdef AMOEBA_DEBUG
#if 0
if( 0 && atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 1;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : dScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : dScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : pScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : pScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : pScaleVal*ijField[indexI][2];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
// GK field
calculateFixedGkFieldPairIxn_kernel( iCoord, jCoord,
&(labFrameDipole[atomI*3]), jDipole,
&(labFrameQuadrupole[atomI*9]), jQuadrupole,
bornRadii[atomI]*jBornRadius, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
bornRadii[atomI]*jBornRadius, ijField);
match = (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ? 1 : 0;
gkFieldSum[0] += match ? 0.0f : ijField[0][0];
......@@ -292,45 +199,6 @@ if( 0 && atomI == targetAtom ){
gkFieldSum[2] += match ? 0.0f : ijField[0][2];
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
///unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 1;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
pullBackIndex++;
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += match ? 0.0f : cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexJ][0];
debugArray[index].y = match ? 0.0f : ijField[indexJ][1];
debugArray[index].z = match ? 0.0f : ijField[indexJ][2];
}
#endif
}
}
......@@ -377,11 +245,7 @@ if( atomI == targetAtom ){
loadFixedFieldParticleData( &(psA[tj]), &jCoord, jDipole, jQuadrupole, &jBornRadius );
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
......@@ -403,84 +267,12 @@ if( atomI == targetAtom ){
psA[tj].eFieldP[1] += ijField[1][1];
psA[tj].eFieldP[2] += ijField[1][2];
// #ifdef AMOEBA_DEBUG
#if 0
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
//unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
#if 0
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
#endif
}
#endif
// Gk field
calculateFixedGkFieldPairIxn_kernel( iCoord, jCoord,
&(labFrameDipole[atomI*3]), jDipole,
&(labFrameQuadrupole[atomI*9]), jQuadrupole,
bornRadii[atomI]*jBornRadius, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
bornRadii[atomI]*jBornRadius, ijField);
gkFieldSum[0] += ijField[0][0];
gkFieldSum[1] += ijField[0][1];
......@@ -491,41 +283,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA[tj].gkField[2] += ijField[1][2];
//#ifdef AMOEBA_DEBUG
#if 0
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
//unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
}
#endif
tj = (tj + 1) & (GRID - 1);
......@@ -548,11 +305,7 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
loadFixedFieldParticleData( &(psA[tj]), &jCoord, jDipole, jQuadrupole, &jBornRadius );
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
float dScaleVal;
float pScaleVal;
......@@ -579,85 +332,12 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
psA[tj].eFieldP[1] += pScaleVal*ijField[1][1];
psA[tj].eFieldP[2] += pScaleVal*ijField[1][2];
//#ifdef AMOEBA_DEBUG
#if 0
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
// unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
// unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
// pullBackIndex += 2;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dScaleVal*ijField[indexI][0];
debugArray[index].y = dScaleVal*ijField[indexI][1];
debugArray[index].z = dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pScaleVal*ijField[indexI][0];
debugArray[index].y = pScaleVal*ijField[indexI][1];
debugArray[index].z = pScaleVal*ijField[indexI][2];
*/
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
// GK field
calculateFixedGkFieldPairIxn_kernel( iCoord, jCoord,
&(labFrameDipole[atomI*3]), jDipole,
&(labFrameQuadrupole[atomI*9]), jQuadrupole,
bornRadii[atomI]*jBornRadius, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
bornRadii[atomI]*jBornRadius, ijField);
if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
gkFieldSum[0] += ijField[0][0];
......@@ -668,61 +348,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA[tj].gkField[1] += ijField[1][1];
psA[tj].gkField[2] += ijField[1][2];
}
//#ifdef AMOEBA_DEBUG
#if 0
if( (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
// pullBackIndex += 2;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][0] : 0.0f;
debugArray[index].y = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][1] : 0.0f;
debugArray[index].z = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexI][2] : 0.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][0] : 0.0f;
debugArray[index].y = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][1] : 0.0f;
debugArray[index].z = (atomI == targetAtom || (y + tj) == targetAtom) ? ijField[indexJ][2] : 0.0f;
/*
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -76,24 +95,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
static const char* methodName = "computeCudaAmoebaFixedEField";
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
// N2 debug array
CUDAStream<float4>* debugArray = new CUDAStream<float4>(10*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
// print intermediate results for the targetAtom
unsigned int targetAtom = 3;
#endif
kClearFields_3( amoebaGpu, 2 );
static unsigned int threadsPerBlock = 0;
......@@ -108,15 +109,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaFixedEField numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
......@@ -131,78 +123,5 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
#if 0
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysData[kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReduceE_Fields_kernel( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
gpu->psInteractionCount->Download();
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
(void) fprintf( amoebaGpu->log, "OutEFields\n" );
int maxPrint = 32;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// E_Field
(void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_Field->_pSysData[indexOffset],
amoebaGpu->psE_Field->_pSysData[indexOffset+1],
amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
// E_Field polar
(void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
(void) fprintf( amoebaGpu->log, "EFields End\n" );
// write results to file
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f);
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
}
//delete debugArray;
}
#endif
}
......@@ -37,15 +37,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
unsigned int* workUnit,
float* outputEField,
float* outputEFieldPolar
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4 pullBack[12];
#endif
float* outputEFieldPolar){
extern __shared__ FixedFieldParticle sA[];
......@@ -106,11 +98,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
float ijField[2][3];
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
unsigned int match = (atomI == (y + j)) ? 1 : 0;
......@@ -142,11 +130,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
//loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
float dScaleVal;
float pScaleVal;
......@@ -168,73 +152,6 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
fieldPolarSum[1] += match ? 0.0f : pScaleVal*ijField[0][1];
fieldPolarSum[2] += match ? 0.0f : pScaleVal*ijField[0][2];
#ifdef AMOEBA_DEBUG
if( 0 && atomI == targetAtom ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
// unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 1;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : dScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : dScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : pScaleVal*ijField[indexI][0];
debugArray[index].y = match ? 0.0f : pScaleVal*ijField[indexI][1];
debugArray[index].z = match ? 0.0f : pScaleVal*ijField[indexI][2];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
}
}
......@@ -274,11 +191,7 @@ if( 0 && atomI == targetAtom ){
float ijField[2][3];
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
......@@ -300,74 +213,6 @@ if( 0 && atomI == targetAtom ){
psA[tj].eFieldP[1] += ijField[1][1];
psA[tj].eFieldP[2] += ijField[1][2];
#ifdef AMOEBA_DEBUG
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
/*
debugArray[index].z = pullBack[pullBackIndex++];
debugArray[index].w = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
#if 0
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
#endif
}
#endif
tj = (tj + 1) & (GRID - 1);
}
......@@ -388,11 +233,7 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
float ijField[2][3];
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
float dScaleVal;
float pScaleVal;
......@@ -419,79 +260,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
psA[tj].eFieldP[1] += pScaleVal*ijField[1][1];
psA[tj].eFieldP[2] += pScaleVal*ijField[1][2];
#ifdef AMOEBA_DEBUG
if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
// unsigned int pullBackIndex = 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = dScaleVal;
debugArray[index].w = pScaleVal;
/*
pullBackIndex += 2;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex++];
debugArray[index].y = pullBack[pullBackIndex++];
debugArray[index].z = pullBack[pullBackIndex++];
*/
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = dScaleVal*ijField[indexI][0];
debugArray[index].y = dScaleVal*ijField[indexI][1];
debugArray[index].z = dScaleVal*ijField[indexI][2];
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pScaleVal*ijField[indexI][0];
debugArray[index].y = pScaleVal*ijField[indexI][1];
debugArray[index].z = pScaleVal*ijField[indexI][2];
/*
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleVal + 10.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
*/
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
......
......@@ -151,11 +151,7 @@ __device__ static void zeroFixedFieldParticleSharedField( struct FixedFieldParti
// body of fixed E-field calculation
__device__ static void calculateFixedEFieldPairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float field[2][3]
#ifdef AMOEBA_DEBUG
, float4 debugArray[12]
#endif
)
float field[2][3])
{
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -204,44 +224,12 @@ void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
{
#ifdef AMOEBA_DEBUG
static const char* methodName = "kCalculateAmoebaGrycukBornRadii";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
amoebaGpu->scalingDistanceCutoff );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
gpu->psBornRadii->Download();
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
gpu->psBornRadii->_pSysData[ii] );
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static unsigned int threadsPerBlock = 0;
......@@ -255,28 +243,7 @@ void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukParticle), gpu->sharedMemoryPerBlock ), maxThreads);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
(gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
......@@ -330,13 +297,7 @@ __device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRulePa
}
//#define AMOEBA_DEBUG
__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3]
#ifdef AMOEBA_DEBUG
, float4 pullDebug[5]
#endif
){
__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3] ){
const float pi = 3.1415926535897f;
float third = 1.0f/3.0f;
......@@ -390,18 +351,6 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
float dbr = term * de/r;
de = dbr*atomI.bornForce;
#ifdef AMOEBA_DEBUG
pullDebug[0].x = de;
pullDebug[0].y = r;
pullDebug[0].z = factor;
pullDebug[0].w = -4.0f;
pullDebug[1].x = atomI.bornForce/4.184f;
pullDebug[1].y = atomI.bornRadius;
pullDebug[1].z = atomJ.bornForce/4.184f;
pullDebug[1].w = -5.0f;
#endif
force[0] = xr*de;
force[1] = yr*de;
force[2] = zr*de;
......@@ -430,43 +379,12 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
{
#ifdef AMOEBA_DEBUG
static const char* methodName = "kCalculateGrycukGbsaForces2";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(20*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*20*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
gpu->psBornRadii->Download();
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
gpu->psBornRadii->_pSysData[ii] );
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static unsigned int threadsPerBlock = 0;
......@@ -480,64 +398,15 @@ void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukChainRuleParticle), gpu->sharedMemoryPerBlock ), maxThreads);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
(gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
//kClearForces( gpu );
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
,debugArray->_pDevData, targetAtom
#endif
);
kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
} else {
kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
,debugArray->_pDevData, targetAtom
#endif
);
kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
}
LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
debugArray->Download();
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d DebugGrycukChain\n", targetAtom, jj );
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
#endif
if( 0 ){
static int callId = 0;
gpuContext gpu = amoebaGpu->gpuContext;
......
......@@ -34,11 +34,7 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit ){
extern __shared__ GrycukChainRuleParticle sAChainRule[];
......@@ -49,10 +45,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
#ifdef AMOEBA_DEBUG
float4 pullDebug[5];
#endif
while (pos < end)
{
......@@ -85,11 +77,7 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
{
float localForce[3];
calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce);
if( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ){
localParticle.force[0] -= localForce[0];
......@@ -100,54 +88,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
psAChainRule[j].force[1] += localForce[1];
psAChainRule[j].force[2] += localForce[2];
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + j) : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = -1.0f;
debugArray[index].w = -1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = -2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = localForce[0];
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -12.0f;
calculateGrycukChainRulePairIxn_kernel( psAChainRule[j], localParticle, localForce , pullDebug );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = -13.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = localForce[0];
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -14.0f;
}
#endif
}
}
......@@ -181,11 +121,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
float localForce[3];
calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce );
localParticle.force[0] -= localForce[0];
localParticle.force[1] -= localForce[1];
......@@ -195,54 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
psAChainRule[tj].force[1] += localForce[1];
psAChainRule[tj].force[2] += localForce[2];
#ifdef AMOEBA_DEBUG
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
if( atomI == targetAtom || (y+tj) == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = -1.0f;
debugArray[index].w = -1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = -2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = localForce[0];
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -10.0f;
}
#endif
calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+tj) == targetAtom ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -11.0f;
}
#endif
calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce);
localParticle.force[0] += localForce[0];
localParticle.force[1] += localForce[1];
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
//#include "amoebaGpuTypes.h"
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define INCLUDE_TORQUE
#include "kCalculateAmoebaCudaKirkwoodParticle.h"
extern void kCalculateObcGbsaForces2(gpuContext gpu);
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -1638,26 +1657,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA )
// reduce psWorkArray_1_1 -> dBorn
// reduce psWorkArray_1_2 -> dBornPolar
#ifdef AMOEBA_DEBUG
static void kReduce_dBorn(amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
/*
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn1");
*/
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData, 0 );
LAUNCHERROR("kReduce_dBorn2");
}
#endif
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
......@@ -1940,40 +1939,6 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
amoebaGpu->gpuContext->psBornForce->_pDevData );
}
LAUNCHERROR("kReduceToBornForcePrefactor");
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
// kClearEnergy() should be called prior to kReduceToBornForcePrefactorAndSASA_kernel
double energy = kReduceEnergy( amoebaGpu->gpuContext );
(void) fprintf( amoebaGpu->log, "Born force w/ cavity energy=%15.7e.\n", energy ); (void) fflush( amoebaGpu->log );
/*
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
(void) fprintf( amoebaGpu->log,"bF %16.9e obc=%16.9e bR=%16.9e\n",
amoebaGpu->gpuContext->psBornForce->_pSysData[ii],
amoebaGpu->gpuContext->psObcData->_pSysData[ii].x,
amoebaGpu->gpuContext->psBornRadii->_pSysData[ii] );
}
(void) fflush( amoebaGpu->log );
*/
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( amoebaGpu->gpuContext->natoms, 3, amoebaGpu->gpuContext->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( amoebaGpu->gpuContext->natoms, 1, amoebaGpu->gpuContext->psBornRadii, outputVector, NULL, 1.0f );
cudaLoadCudaFloat2Array( amoebaGpu->gpuContext->natoms, 2, amoebaGpu->gpuContext->psObcData, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( amoebaGpu->gpuContext->natoms, 1, amoebaGpu->gpuContext->psBornForce, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaBornForce", fileId, outputVector );
}
}
#endif
#undef AMOEBA_DEBUG
}
/**---------------------------------------------------------------------------------------
......@@ -1988,44 +1953,11 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
{
#ifdef AMOEBA_DEBUG
static const char* methodName = "kCalculateAmoebaKirkwood";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
amoebaGpu->scalingDistanceCutoff );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
gpu->psBornRadii->Download();
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Kirkwood input\n" ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
gpu->psBornRadii->_pSysData[ii] );
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static unsigned int threadsPerBlock = 0;
......@@ -2040,47 +1972,17 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle), gpu->sharedMemoryPerBlock ), maxThreads);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwood: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
}
kClearFields_1( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
(gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
, debugArray->_pDevData, targetAtom );
#else
);
#endif
gpu->psWorkUnit->_pDevData);
} else {
kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
, debugArray->_pDevData, targetAtom );
#else
);
#endif
gpu->psWorkUnit->_pDevData);
}
LAUNCHERROR("kCalculateAmoebaCudaKirkwoodN2Forces");
......
......@@ -35,15 +35,7 @@ __launch_bounds__(128, 1)
__launch_bounds__(64, 1)
#endif
void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
unsigned int* workUnit
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4 pullBack[20];
#endif
unsigned int* workUnit){
extern __shared__ KirkwoodParticle sA[];
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#include "kCalculateAmoebaCudaKirkwoodParticle.h"
#include "kCalculateAmoebaCudaKirkwoodEDiffParticle.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
......@@ -865,19 +884,6 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
#endif
#ifdef AMOEBA_DEBUG
__device__ static int debugAccumulate( unsigned int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
{
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = addMask ? field[0] : 0.0f;
debugArray[index].y = addMask ? field[1] : 0.0f;
debugArray[index].z = addMask ? field[2] : 0.0f;
debugArray[index].w = idLabel;
return index;
}
#endif
__device__ void zeroKirkwoodEDiffParticleSharedField( struct KirkwoodEDiffParticle* sA )
{
// zero shared fields
......@@ -929,24 +935,12 @@ static void kReduceTorque( amoebaGpuContext amoebaGpu )
void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
static int timestep = 0;
timestep++;
#ifdef AMOEBA_DEBUG
static const char* methodName = "kCalculateAmoebaKirkwoodEDiff";
std::vector<int> fileId;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
//
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
......@@ -960,23 +954,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
" ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u step=%d\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock, timestep );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
} else {
kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
}
......
......@@ -35,8 +35,7 @@ __launch_bounds__(96, 1)
__launch_bounds__(32, 1)
#endif
void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
unsigned int* workUnit, float* outputTorque
){
unsigned int* workUnit, float* outputTorque){
extern __shared__ KirkwoodEDiffParticle sA[];
......
......@@ -1616,19 +1616,7 @@ void kCalculateAmoebaLocalForces_kernel()
void kCalculateAmoebaLocalForces(amoebaGpuContext gpu)
{
#ifdef AMOEBA_DEBUG
if( gpu->log ){
static int call = 1;
if( call == 0 ){
(void) fprintf( gpu->log,"kCalculateAmoebaLocalForces: blks=%u thrds/blk=%u\n",
gpu->gpuContext->sim.blocks, gpu->gpuContext->sim.localForces_threads_per_block); fflush( gpu->log );
call++;
}
}
#endif
kCalculateAmoebaLocalForces_kernel<<<gpu->gpuContext->sim.blocks, gpu->gpuContext->sim.localForces_threads_per_block>>>();
LAUNCHERROR("kCalculateAmoebaLocalForces");
}
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
#define BLOCK_SIZE 128
using namespace std;
static __constant__ cudaGmxSimulation cSim;
......@@ -469,41 +491,4 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
amoebaAddMapTorqueForceToForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( );
LAUNCHERROR("amoebaAddMapTorqueForceToForce");
}
#ifdef AMOEBA_DEBUG
if( 0 ){
VectorOfDoubleVectors outputVector;
std::vector<int> fileId;
static int call = 0;
fileId.push_back( call++ );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* temp = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "TempMapTorqueAndAddToForce2");
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
reduceAndCopyCUDAStreamFloat4( amoebaGpu->psTorqueMapForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, NULL, 1.0f/4.184f );
for( int pId = 0; pId < 5; pId++ ){
float sum[3] = { 0.0f, 0.0f, 0.0f };
(void) fprintf( stderr, "\n\nTorqueForceToForce for part=%d\n", pId );
for( int ii = 0; ii < amoebaGpu->amoebaSim.maxTorqueBufferIndex; ii++ ){
(void) fprintf( stderr, "%4d [%15.7e %15.7e %15.7e]\n", ii,
amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x,
amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y,
amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z );
sum[0] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x;
sum[1] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y;
sum[2] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z;
}
(void) fprintf( stderr, "TorqueForceToForce for partcle=%d [%15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e]\n", pId, sum[0], sum[1], sum[2], sum[0]/4.184f, sum[1]/4.184f, sum[2]/4.184f );
}
cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostatiPostTorqueForce", fileId, outputVector );
delete temp;
}
#endif
}
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
......@@ -33,21 +55,12 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef GK
__device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float fields[8][3]
#ifdef AMOEBA_DEBUG
, float4* debugArray
#endif
)
float fields[8][3] )
{
float deltaR[3];
......@@ -119,13 +132,7 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedPa
}
__device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float gkField[8][3]
#ifdef AMOEBA_DEBUG
, float4* debugArray
#endif
)
float gkField[8][3] )
{
float gux[5];
......@@ -213,20 +220,6 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced
}
#ifdef AMOEBA_DEBUG
__device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
{
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = addMask ? field[0] : 0.0f;
debugArray[index].y = addMask ? field[1] : 0.0f;
debugArray[index].z = addMask ? field[2] : 0.0f;
debugArray[index].w = idLabel;
return index;
}
#endif
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
......@@ -324,12 +317,6 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
epsilon[0] = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0];
epsilon[0] = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0];
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) cSim.atoms ) );
#ifdef AMOEBA_DEBUG
epsilon[1] = 48.033324f*sqrtf( delta[0].x/( (float) cSim.atoms ) );
epsilon[2] = 48.033324f*sqrtf( delta[0].y/( (float) cSim.atoms ) );
epsilon[3] = 48.033324f*sqrtf( delta[0].z/( (float) cSim.atoms ) );
epsilon[4] = 48.033324f*sqrtf( delta[0].w/( (float) cSim.atoms ) );
#endif
}
}
......@@ -462,20 +449,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
static int iteration = 1;
int targetAtom = 0;
static const char* methodName = "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply";
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
#endif
// clear output arrays
kClearFields_3( amoebaGpu, 4 );
......@@ -493,28 +466,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_3->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_4->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_4->_pDevData );
#endif
} else {
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
......@@ -522,120 +480,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_3->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_4->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_4->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaMutualInducedAndGkFields");
kReduceMutualInducedAndGkFields( amoebaGpu, outputArray, outputPolarArray, outputArrayS, outputPolarArrayS );
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
amoebaGpu->psWorkArray_3_3->Download();
amoebaGpu->psWorkArray_3_4->Download();
if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
outputArray->Download();
outputPolarArray->Download();
outputArrayS->Download();
outputPolarArrayS->Download();
debugArray->Download();
int maxPrint = 33;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// Mi
(void) fprintf( amoebaGpu->log," Mult[%16.9e %16.9e %16.9e] ",
outputArray->_pSysData[indexOffset],
outputArray->_pSysData[indexOffset+1],
outputArray->_pSysData[indexOffset+2] );
// Mi polar
(void) fprintf( amoebaGpu->log," MultP[%16.9e %16.9e %16.9e]\n",
outputPolarArray->_pSysData[indexOffset],
outputPolarArray->_pSysData[indexOffset+1],
outputPolarArray->_pSysData[indexOffset+2] );
// MiS
(void) fprintf( amoebaGpu->log," MultS[%16.9e %16.9e %16.9e] ",
outputArrayS->_pSysData[indexOffset],
outputArrayS->_pSysData[indexOffset+1],
outputArrayS->_pSysData[indexOffset+2] );
// Mi polarS
(void) fprintf( amoebaGpu->log,"MultPS[%16.9e %16.9e %16.9e]\n",
outputPolarArrayS->_pSysData[indexOffset],
outputPolarArrayS->_pSysData[indexOffset+1],
outputPolarArrayS->_pSysData[indexOffset+2] );
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
if( ii == targetAtom ){
(void) fprintf( amoebaGpu->log,"\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%4d %4d Rint [%16.9e %16.9e %16.9e %16.9e]\n",
ii, jj,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
for( int kk = 0; kk < 9; kk++ ){
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %5.1f]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
}
}
(void) fprintf( amoebaGpu->log,"\n" );
}
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
iteration++;
}
delete debugArray;
#endif
}
/**---------------------------------------------------------------------------------------
......@@ -649,22 +500,12 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
static int timestep = 0;
timestep++;
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaMutualInducedAndGkFieldBySOR";
std::vector<int> fileId;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
int done;
int iteration;
static int timestep = 0;
timestep++;
gpuContext gpu = amoebaGpu->gpuContext;
......@@ -685,49 +526,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldBySOR Initial setup for matrix multiply\n" ); fflush( amoebaGpu->log );
gpu->psPosq4->Download();
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
amoebaGpu->psInducedDipole->Download(),
amoebaGpu->psInducedDipolePolar->Download();
amoebaGpu->psInducedDipoleS->Download(),
amoebaGpu->psInducedDipolePolarS->Download();
amoebaGpu->psPolarizability->Download();
int offset = 0;
int maxPrint = 10;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "\n%4d pol=%12.4e\n", ii,
amoebaGpu->psPolarizability->_pSysData[offset] );
if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
(void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] );
}
(void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] Pos[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psE_Field->_pSysData[offset], amoebaGpu->psE_Field->_pSysData[offset+1], amoebaGpu->psE_Field->_pSysData[offset+2],
amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2],
gpu->psPosq4->_pSysData[ii].x, gpu->psPosq4->_pSysData[ii].y, gpu->psPosq4->_pSysData[ii].z );
(void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psE_FieldPolar->_pSysData[offset], amoebaGpu->psE_FieldPolar->_pSysData[offset+1], amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Gk[%14.6e %14.6e %14.6e] MiS[%14.6e %14.6e %14.6e] MipS[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psGk_Field->_pSysData[offset], amoebaGpu->psGk_Field->_pSysData[offset+1], amoebaGpu->psGk_Field->_pSysData[offset+2],
amoebaGpu->psInducedDipoleS->_pSysData[offset], amoebaGpu->psInducedDipoleS->_pSysData[offset+1], amoebaGpu->psInducedDipoleS->_pSysData[offset+2],
amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
offset += 3;
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii = (gpu->natoms - maxPrint);
}
(void) fflush( amoebaGpu->log );
}
#endif
// if polarization type is direct, set flags signalling done and return
if( amoebaGpu->amoebaSim.polarizationType )
......@@ -800,91 +598,11 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
done = 1;
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldBySOR step=%d iteration=%3d eps %14.6e done=%d\n",
timestep, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
}
if( amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download();
amoebaGpu->psInducedDipoleS->Download();
amoebaGpu->psInducedDipolePolarS->Download();
amoebaGpu->psWorkVector[2]->Download();
amoebaGpu->psWorkVector[3]->Download();
amoebaGpu->psGk_Field->Download();
#if 1
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
#else
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysData[0],
amoebaGpu->psCurrentEpsilon->_pSysData[1],
amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
#endif
(void) fflush( amoebaGpu->log );
int offset = 0;
int maxPrint = 20;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d ", ii );
(void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]",
amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log," MiS[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipoleS->_pSysData[offset], amoebaGpu->psInducedDipoleS->_pSysData[offset+1], amoebaGpu->psInducedDipoleS->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"MipS[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
/*
(void) fprintf( amoebaGpu->log,"Gk [%14.6e %14.6e %14.6e]\n",
amoebaGpu->psGk_Field->_pSysData[offset], amoebaGpu->psGk_Field->_pSysData[offset+1], amoebaGpu->psGk_Field->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"W2 [%14.6e %14.6e %14.6e] ",
amoebaGpu->psWorkVector[2]->_pSysData[offset], amoebaGpu->psWorkVector[2]->_pSysData[offset+1], amoebaGpu->psWorkVector[2]->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"W3 [%14.6e %14.6e %14.6e]\n",
amoebaGpu->psWorkVector[3]->_pSysData[offset], amoebaGpu->psWorkVector[3]->_pSysData[offset+1], amoebaGpu->psWorkVector[3]->_pSysData[offset+2] );
*/
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
ii = (gpu->natoms - maxPrint);
offset = 3*(ii+1);
} else {
offset += 3;
}
}
(void) fflush( amoebaGpu->log );
}
#endif
iteration++;
}
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){
trackMutualInducedIterations( amoebaGpu, iteration );
}
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipoleS, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolarS, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "Cuda_GK_MI", fileId, outputVector );
}
#endif
// ---------------------------------------------------------------------------------------
}
void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext amoebaGpu )
......
......@@ -39,11 +39,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
float* outputField,
float* outputFieldPolar,
float* outputFieldS,
float* outputFieldPolarS
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
float* outputFieldPolarS){
extern __shared__ MutualInducedParticle sA[];
......@@ -112,11 +108,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
// load coords, charge, ...
calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[j], ijField
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[j], ijField);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
......@@ -138,27 +130,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
fieldPolarSumS[1] += mask ? ijField[5][1] : 0.0f;
fieldPolarSumS[2] += mask ? ijField[5][2] : 0.0f;
//#ifdef AMOEBA_DEBUG
#if 0
unsigned int index = y + j;
if( atomI == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
index = debugAccumulate( index, debugArray, ijField[0], mask, 1.0f );
index = debugAccumulate( index, debugArray, ijField[1], mask, 2.0f );
index = debugAccumulate( index, debugArray, ijField[4], mask, 3.0f );
index = debugAccumulate( index, debugArray, ijField[5], mask, 4.0f );
}
#endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[j], ijField
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[j], ijField);
// atomI == atomJ contribution included
......@@ -171,24 +143,6 @@ if( atomI == targetAtom ){
fieldPolarSumS[1] += mask ? ijField[2][1] : 0.0f;
fieldPolarSumS[2] += mask ? ijField[2][2] : 0.0f;
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom ){
index = debugAccumulate( index, debugArray, ijField[0], mask, 5.0f );
index = debugAccumulate( index, debugArray, ijField[2], mask, 6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, 7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, 8.0f );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = 9.0f;
}
#endif
}
// Write results
......@@ -213,9 +167,8 @@ if( atomI == targetAtom ){
load3dArray( offset, fieldPolarSumS, outputFieldPolarS );
#endif
}
else // 100% utilization
{
} else {
// Read fixed atom data into registers and GRF
if (lasty != y)
{
......@@ -235,11 +188,7 @@ if( atomI == targetAtom ){
// load coords, charge, ...
calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[tj], ijField
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[tj], ijField);
if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
......@@ -289,29 +238,7 @@ if( atomI == targetAtom ){
}
//#ifdef AMOEBA_DEBUG
#if 0
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = bornRadii[atomI];
debugArray[index].w = jBornRadius;
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -1.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+1], maskD, -2.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+4], maskD, -3.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+5], maskD, -4.0f );
}
#endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[tj], ijField
#ifdef AMOEBA_DEBUG
, debugArray
#endif
);
calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[tj], ijField);
if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
......@@ -337,24 +264,6 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
psA[tj].fieldPolarS[2] += ijField[3][2];
}
//#ifdef AMOEBA_DEBUG
#if 0
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -5.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+2], maskD, -6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, -7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, -8.0f );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = -9.0f;
}
#endif
tj = (tj + 1) & (GRID - 1);
}
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
......@@ -33,19 +55,10 @@ void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
RTERROR(status, "GetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
__device__ void calculateMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float fields[4][3]
#ifdef AMOEBA_DEBUG
, float4* debugArray
#endif
)
float fields[4][3] )
{
float deltaR[3];
......@@ -252,20 +265,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
int targetAtom = 1231;
static const char* methodName = "cudaComputeAmoebaMutualInducedFieldMatrixMultiply";
static int iteration = 1;
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s\n", methodName );
(void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
#endif
kClearFields_3( amoebaGpu, 2 );
if( threadsPerBlock == 0 ){
......@@ -279,169 +278,23 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
kCalculateAmoebaMutualInducedFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaMutualInducedField");
kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
if( amoebaGpu->log && iteration == -1 ){
(void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
outputArray->Download();
outputPolarArray->Download();
debugArray->Download();
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// MI
(void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
outputArray->_pSysData[indexOffset],
outputArray->_pSysData[indexOffset+1],
outputArray->_pSysData[indexOffset+2] );
// MI polar
(void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e] ",
outputPolarArray->_pSysData[indexOffset],
outputPolarArray->_pSysData[indexOffset+1],
outputPolarArray->_pSysData[indexOffset+2] );
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysData[jj].x - gpu->psPosq4->_pSysData[ii].x;
float yy = gpu->psPosq4->_pSysData[jj].y - gpu->psPosq4->_pSysData[ii].y;
float zz = gpu->psPosq4->_pSysData[jj].z - gpu->psPosq4->_pSysData[ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
}
#endif
if( ii == targetAtom ){
float sums[4][3] = { { 0.0f, 0.0f, 0.0f },
{ 0.0f, 0.0f, 0.0f },
{ 0.0f, 0.0f, 0.0f },
{ 0.0f, 0.0f, 0.0f } };
(void) fprintf( amoebaGpu->log,"\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
unsigned int count = 0;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%4d %4d Pint [%16.9e %16.9e %16.9e %16.9e] ",
ii, jj,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
//debugIndex += gpu->natoms;
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
int index = 0;
sums[index][0] += debugArray->_pSysData[debugIndex].x;
sums[index][1] += debugArray->_pSysData[debugIndex].y;
sums[index][2] += debugArray->_pSysData[debugIndex].z;
if( count && ( (count % 31) == 0) ){
static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
(void) fprintf( amoebaGpu->log,"Block sum [%16.9e %16.9e %16.9e] ",
sums[index][0] - saveSum[0], sums[index][1] - saveSum[1], sums[index][2] - saveSum[2] );
saveSum[0] = sums[index][0];
saveSum[1] = sums[index][1];
saveSum[2] = sums[index][2];
}
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
index++;
sums[index][0] += debugArray->_pSysData[debugIndex].x;
sums[index][1] += debugArray->_pSysData[debugIndex].y;
sums[index][2] += debugArray->_pSysData[debugIndex].z;
if( count && ( (count % 31) == 0) ){
static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
(void) fprintf( amoebaGpu->log,"Block sumP [%16.9e %16.9e %16.9e] ",
sums[index][0] - saveSum[0], sums[index][1] - saveSum[1], sums[index][2] - saveSum[2] );
saveSum[0] = sums[index][0];
saveSum[1] = sums[index][1];
saveSum[2] = sums[index][2];
}
(void) fprintf( amoebaGpu->log,"\n" );
count++;
}
(void) fprintf( amoebaGpu->log,"\n" );
int index = 0;
(void) fprintf( amoebaGpu->log,"Sum1 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
(void) fprintf( amoebaGpu->log,"Sum2 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
(void) fprintf( amoebaGpu->log,"Sum3 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
(void) fprintf( amoebaGpu->log,"Sum4 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
}
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
iteration++;
}
delete debugArray;
#endif
}
/**---------------------------------------------------------------------------------------
......@@ -457,18 +310,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaMutualInducedFieldBySOR";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
int done;
int iteration;
......@@ -489,38 +330,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
amoebaGpu->psInducedDipole->Download(),
amoebaGpu->psInducedDipolePolar->Download();
amoebaGpu->psPolarizability->Download();
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
int offset = 0;
int maxPrint = 20000;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii,
amoebaGpu->psPolarizability->_pSysData[offset] );
if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
(void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] );
}
(void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psE_Field->_pSysData[offset], amoebaGpu->psE_Field->_pSysData[offset+1], amoebaGpu->psE_Field->_pSysData[offset+2],
amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psE_FieldPolar->_pSysData[offset], amoebaGpu->psE_FieldPolar->_pSysData[offset+1], amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
offset += 3;
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii = (gpu->natoms - maxPrint);
}
(void) fflush( amoebaGpu->log );
}
#endif
// if polarization type is direct, set flags signalling done and return
if( amoebaGpu->amoebaSim.polarizationType )
......@@ -558,12 +367,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReduceMutualInducedFieldDelta");
#ifdef AMOEBA_DEBUG
if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
trackMutualInducedIterations( amoebaGpu, iteration);
}
#endif
// Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0];
......@@ -572,54 +375,12 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){
done = 1;
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download();
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e done=%d\n",
methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
(void) fflush( amoebaGpu->log );
int offset = 0;
int maxPrint = 20;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d ", ii );
(void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
(void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
ii = (gpu->natoms - maxPrint);
offset = 3*(ii+1);
} else {
offset += 3;
}
}
(void) fflush( amoebaGpu->log );
}
#endif
iteration++;
}
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
#ifdef AMOEBA_DEBUG
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
// cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector, NULL, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector, NULL, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaMI", fileId, outputVector );
}
#endif
// ---------------------------------------------------------------------------------------
}
void cudaComputeAmoebaMutualInducedField( amoebaGpuContext amoebaGpu )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment