Commit 761d7e17 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Removal of limitation for 'long-range in sequence' covalent bonds

Reduced memory footprint
parent 80c4976e
......@@ -118,7 +118,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
......@@ -162,7 +162,7 @@ if( atomI == targetAtom ){
// atomI == atomJ contribution included
mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
mask = ( (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
fieldSumS[0] += mask ? ijField[0][0] : 0.0f;
fieldSumS[1] += mask ? ijField[0][1] : 0.0f;
fieldSumS[2] += mask ? ijField[0][2] : 0.0f;
......@@ -181,7 +181,7 @@ if( atomI == targetAtom ){
index = debugAccumulate( index, debugArray, jDipoleS, 1, 7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, 8.0f );
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = 9.0f;
......@@ -195,7 +195,7 @@ if( atomI == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar );
......@@ -204,7 +204,7 @@ if( atomI == targetAtom ){
load3dArrayBufferPerWarp( offset, fieldPolarSumS, outputFieldPolarS );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
......@@ -241,7 +241,7 @@ if( atomI == targetAtom ){
#endif
);
if( (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
// add to field at atomI the field due atomJ's dipole
......@@ -294,7 +294,7 @@ if( atomI == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int maskD = (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms);
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
......@@ -314,7 +314,7 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
);
if( (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
fieldSumS[0] += ijField[0][0];
fieldSumS[1] += ijField[0][1];
......@@ -341,14 +341,14 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
#if 0
if( atomI == targetAtom || (y + tj) == targetAtom ){
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int maskD = (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms);
unsigned int maskD = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
index = debugAccumulate( index, debugArray, ijField[indexI], maskD, -5.0f );
index = debugAccumulate( index, debugArray, ijField[indexI+2], maskD, -6.0f );
index = debugAccumulate( index, debugArray, jDipoleS, 1, -7.0f );
index = debugAccumulate( index, debugArray, jDipolePolarS, 1, -8.0f );
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = bornRadii[atomI];
debugArray[index].y = jBornRadius;
debugArray[index].w = -9.0f;
......@@ -362,13 +362,13 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
load3dArrayBufferPerWarp( offset, fieldSumS, outputFieldS );
load3dArrayBufferPerWarp( offset, fieldPolarSumS, outputFieldPolarS );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field, outputField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
......@@ -376,13 +376,13 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolarS, outputFieldPolarS);
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
load3dArray( offset, fieldSumS, outputFieldS );
load3dArray( offset, fieldPolarSumS, outputFieldPolarS);
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].field, outputField );
load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
load3dArray( offset, sA[threadIdx.x].fieldS, outputFieldS );
......
......@@ -217,13 +217,14 @@ void kSorUpdateMutualInducedField_kernel(
static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceMI_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
LAUNCHERROR("kReduceMI_Fields2");
}
......@@ -277,14 +278,14 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......@@ -296,7 +297,7 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
} else {
kCalculateAmoebaMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
kCalculateAmoebaMutualInducedFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......
......@@ -105,7 +105,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
......@@ -126,20 +126,20 @@ if( atomI == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
//debugArray[index].z = cAmoebaSim.pDampingFactorAndThole[atomI].x;
debugArray[index].z = (float) cAmoebaSim.numberOfAtoms;
debugArray[index].z = (float) cSim.atoms;
debugArray[index].w = (float) (mask + 1);
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI][2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI+1][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI+1][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI+1][2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) 1.0f;
......@@ -150,12 +150,12 @@ if( atomI == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
......@@ -191,7 +191,7 @@ if( atomI == targetAtom ){
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
......@@ -229,17 +229,17 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
debugArray[index].z = cAmoebaSim.pDampingFactorAndThole[atomI].x;
debugArray[index].w = (float) (mask+1);
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI][2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijField[indexI+1][0] : 0.0f;
debugArray[index].y = mask ? ijField[indexI+1][1] : 0.0f;
debugArray[index].z = mask ? ijField[indexI+1][2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) -1.0f;
......@@ -253,21 +253,21 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field, outputField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].field, outputField );
load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
......
......@@ -3,6 +3,7 @@
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "bbsort.h"
......@@ -982,7 +983,7 @@ void kRecordInducedDipoleField_kernel(float* output, float* outputPolar)
}
}
extern void cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float4>* psOutputForce);
extern void cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpuContext gpu, CUDAStream<float>* psTorque);
/**
* Compute the potential and forces due to the reciprocal space PME calculation for fixed multipoles.
......@@ -1014,7 +1015,21 @@ void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu)
LAUNCHERROR("kRecordFixedMultipoleField");
kComputeFixedMultipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeFixedMultipoleForceAndEnergy");
cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
kReduceForces( gpu );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceOnlyFixed", fileId, outputVector );
kClearForces( gpu );
}
cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque);
}
/**
......@@ -1047,6 +1062,5 @@ void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu)
kComputeInducedDipoleForceAndEnergy_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kComputeInducedDipoleForceAndEnergy");
cudaComputeAmoebaMapTorquesAndAddTotalForce2(amoebaGpu, amoebaGpu->psTorque, gpu->psForce4);
LAUNCHERROR("cudaComputeAmoebaMapTorquesAndAddTotalForce2_kCalculateAmoebaPMEInducedDipoleForces");
cudaComputeAmoebaMapTorqueAndAddToForce(amoebaGpu, amoebaGpu->psTorque );
}
......@@ -3,6 +3,7 @@
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
......@@ -91,7 +92,7 @@ __device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectE
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
float4* debugArray, float4* pullBack )
{
unsigned int index = atomI + atomJ*cAmoebaSim.paddedNumberOfAtoms;
unsigned int index = atomI + atomJ*cSim.paddedNumberOfAtoms;
float blockId = 111.0f;
debugArray[index].x = (float) atomI;
......@@ -100,7 +101,7 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
debugArray[index].w = blockId;
for( int pullIndex = 0; pullIndex < 1; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
......@@ -1082,66 +1083,17 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"
// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque
// reduce psWorkArray_3_1 -> torque
static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData );
LAUNCHERROR("kReducePmeDirectElectrostaticForce");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
}
/**---------------------------------------------------------------------------------------
Zero gpu->psForce4
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void zeroForce( amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
gpu->psForce4->_pSysData[ii].x = 0.0f;
gpu->psForce4->_pSysData[ii].y = 0.0f;
gpu->psForce4->_pSysData[ii].z = 0.0f;
}
gpu->psForce4->Upload();
}
/**---------------------------------------------------------------------------------------
Copy gpu->psForce4 to amoebaGpu->psForce
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
static void kReduceTorque(amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
gpu->psForce4->Download();
int indexOffset = 0;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
amoebaGpu->psForce->_pSysData[indexOffset] = conversion*(gpu->psForce4->_pSysData[ii].x);
amoebaGpu->psForce->_pSysData[indexOffset+1] = conversion*(gpu->psForce4->_pSysData[ii].y);
amoebaGpu->psForce->_pSysData[indexOffset+2] = conversion*(gpu->psForce4->_pSysData[ii].z);
indexOffset += 3;
}
amoebaGpu->psForce->Upload();
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psTorque->_pDevData );
LAUNCHERROR("kReducePmeDirectElectrostaticTorque");
}
//#define GET_INDUCED_DIPOLE_FROM_FILE
#ifdef GET_INDUCED_DIPOLE_FROM_FILE
#include <stdlib.h>
#endif
/**---------------------------------------------------------------------------------------
Compute Amoeba dirrect space portion of electrostatic force & torque
......@@ -1153,10 +1105,6 @@ static void copyForce( amoebaGpuContext amoebaGpu, float conversion )
void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaPmeDirectElectrostatic";
static int timestep = 0;
......@@ -1186,42 +1134,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
unsigned int targetAtom = 49;
#endif
#ifdef GET_INDUCED_DIPOLE_FROM_FILE
//std::string fileName = "waterInducedDipole.txt";
std::string fileName = "water_3_MI.txt";
StringVectorVector fileContents;
readFile( fileName, fileContents );
unsigned int offset = 0;
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush( amoebaGpu->log );
}
for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
StringVector lineTokens = fileContents[ii];
unsigned int lineTokenIndex = 1;
// (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log );
amoebaGpu->psInducedDipole->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipole->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3;
amoebaGpu->psInducedDipolePolar->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psInducedDipolePolar->_pSysData[offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
}
float conversion = 0.1f;
for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
amoebaGpu->psInducedDipole->_pSysData[ii] *= conversion;
amoebaGpu->psInducedDipolePolar->_pSysData[ii] *= conversion;
}
amoebaGpu->gpuContext->sim.alphaEwald = 5.4459052e+00f;
SetCalculateAmoebaPmeDirectElectrostaticSim(amoebaGpu);
amoebaGpu->psInducedDipole->Upload();
amoebaGpu->psInducedDipolePolar->Upload();
#endif
// on first pass, set threads/block
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
......@@ -1233,16 +1148,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
}
kClearFields_3( amoebaGpu, 2 );
kClearFields_3( amoebaGpu, 1 );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)),
sizeof(PmeDirectElectrostaticParticle) );
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
(void) fflush( amoebaGpu->log );
......@@ -1251,136 +1162,30 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
} else {
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
debugArray->_pDevData, targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevData );
amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
}
LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
kReduceForceTorque( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psForce->Download();
amoebaGpu->psTorque->Download();
debugArray->Download();
int maxPrint = 5;
float conversion = 1.0f/41.84f;
float forceSum[3] = { 0.0f, 0.0f, 0.0f};
(void) fprintf( amoebaGpu->log, "Finished PmeDirectElectrostatic kernel execution conversion=%.5f\n", conversion ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticF [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psForce->_pSysData[indexOffset],
conversion*amoebaGpu->psForce->_pSysData[indexOffset+1],
conversion*amoebaGpu->psForce->_pSysData[indexOffset+2] );
forceSum[0] += amoebaGpu->psForce->_pSysData[indexOffset];
forceSum[1] += amoebaGpu->psForce->_pSysData[indexOffset+1];
forceSum[2] += amoebaGpu->psForce->_pSysData[indexOffset+2];
// torque
(void) fprintf( amoebaGpu->log,"PmeDirectElectrostaticT [%16.9e %16.9e %16.9e] ",
conversion*amoebaGpu->psTorque->_pSysData[indexOffset],
conversion*amoebaGpu->psTorque->_pSysData[indexOffset+1],
conversion*amoebaGpu->psTorque->_pSysData[indexOffset+2] );
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
(void) fflush( amoebaGpu->log );
if( 0 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
float torqueSum0[3] = { 0.0f, 0.0f, 0.0f};
float torqueSum1[3] = { 0.0f, 0.0f, 0.0f};
int offset0 = 7*paddedNumberOfAtoms;
int offset1 = 8*paddedNumberOfAtoms;
int offset2 = 7*paddedNumberOfAtoms;
int offset3 = 8*paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
if( fabs( debugArray->_pSysData[debugIndex+5*paddedNumberOfAtoms].y ) < 1.0e-10 )continue;
if( jj != targetAtom ){
torqueSum0[0] += debugArray->_pSysData[debugIndex+offset0].x + debugArray->_pSysData[debugIndex+offset1].x;
torqueSum0[1] += debugArray->_pSysData[debugIndex+offset0].y + debugArray->_pSysData[debugIndex+offset1].y;
torqueSum0[2] += debugArray->_pSysData[debugIndex+offset0].z + debugArray->_pSysData[debugIndex+offset1].z;
torqueSum1[0] += debugArray->_pSysData[debugIndex+offset2].x + debugArray->_pSysData[debugIndex+offset3].x;
torqueSum1[1] += debugArray->_pSysData[debugIndex+offset2].y + debugArray->_pSysData[debugIndex+offset3].y;
torqueSum1[2] += debugArray->_pSysData[debugIndex+offset2].z + debugArray->_pSysData[debugIndex+offset3].z;
}
if( jj == 2 ){
offset0 += 2*paddedNumberOfAtoms;
offset1 += 2*paddedNumberOfAtoms;
}
for( int kk = 0; kk < 12; kk++ ){
(void) fprintf( amoebaGpu->log,"%5d %5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj, kk,
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] Sum\n", targetAtom, jj,
torqueSum0[0], torqueSum0[1], torqueSum0[2], torqueSum1[0], torqueSum1[1], torqueSum1[2]);
(void) fprintf( amoebaGpu->log,"\n" );
}
}
(void) fflush( amoebaGpu->log );
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForceTorque", fileId, outputVector );
}
}
delete debugArray;
#endif
cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psForce, gpu->psForce4 );
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
copyForce( amoebaGpu, -1.0f/41.84f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectForce", fileId, outputVector );
}
kReduceTorque( amoebaGpu );
cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
}
......@@ -1391,100 +1196,9 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
extern double kReduceEnergy( gpuContext gpu);
void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
{
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
float conversion = -1.0f/41.84;
copyForce( amoebaGpu, conversion );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDemForce", fileId, outputVector );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float>* debugArray = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "FArray");
int index = 0;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
debugArray->_pSysData[index] = amoebaGpu->psForce->_pSysData[index];
debugArray->_pSysData[index+1] = amoebaGpu->psForce->_pSysData[index+1];
debugArray->_pSysData[index+2] = amoebaGpu->psForce->_pSysData[index+2];
index += 3;
}
//zeroForce( amoebaGpu );
double dem = kReduceEnergy( gpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
double dep = kReduceEnergy( gpu );
fprintf( stderr, "Recip Em=%15.7e ep=%15.7e ttl=%15.7e", dem/4.184, (dep-dem)/4.184, dep/4.184 );
copyForce( amoebaGpu, conversion );
VectorOfDoubleVectors outputVector1;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector1, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipForce", fileId, outputVector1 );
VectorOfDoubleVectors outputVector2;
index = 0;
for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
amoebaGpu->psForce->_pSysData[index] -= debugArray->_pSysData[index];
amoebaGpu->psForce->_pSysData[index+1] -= debugArray->_pSysData[index+1];
amoebaGpu->psForce->_pSysData[index+2] -= debugArray->_pSysData[index+2];
index += 3;
}
amoebaGpu->psForce->Upload();
outputVector.resize(0);
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector2, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeRecipDepForce", fileId, outputVector2 );
zeroForce( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
//zeroForce( amoebaGpu );
exit(0);
}
#endif
#ifdef AMOEBA_DEBUG
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
zeroForce( amoebaGpu );
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
copyForce( amoebaGpu, -1.0f/41.84f );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "yCudaPmeDirectForce", fileId, outputVector );
zeroForce( amoebaGpu );
}
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
zeroForce( amoebaGpu );
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
copyForce( amoebaGpu, -1.0f/41.84f );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeForce", fileId, outputVector );
}
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
copyForce( amoebaGpu, -1.0f/41.84f );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPrePmeForce", fileId, outputVector );
}
#endif
cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
}
......
......@@ -35,13 +35,12 @@ __launch_bounds__(128, 1)
__launch_bounds__(64, 1)
#endif
void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
unsigned int* workUnit, float* outputForce, float* outputTorque
unsigned int* workUnit, float* outputTorque
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
int maxPullIndex = 7;
float4 pullBack[12];
......@@ -103,7 +102,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
if (bExclusionFlag)
{
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
......@@ -136,7 +135,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
if( (atomI != atomJ) && (atomI < cAmoebaSim.numberOfAtoms) && (atomJ < cAmoebaSim.numberOfAtoms) )
if( (atomI != atomJ) && (atomI < cSim.atoms) && (atomJ < cSim.atoms) )
{
localParticle.force[0] += forceTorqueEnergy[0].x;
localParticle.force[1] += forceTorqueEnergy[0].y;
......@@ -154,7 +153,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI == atomJ) || (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
float blockId = 1.0f;
......@@ -163,28 +162,28 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray[index].z = (float) y;
debugArray[index].w = blockId;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[0].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[0].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[0].z : 0.0f;
debugArray[index].w = mask ? forceTorqueEnergy[0].w : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[1].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[1].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[1].z : 0.0f;
float offsetF = (float)(3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
float offsetF = (float)(3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[2].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[2].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[2].z : 0.0f;
debugArray[index].w = offsetF;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
......@@ -197,7 +196,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// include self energy and self torque
if( atomI < cAmoebaSim.numberOfAtoms ){
if( atomI < cSim.atoms ){
calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticle );
float energy;
calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticle, &energy );
......@@ -207,41 +206,13 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += localParticle.force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += localParticle.force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += localParticle.force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += localParticle.torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += localParticle.torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += localParticle.torque[2];
outputTorque[offset+2] = of;
unsigned int offset = (x + tgx + warp*cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
add3dArray( 3*offset, localParticle.torque, outputTorque );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0];
outputForce[offset+1] = localParticle.force[1];
outputForce[offset+2] = localParticle.force[2];
outputTorque[offset] = localParticle.torque[0];
outputTorque[offset+1] = localParticle.torque[1];
outputTorque[offset+2] = localParticle.torque[2];
unsigned int offset = (x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque );
#endif
} else {
......@@ -271,7 +242,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
{
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
......@@ -306,7 +277,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// check if atoms out-of-bounds
if( (atomI < cAmoebaSim.numberOfAtoms) && (atomJ < cAmoebaSim.numberOfAtoms) )
if( (atomI < cSim.atoms) && (atomJ < cSim.atoms) )
{
// add force and torque to atom I due atom J
......@@ -372,7 +343,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef AMOEBA_DEBUG
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
......@@ -381,29 +352,29 @@ if( atomI == targetAtom || atomJ == targetAtom ){
debugArray[index].z = (float) y;
debugArray[index].w = (flags == 0xFFFFFFFF) ? (float) -141.0f : -151.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[0].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[0].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[0].z : 0.0f;
debugArray[index].w = mask ? forceTorqueEnergy[0].w : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[1].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[1].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[1].z : 0.0f;
float offsetF = (float)(3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
float offsetF = (float)(3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceTorqueEnergy[2].x : 0.0f;
debugArray[index].y = mask ? forceTorqueEnergy[2].y : 0.0f;
debugArray[index].z = mask ? forceTorqueEnergy[2].z : 0.0f;
offsetF = (float) (3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms));
offsetF = (float) (3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms));
debugArray[index].w = offsetF;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
......@@ -419,78 +390,23 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += localParticle.force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += localParticle.force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += localParticle.force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += localParticle.torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += localParticle.torque[1];
outputTorque[offset+1] = of;
unsigned int offset = (x + tgx + warp*cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
add3dArray( 3*offset, localParticle.torque, outputTorque );
of = outputTorque[offset+2];
of += localParticle.torque[2];
outputTorque[offset+2] = of;
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += sA[threadIdx.x].force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += sA[threadIdx.x].force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += sA[threadIdx.x].force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += sA[threadIdx.x].torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += sA[threadIdx.x].torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += sA[threadIdx.x].torque[2];
outputTorque[offset+2] = of;
offset = (y + tgx + warp*cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
add3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0];
outputForce[offset+1] = localParticle.force[1];
outputForce[offset+2] = localParticle.force[2];
outputTorque[offset] = localParticle.torque[0];
outputTorque[offset+1] = localParticle.torque[1];
outputTorque[offset+2] = localParticle.torque[2];
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
load3dArray( 3*offset, localParticle.torque, outputTorque );
outputForce[offset] = sA[threadIdx.x].force[0];
outputForce[offset+1] = sA[threadIdx.x].force[1];
outputForce[offset+2] = sA[threadIdx.x].force[2];
offset = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
outputTorque[offset] = sA[threadIdx.x].torque[0];
outputTorque[offset+1] = sA[threadIdx.x].torque[1];
outputTorque[offset+2] = sA[threadIdx.x].torque[2];
add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4 );
load3dArray( 3*offset, sA[threadIdx.x].torque, outputTorque );
#endif
lasty = y;
......
......@@ -3,6 +3,7 @@
//-----------------------------------------------------------------------------------------
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
......@@ -135,17 +136,19 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
// E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self
kReducePmeEFieldPolar_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReducePmeEFieldPolar_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
LAUNCHERROR("kReducePmeE_Fields1");
// E_Field = E_Field (reciprocal) + E_Field (direct) + self
kReducePmeEField_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReducePmeEField_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
LAUNCHERROR("kReducePmeE_Fields2");
}
......@@ -362,7 +365,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
static int isNanOrInfinity( double number ){
return (number != number || number == std::numeric_limits<double>::infinity() || number == -std::numeric_limits<double>::infinity()) ? 1 : 0;
}
#endif
static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int length)
{
......@@ -390,7 +392,8 @@ static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int le
if(test==0) break; /*will exit if the list is sorted!*/
} /*end for i*/
}/*end bubbleSort*/
}
#endif
/**---------------------------------------------------------------------------------------
......@@ -438,7 +441,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
}
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......@@ -448,7 +451,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
} else {
kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......@@ -469,15 +472,15 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
(sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
(void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
(void) fflush( amoebaGpu->log );
/*
(void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2] paddedNumberOfAtoms=%d\n", amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers );
(void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2] paddedNumberOfAtoms=%d\n", gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers );
amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download();
for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
......@@ -624,33 +627,13 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
kReduceForces( gpu );
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
//exit(0);
cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceTorqueFixed", fileId, outputVector );
//cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
exit(0);
}
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
}
if( 0 ){
cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaDirectEField", fileId, outputVector );
}
}
......@@ -101,7 +101,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
if( bExclusionFlag ){
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
......@@ -127,7 +127,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
unsigned int match = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
unsigned int match = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
......@@ -151,37 +151,37 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = flag;
*/
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].z;
debugArray[index].y = match ? 0.0f : ijField[1].z;
debugArray[index].z = match ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 2.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].y;
debugArray[index].y = match ? 0.0f : ijField[1].y;
debugArray[index].z = match ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 3.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].w;
debugArray[index].y = match ? 0.0f : ijField[1].w;
debugArray[index].z = match ? 0.0f : ijField[2].w;
......@@ -189,7 +189,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
......@@ -204,11 +204,11 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
#endif
......@@ -235,7 +235,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
if( bExclusionFlag ) {
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
......@@ -258,7 +258,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
#endif
);
unsigned int outOfBounds = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
unsigned int outOfBounds = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
......@@ -331,44 +331,44 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
float flag = 9.0f;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = jIdx;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = (float) flags;
*/
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].x;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].x;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].x;
debugArray[index].w = flag + 1.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].y;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].y;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].y;
debugArray[index].w = flag + 2.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].z;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].z;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].z;
debugArray[index].w = flag + 3.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[0].w;
debugArray[index].y = outOfBounds ? 0.0f : ijField[1].w;
debugArray[index].z = outOfBounds ? 0.0f : ijField[2].w;
debugArray[index].w = flag + 4.0f;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
......@@ -384,20 +384,20 @@ if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].eField, outputEField );
load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
......
......@@ -354,13 +354,14 @@ static void kSorUpdateMutualInducedField_kernel(
static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReducePmeMI_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
LAUNCHERROR("kReducePmeMI_Fields2");
}
......@@ -410,21 +411,22 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
}
if (gpu->bOutputBufferPerWarp){
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Cutoff -- use warp\n" );
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
methodName, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
//gpu->sim.pInteractingWorkUnit,
//amoebaGpu->psWorkUnit->_pDevData,
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......@@ -436,17 +438,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
} else {
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Cutoff no warp\n" );
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
......@@ -596,7 +588,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
int offset = 0;
int maxPrint = 10;
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii,
amoebaGpu->psPolarizability->_pSysData[offset] );
......
......@@ -110,7 +110,7 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
......@@ -136,51 +136,51 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].w = 6.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].x;
debugArray[index].y = ijField[1].x;
debugArray[index].z = ijField[2].x;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[0].z;
debugArray[index].y = ijField[1].z;
debugArray[index].z = ijField[2].z;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[0].x;
debugArray[index].y = match ? 0.0f : ijField[1].x;
debugArray[index].z = match ? 0.0f : ijField[2].x;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
......@@ -199,12 +199,12 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
......@@ -245,7 +245,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
......@@ -323,39 +323,39 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
debugArray[index].w = 7.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
float flag = 7.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
......@@ -372,21 +372,21 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field, outputField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].field, outputField );
load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
......
......@@ -77,60 +77,41 @@ void kCudaComputeCheckChiral_kernel( void )
const int C = 3;
float delta[4][3];
float4* atomCoord = cSim.pPosq;
int4* multiPoleAtoms = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
float* molecularDipole = cAmoebaSim.pMolecularDipole;
float* molecularQuadrupole = cAmoebaSim.pMolecularQuadrupole;
float4* particleCoord = cSim.pPosq;
int4* multiPoleParticles = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
float* labFrameDipole = cAmoebaSim.pLabFrameDipole;
float* labFrameQuadrupole = cAmoebaSim.pLabFrameQuadrupole;
// ---------------------------------------------------------------------------------------
int atomIndex = blockIdx.x;
if( atomIndex >= cSim.atoms )return;
int axisType = multiPoleAtoms[atomIndex].w;
float* molDipole = &(molecularDipole[atomIndex*3]);
float* labDipole = &(labFrameDipole[atomIndex*3]);
labDipole[0] = molDipole[0];
labDipole[1] = molDipole[1];
labDipole[2] = molDipole[2];
float* molQuadrupole = &(molecularQuadrupole[atomIndex*9]);
float* labQuadrupole = &(labFrameQuadrupole[atomIndex*9]);
labQuadrupole[0] = molQuadrupole[0];
labQuadrupole[1] = molQuadrupole[1];
labQuadrupole[2] = molQuadrupole[2];
labQuadrupole[3] = molQuadrupole[3];
labQuadrupole[4] = molQuadrupole[4];
labQuadrupole[5] = molQuadrupole[5];
labQuadrupole[6] = molQuadrupole[6];
labQuadrupole[7] = molQuadrupole[7];
labQuadrupole[8] = molQuadrupole[8];
int particleIndex = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
int numberOfParticles = cSim.atoms;
while( particleIndex < numberOfParticles )
{
// skip z-then-x
if( axisType == 0 || multiPoleAtoms[atomIndex].y < 0 )return;
int axisType = multiPoleParticles[particleIndex].w;
if( axisType != 0 && multiPoleParticles[particleIndex].y >= 0 )
{
// ---------------------------------------------------------------------------------------
int atomA = atomIndex;
int atomB = multiPoleAtoms[atomIndex].z;
int atomC = multiPoleAtoms[atomIndex].x;
int atomD = multiPoleAtoms[atomIndex].y;
int particleA = particleIndex;
int particleB = multiPoleParticles[particleIndex].z;
int particleC = multiPoleParticles[particleIndex].x;
int particleD = multiPoleParticles[particleIndex].y;
delta[AD][0] = atomCoord[atomA].x - atomCoord[atomD].x;
delta[AD][1] = atomCoord[atomA].y - atomCoord[atomD].y;
delta[AD][2] = atomCoord[atomA].z - atomCoord[atomD].z;
delta[AD][0] = particleCoord[particleA].x - particleCoord[particleD].x;
delta[AD][1] = particleCoord[particleA].y - particleCoord[particleD].y;
delta[AD][2] = particleCoord[particleA].z - particleCoord[particleD].z;
delta[BD][0] = atomCoord[atomB].x - atomCoord[atomD].x;
delta[BD][1] = atomCoord[atomB].y - atomCoord[atomD].y;
delta[BD][2] = atomCoord[atomB].z - atomCoord[atomD].z;
delta[BD][0] = particleCoord[particleB].x - particleCoord[particleD].x;
delta[BD][1] = particleCoord[particleB].y - particleCoord[particleD].y;
delta[BD][2] = particleCoord[particleB].z - particleCoord[particleD].z;
delta[CD][0] = atomCoord[atomC].x - atomCoord[atomD].x;
delta[CD][1] = atomCoord[atomC].y - atomCoord[atomD].y;
delta[CD][2] = atomCoord[atomC].z - atomCoord[atomD].z;
delta[CD][0] = particleCoord[particleC].x - particleCoord[particleD].x;
delta[CD][1] = particleCoord[particleC].y - particleCoord[particleD].y;
delta[CD][2] = particleCoord[particleC].z - particleCoord[particleD].z;
delta[C][0] = delta[BD][1]*delta[CD][2] - delta[BD][2]*delta[CD][1];
delta[C][1] = delta[CD][1]*delta[AD][2] - delta[CD][2]*delta[AD][1];
......@@ -138,13 +119,16 @@ void kCudaComputeCheckChiral_kernel( void )
float volume = delta[C][0]*delta[AD][0] + delta[C][1]*delta[BD][0] + delta[C][2]*delta[CD][0];
if( volume < 0.0 ){
labDipole[1] *= -1.0f; // pole(3,i)
labQuadrupole[1] *= -1.0f; // pole(6,i) && pole(8,i)
labQuadrupole[3] *= -1.0f; // pole(10,i) && pole(12,i)
labQuadrupole[5] *= -1.0f; // pole(6,i) && pole(8,i)
labQuadrupole[7] *= -1.0f; // pole(10,i) && pole(12,i)
labFrameDipole[particleIndex*3+1] *= -1.0f; // pole(3,i)
labFrameQuadrupole[particleIndex*9+1] *= -1.0f; // pole(6,i) && pole(8,i)
labFrameQuadrupole[particleIndex*9+3] *= -1.0f; // pole(10,i) && pole(12,i)
labFrameQuadrupole[particleIndex*9+5] *= -1.0f; // pole(6,i) && pole(8,i)
labFrameQuadrupole[particleIndex*9+7] *= -1.0f; // pole(10,i) && pole(12,i)
}
}
particleIndex += gridDim.x*blockDim.x;
}
}
__global__
......@@ -162,18 +146,14 @@ void kCudaComputeLabFrameMoments_kernel( void )
float vectorY[3];
float vectorZ[3];
int numOfAtoms = cSim.atoms;
float4* atomCoord = cSim.pPosq;
int4* multiPoleAtoms = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
int particleIndex = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
int numberOfParticles = cSim.atoms;
float4* particleCoord = cSim.pPosq;
int4* multiPoleParticles = cAmoebaSim.pMultipoleParticlesIdsAndAxisType;
float* labFrameDipole = cAmoebaSim.pLabFrameDipole;
float* labFrameQuadrupole = cAmoebaSim.pLabFrameQuadrupole;
// ---------------------------------------------------------------------------------------
int atomIndex = blockIdx.x;
// ---------------------------------------------------------------------------------------
// get coordinates of this atom and the z & x axis atoms
// compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
// this atom and the axis atom
......@@ -182,24 +162,25 @@ void kCudaComputeLabFrameMoments_kernel( void )
// code common to ZThenX and Bisector
float4 coordinatesThisAtom = atomCoord[atomIndex];
while( particleIndex < numberOfParticles )
{
float4 coordinatesThisParticle = particleCoord[particleIndex];
int multipoleAtomIndex = multiPoleAtoms[atomIndex].z;
float4 coordinatesAxisAtom = atomCoord[multipoleAtomIndex];
int multipoleParticleIndex = multiPoleParticles[particleIndex].z;
float4 coordinatesAxisParticle = particleCoord[multipoleParticleIndex];
vectorZ[0] = coordinatesAxisAtom.x - coordinatesThisAtom.x;
vectorZ[1] = coordinatesAxisAtom.y - coordinatesThisAtom.y;
vectorZ[2] = coordinatesAxisAtom.z - coordinatesThisAtom.z;
vectorZ[0] = coordinatesAxisParticle.x - coordinatesThisParticle.x;
vectorZ[1] = coordinatesAxisParticle.y - coordinatesThisParticle.y;
vectorZ[2] = coordinatesAxisParticle.z - coordinatesThisParticle.z;
multipoleAtomIndex = multiPoleAtoms[atomIndex].x;
coordinatesAxisAtom = atomCoord[multipoleAtomIndex];
multipoleParticleIndex = multiPoleParticles[particleIndex].x;
coordinatesAxisParticle = particleCoord[multipoleParticleIndex];
vectorX[0] = coordinatesAxisAtom.x - coordinatesThisAtom.x;
vectorX[1] = coordinatesAxisAtom.y - coordinatesThisAtom.y;
vectorX[2] = coordinatesAxisAtom.z - coordinatesThisAtom.z;
int axisType = multiPoleAtoms[atomIndex].w;
vectorX[0] = coordinatesAxisParticle.x - coordinatesThisParticle.x;
vectorX[1] = coordinatesAxisParticle.y - coordinatesThisParticle.y;
vectorX[2] = coordinatesAxisParticle.z - coordinatesThisParticle.z;
int axisType = multiPoleParticles[particleIndex].w;
/*
z-only
......@@ -262,11 +243,11 @@ void kCudaComputeLabFrameMoments_kernel( void )
// z-bisect
multipoleAtomIndex = multiPoleAtoms[atomIndex].y;
coordinatesAxisAtom = atomCoord[multipoleAtomIndex];
vectorY[0] = coordinatesAxisAtom.x - coordinatesThisAtom.x;
vectorY[1] = coordinatesAxisAtom.y - coordinatesThisAtom.y;
vectorY[2] = coordinatesAxisAtom.z - coordinatesThisAtom.z;
multipoleParticleIndex = multiPoleParticles[particleIndex].y;
coordinatesAxisParticle = particleCoord[multipoleParticleIndex];
vectorY[0] = coordinatesAxisParticle.x - coordinatesThisParticle.x;
vectorY[1] = coordinatesAxisParticle.y - coordinatesThisParticle.y;
vectorY[2] = coordinatesAxisParticle.z - coordinatesThisParticle.z;
sum = normVector3( vectorY );
sum = normVector3( vectorX );
......@@ -327,71 +308,64 @@ void kCudaComputeLabFrameMoments_kernel( void )
}
float molDipole[3];
float* labDipole = &(labFrameDipole[atomIndex*3]);
molDipole[0] = labDipole[0];
molDipole[1] = labDipole[1];
molDipole[2] = labDipole[2];
molDipole[0] = labFrameDipole[particleIndex*3];
molDipole[1] = labFrameDipole[particleIndex*3+1];
molDipole[2] = labFrameDipole[particleIndex*3+2];
// set out-of-range elements to 0.0f
labDipole[0] = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0];
labDipole[1] = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1];
labDipole[2] = atomIndex >= numOfAtoms ? 0.0f : molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2];
labFrameDipole[particleIndex*3] = molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0];
labFrameDipole[particleIndex*3+1] = molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1];
labFrameDipole[particleIndex*3+2] = molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2];
// ---------------------------------------------------------------------------------------
float* rPole[3];
float mPole[3][3];
float* labQuadrupole = &(labFrameQuadrupole[atomIndex*9]);
unsigned int offset = particleIndex*9;
for( int ii = 0; ii < 3; ii++ ){
mPole[ii][0] = labQuadrupole[3*ii+0];
mPole[ii][1] = labQuadrupole[3*ii+1];
mPole[ii][2] = labQuadrupole[3*ii+2];
mPole[0][0] = labFrameQuadrupole[offset];
mPole[0][1] = labFrameQuadrupole[offset+1];
mPole[0][2] = labFrameQuadrupole[offset+2];
rPole[ii] = labQuadrupole + ii*3;
rPole[ii][0] = 0.0f;
rPole[ii][1] = 0.0f;
rPole[ii][2] = 0.0f;
mPole[1][0] = labFrameQuadrupole[offset+3];
mPole[1][1] = labFrameQuadrupole[offset+4];
mPole[1][2] = labFrameQuadrupole[offset+5];
}
mPole[2][0] = labFrameQuadrupole[offset+6];
mPole[2][1] = labFrameQuadrupole[offset+7];
mPole[2][2] = labFrameQuadrupole[offset+8];
int ii = threadIdx.x;
if( ii < 3 ){
for( int jj = ii; jj < 3; jj++ ){
labFrameQuadrupole[offset+8] = vectorX[2]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
labFrameQuadrupole[offset+8] += vectorY[2]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
labFrameQuadrupole[offset+8] += vectorZ[2]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
rPole[ii][jj] += vectorX[ii]*vectorX[jj]*mPole[0][0];
rPole[ii][jj] += vectorX[ii]*vectorY[jj]*mPole[0][1];
rPole[ii][jj] += vectorX[ii]*vectorZ[jj]*mPole[0][2];
labFrameQuadrupole[offset+4] = vectorX[1]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
labFrameQuadrupole[offset+4] += vectorY[1]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
labFrameQuadrupole[offset+4] += vectorZ[1]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);
rPole[ii][jj] += vectorY[ii]*vectorX[jj]*mPole[1][0];
rPole[ii][jj] += vectorY[ii]*vectorY[jj]*mPole[1][1];
rPole[ii][jj] += vectorY[ii]*vectorZ[jj]*mPole[1][2];
labFrameQuadrupole[offset+5] = vectorX[1]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
labFrameQuadrupole[offset+5] += vectorY[1]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
labFrameQuadrupole[offset+5] += vectorZ[1]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
rPole[ii][jj] += vectorZ[ii]*vectorX[jj]*mPole[2][0];
rPole[ii][jj] += vectorZ[ii]*vectorY[jj]*mPole[2][1];
rPole[ii][jj] += vectorZ[ii]*vectorZ[jj]*mPole[2][2];
}
}
labFrameQuadrupole[offset] = vectorX[0]*(vectorX[0]*mPole[0][0] + vectorY[0]*mPole[0][1] + vectorZ[0]*mPole[0][2]);
labFrameQuadrupole[offset] += vectorY[0]*(vectorX[0]*mPole[1][0] + vectorY[0]*mPole[1][1] + vectorZ[0]*mPole[1][2]);
labFrameQuadrupole[offset] += vectorZ[0]*(vectorX[0]*mPole[2][0] + vectorY[0]*mPole[2][1] + vectorZ[0]*mPole[2][2]);
__syncthreads();
labFrameQuadrupole[offset+1] = vectorX[0]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
labFrameQuadrupole[offset+1] += vectorY[0]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
labFrameQuadrupole[offset+1] += vectorZ[0]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);
labFrameQuadrupole[offset+2] = vectorX[0]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
labFrameQuadrupole[offset+2] += vectorY[0]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
labFrameQuadrupole[offset+2] += vectorZ[0]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
rPole[1][0] = rPole[0][1];
rPole[2][0] = rPole[0][2];
rPole[2][1] = rPole[1][2];
labFrameQuadrupole[offset+3] = labFrameQuadrupole[offset+1];
labFrameQuadrupole[offset+6] = labFrameQuadrupole[offset+2];
labFrameQuadrupole[offset+7] = labFrameQuadrupole[offset+5];
// set out-of-range elements to 0.0f
particleIndex += gridDim.x*blockDim.x;
}
labQuadrupole[0] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[0];
labQuadrupole[1] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[1];
labQuadrupole[2] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[2];
labQuadrupole[3] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[3];
labQuadrupole[4] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[4];
labQuadrupole[5] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[5];
labQuadrupole[6] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[6];
labQuadrupole[7] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[7];
labQuadrupole[8] = atomIndex >= numOfAtoms ? 0.0f : labQuadrupole[8];
}
void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
......@@ -405,8 +379,8 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
gpuContext gpu = amoebaGpu->gpuContext;
int numBlocks = amoebaGpu->paddedNumberOfAtoms;
int numThreads = 20;
int numBlocks = gpu->sim.blocks;
int numThreads = gpu->sim.update_threads_per_block;
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
......@@ -434,100 +408,19 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
double kernelTime = 0.0;
#endif
// copy molecular moments to lab frame moment arrays
// check if chiral center requires moments to have sign flipped
// compute lab frame moments
cudaMemcpy( amoebaGpu->psLabFrameDipole->_pDevData, amoebaGpu->psMolecularDipole->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
cudaMemcpy( amoebaGpu->psLabFrameQuadrupole->_pDevData, amoebaGpu->psMolecularQuadrupole->_pDevData, 9*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
kCudaComputeCheckChiral_kernel<<< numBlocks, numThreads>>> ( );
LAUNCHERROR("kCudaComputeCheckChiral");
kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( );
LAUNCHERROR(methodName);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
static int timestep = 0;
timestep++;
(void) fprintf( amoebaGpu->log, "Finished rotation kernel execution in %lf us\n", kernelTime ); (void) fflush( amoebaGpu->log );
(void) fflush( amoebaGpu->log );
amoebaGpu->psLabFrameDipole->Download();
(void) fprintf( amoebaGpu->log, "psLabFrameDipole completed\n" ); (void) fflush( amoebaGpu->log );
amoebaGpu->psLabFrameQuadrupole->Download();
(void) fprintf( amoebaGpu->log, "psLabFrameQpole completed\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 20;
for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){
int dipoleOffset = 3*ii;
int quadrupoleOffset = 9*ii;
(void) fprintf( amoebaGpu->log,"\n%6d [%6d %6d %6d] ", ii,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w );
// coords
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e]\n",
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z);
/*
(void) fprintf( amoebaGpu->log," R[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+1],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+2],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+3],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+4],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+5],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+6],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+7],
amoebaGpu->psRotationMatrix->_pSysData[quadrupoleOffset+8] );
*/
// dipole
(void) fprintf( amoebaGpu->log," D[%16.9e %16.9e %16.9e]\n",
amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset],
amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+1],
amoebaGpu->psLabFrameDipole->_pSysData[dipoleOffset+2] );
// quadrupole
(void) fprintf( amoebaGpu->log," Q[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n",
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+1],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+2],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+3],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+4],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+5],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+6],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+7],
amoebaGpu->psLabFrameQuadrupole->_pSysData[quadrupoleOffset+8] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint)) ){
ii = gpu->natoms - maxPrint;
}
}
int nansDetected = checkForNansAndInfinities( amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->psLabFrameDipole );
nansDetected += checkForNansAndInfinities( amoebaGpu->paddedNumberOfAtoms*9, amoebaGpu->psLabFrameQuadrupole );
if( nansDetected ){
(void) fprintf( amoebaGpu->log,"Nans detected in dipole/quadrupoles.\n" );
exit(0);
}
(void) fflush( amoebaGpu->log );
}
#endif
if( 0 ){
int particles = amoebaGpu->paddedNumberOfAtoms;
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( particles, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( particles, 3, amoebaGpu->psLabFrameDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloatArray( particles, 9, amoebaGpu->psLabFrameQuadrupole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "CudaRotatedMoments", fileId, outputVector );
}
}
void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood )
......@@ -536,21 +429,9 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
// compute lab frame moments
if( 0 ){
static int iteration = 0;
gpuContext gpu = amoebaGpu->gpuContext;
checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, ++iteration, "MultipoleForcesPreLabCoord", stderr );
}
if( 0 ){
static int iteration = 0;
gpuContext gpu = amoebaGpu->gpuContext;
checkForNansFloat4( gpu->natoms, gpu->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "MultipoleForcesPreForce", stderr );
}
cudaComputeAmoebaLabFrameMoments( amoebaGpu );
if( 0 ){
if( 1 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
//fileId.push_back( 0 );
......@@ -567,10 +448,14 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaFixedEAndGkFields( amoebaGpu );
cudaComputeAmoebaMutualInducedAndGkField( amoebaGpu );
} else {
if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
cudaComputeAmoebaFixedEField( amoebaGpu );
cudaComputeAmoebaMutualInducedField( amoebaGpu );
} else {
gpuContext gpu = amoebaGpu->gpuContext;
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
......@@ -581,6 +466,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
cudaComputeAmoebaPmeFixedEField( amoebaGpu );
cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
}
......@@ -597,33 +483,10 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
// calculate electrostatic forces
if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
cudaComputeAmoebaElectrostatic( amoebaGpu );
// map torques to forces
cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpu, amoebaGpu->psTorque, amoebaGpu->psForce, amoebaGpu->gpuContext->psForce4 );
if( 0 ){
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaLoadCudaFloat4Array( gpu->natoms, 3, amoebaGpu->gpuContext->psForce4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184 );
cudaWriteVectorOfDoubleVectorsToFile( "CudaMpole", fileId, outputVector );
}
} else {
cudaComputeAmoebaPmeElectrostatic( amoebaGpu );
}
if( 0 ){
static int iteration = 0;
gpuContext gpu = amoebaGpu->gpuContext;
checkForNansFloat4( gpu->natoms, gpu->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "MultipoleForcesPstForce", stderr );
}
}
#undef AMOEBA_DEBUG
......@@ -117,23 +117,27 @@ void kClearFields_kernel( unsigned int bufferLength, float* EField )
void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
{
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData );
gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_1->_pDevData );
LAUNCHERROR("kClearFields_3_1");
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData );
if( numberToClear > 1 ){
kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_2->_pDevData );
LAUNCHERROR("kClearFields_3_2");
} else {
return;
}
if( numberToClear > 2 ){
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData );
LAUNCHERROR("kClearFields_3_3");
} else {
return;
}
if( numberToClear > 3 ){
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*3*amoebaGpu->outputBuffers,
kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_4->_pDevData );
LAUNCHERROR("kClearFields_3_4");
}
......@@ -144,11 +148,12 @@ void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
void kClearFields_1( amoebaGpuContext amoebaGpu )
{
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData );
LAUNCHERROR("kClearFields_1_1");
kClearFields_kernel<<<amoebaGpu->nonbondBlocks, 384>>>( amoebaGpu->paddedNumberOfAtoms*amoebaGpu->outputBuffers,
kClearFields_kernel<<<gpu->sim.nonbond_blocks, 384>>>( gpu->sim.paddedNumberOfAtoms*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_2->_pDevData );
LAUNCHERROR("kClearFields_1_2");
}
......
......@@ -449,8 +449,9 @@ static void kCalculateAmoebaVdw14_7NonReduction(amoebaGpuContext amoebaGpu, CUDA
static void kReduceVdw14_7(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceVdw14_7");
}
......@@ -545,13 +546,13 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
(void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
gpu->sim.nonbond_blocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
if( 0 ){
gpu->psInteractionCount->Download();
amoebaGpu->psVdwWorkUnit->Download();
unsigned int totalWarps = (amoebaGpu->nonbondBlocks*threadsPerBlock)/GRID;
unsigned int totalWarps = (gpu->sim.nonbond_blocks*threadsPerBlock)/GRID;
float ratiof = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
(void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysData[0] );
for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
......@@ -610,7 +611,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
......@@ -624,7 +625,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#endif
} else {
kCalculateAmoebaVdw14_7Cutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
kCalculateAmoebaVdw14_7Cutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
......@@ -644,7 +645,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
amoebaGpu->psVdwWorkUnit->_pDevData,
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
......@@ -658,7 +659,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
#endif
} else {
kCalculateAmoebaVdw14_7N2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
kCalculateAmoebaVdw14_7N2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
amoebaGpu->psVdwWorkUnit->_pDevData,
amoebaGpu->psAmoebaVdwCoordinates->_pDevData,
amoebaGpu->psVdwSigmaEpsilon->_pDevData,
......@@ -707,7 +708,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
amoebaGpu->psWorkArray_3_1->Download();
//for( int jj = 0; jj < 3*gpu->natoms; jj += 3 )
for( int jj = 0; jj < 3*gpu->natoms; jj += 3 ){
for( int kk = 0; kk < amoebaGpu->outputBuffers; kk++ ){
for( int kk = 0; kk < gpu->sim.outputBuffers; kk++ ){
float delta = fabs(amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] + 1.0f);
if( delta < 5.0e-06 || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj] ) || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] ) )
(void) fprintf( amoebaGpu->log,"%6d %6d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", jj, kk,
......
......@@ -95,7 +95,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
if( bExclusionFlag ){
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
exclusionIndex = cAmoebaSim.pVdwExclusionIndicesIndex[cell]+tgx;
exclusionMask = cAmoebaSim.pVdwExclusionIndices[exclusionIndex];
}
......@@ -136,7 +136,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
);
// mask out excluded ixns
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
if( mask && bExclusionFlag ){
unsigned int maskIndex = 1 << j;
mask = (exclusionMask & maskIndex) ? 0 : 1;
......@@ -158,25 +158,25 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray[index].z = -1.0f;
debugArray[index].w = (float) (mask + 1);
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = energy;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijForce[0] : 0.0f;
debugArray[index].y = mask ? ijForce[1] : 0.0f;
debugArray[index].z = mask ? ijForce[2] : 0.0f;
......@@ -189,11 +189,11 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, forceSum, outputForce );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, forceSum, outputForce );
#endif
......@@ -268,7 +268,7 @@ flags = 0xFFFFFFFF;
// mask out excluded ixns
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
unsigned int mask = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 0 : 1;
if( mask && bExclusionFlag ){
unsigned int maskIndex = 1 << jIdx;
mask = (exclusionMask & maskIndex) ? 0 : 1;
......@@ -337,25 +337,25 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
debugArray[index].z = -3.0;
debugArray[index].w = (float) (mask + 1);
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = energy;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? ijForce[0] : 0.0f;
debugArray[index].y = mask ? ijForce[1] : 0.0f;
debugArray[index].z = mask ? ijForce[2] : 0.0f;
......@@ -375,17 +375,17 @@ if( atomI == targetAtom || (y+jIdx) == targetAtom ){
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, forceSum, outputForce );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, outputForce );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, forceSum, outputForce );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].force, outputForce );
#endif
......
......@@ -357,8 +357,9 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f
static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceWcaDispersion");
}
......@@ -367,8 +368,9 @@ static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>*
static void kReduceWcaDispersionToFloat4(amoebaGpuContext amoebaGpu, CUDAStream<float4>* outputArray )
{
kReduceFieldsToFloat4_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFieldsToFloat4_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
LAUNCHERROR("kReduceWcaDispersion");
}
......@@ -417,7 +419,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
methodName, amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
methodName, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(WcaDispersionParticle), sizeof(WcaDispersionParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
......@@ -425,7 +427,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
......@@ -438,7 +440,7 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
} else {
kCalculateAmoebaWcaDispersionN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
kCalculateAmoebaWcaDispersionN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevData,
gpu->psPosq4->_pDevData,
amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
......
......@@ -137,43 +137,43 @@ unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
//debugArray[index].z = (float) cAmoebaSim.numberOfAtoms;
//debugArray[index].z = (float) cSim.atoms;
debugArray[index].z = atomI == (y+tj) ? 0.0f : energy;
energy = ( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ) ? (energy) : 0.0f;
energy = ( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ) ? (energy) : 0.0f;
debugArray[index].w = energy+totalEnergy;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = iCoord.x;
debugArray[index].y = iCoord.y;
debugArray[index].z = iCoord.z;
debugArray[index].w = (float) (blockIdx.x * blockDim.x + threadIdx.x);
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = jCoord.x;
debugArray[index].y = jCoord.y;
debugArray[index].z = jCoord.z;
debugArray[index].w = -4.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = emixo;
debugArray[index].y = emixh;
debugArray[index].z = rmixo;
debugArray[index].w = rmixh;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[2].x;
debugArray[index].y = pullDebug[2].y;
debugArray[index].z = pullDebug[2].z;
......@@ -184,7 +184,7 @@ debugArray[index].w = pullDebug[2].w;
// energy = 0.0f;
}
#endif
if( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
if( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
// add to field at atomI the field due atomJ's dipole
......@@ -219,34 +219,34 @@ debugArray[index].w = pullDebug[2].w;
if( (atomI == targetAtom) || ( (y+tj) == targetAtom ) ){
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
index += 2*cAmoebaSim.paddedNumberOfAtoms;
index += 2*cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = atomI == (y+tj) ? 0.0f : energy;
energy = ( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ) ? (energy) : 0.0f;
energy = ( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ) ? (energy) : 0.0f;
debugArray[index].w = energy+totalEnergy;
//debugArray[index].w = -2.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = emjxo;
debugArray[index].y = emjxh;
debugArray[index].z = rmjxo;
debugArray[index].w = rmjxh;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cAmoebaSim.paddedNumberOfAtoms;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[2].x;
debugArray[index].y = pullDebug[2].y;
debugArray[index].z = pullDebug[2].z;
......@@ -257,7 +257,7 @@ debugArray[index].w = pullDebug[2].w;
//energy = 0.0f;
}
#endif
if( (atomI != (y+tj)) && (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
if( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
// add to field at atomI the field due atomJ's dipole
......@@ -281,18 +281,18 @@ debugArray[index].w = pullDebug[2].w;
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, forceSum, outputForce );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force, outputForce );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, forceSum, outputForce );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].force, outputForce );
#endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment