Commit 6bad9d44 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Removal of several arrays no longer needed

parent 1beac75d
...@@ -30,20 +30,10 @@ ...@@ -30,20 +30,10 @@
#include "kernels/gputypes.h" #include "kernels/gputypes.h"
#include "amoebaCudaTypes.h" #include "amoebaCudaTypes.h"
#define THREADS_PER_BLOCK 256
#include <map> #include <map>
typedef std::map<int,float> MapIntFloat; typedef std::map<int,float> MapIntFloat;
typedef MapIntFloat::const_iterator MapIntFloatCI; typedef MapIntFloat::const_iterator MapIntFloatCI;
/*
* Remove
* pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
* allocation of torqueMapForce psCovalentDegree psPolarizationDegree
*
THREADS_PER_BLOCK
*/
struct _amoebaGpuContext { struct _amoebaGpuContext {
_gpuContext* gpuContext; _gpuContext* gpuContext;
...@@ -112,7 +102,6 @@ struct _amoebaGpuContext { ...@@ -112,7 +102,6 @@ struct _amoebaGpuContext {
// multipole parameters // multipole parameters
CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType; CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
CUDAStream<int>* psMultipoleAxisOffset;
// buffer indices used for mapping torques onto forces // buffer indices used for mapping torques onto forces
...@@ -133,10 +122,10 @@ struct _amoebaGpuContext { ...@@ -133,10 +122,10 @@ struct _amoebaGpuContext {
CUDAStream<float2>* psDampingFactorAndThole; CUDAStream<float2>* psDampingFactorAndThole;
// slated for removal -- no longer used // used to setup scaling constants
CUDAStream<int>* psCovalentDegree; std::vector<int> covalentDegree;
CUDAStream<int>* psPolarizationDegree; std::vector<int> polarizationDegree;
// fixed-E field // fixed-E field
......
...@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel( ...@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel(
float* inducedDipolePolarS ) float* inducedDipolePolarS )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms )return; while( pos < 3*cSim.atoms )
{
fixedEField[threadId] *= polarizability[threadId];
inducedDipole[threadId] = fixedEField[threadId];
fixedEFieldPolar[threadId] *= polarizability[threadId];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId];
fixedGkField[threadId] *= polarizability[threadId]; fixedEField[pos] *= polarizability[pos];
inducedDipoleS[threadId] = fixedEField[threadId] + fixedGkField[threadId]; inducedDipole[pos] = fixedEField[pos];
inducedDipolePolarS[threadId] = fixedEFieldPolar[threadId] + fixedGkField[threadId];
fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipolePolar[pos] = fixedEFieldPolar[pos];
fixedGkField[pos] *= polarizability[pos];
inducedDipoleS[pos] = fixedEField[pos] + fixedGkField[pos];
inducedDipolePolarS[pos] = fixedEFieldPolar[pos] + fixedGkField[pos];
pos += blockDim.x*gridDim.x;
}
} }
...@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel( ...@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel(
{ {
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms)return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); float previousDipole = inducedDipole[pos];
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); float previousDipoleP = inducedDipoleP[pos];
inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
} }
__global__ __global__
...@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel( ...@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
{ {
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms)return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId]; float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[threadId]; float previousDipoleP = inducedDipoleP[pos];
inducedDipole[threadId] = fixedGkField[threadId] + fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId]; inducedDipole[pos] = fixedGkField[pos] + fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[threadId] = fixedGkField[threadId] + fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId]; inducedDipoleP[pos] = fixedGkField[pos] + fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole ); inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP ); inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
} }
// reduce psWorkArray_3_1 -> outputArray // reduce psWorkArray_3_1 -> outputArray
...@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu, ...@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
LAUNCHERROR("kReduceMutualInducedAndGkFields4"); LAUNCHERROR("kReduceMutualInducedAndGkFields4");
} }
#ifdef AMOEBA_DEBUG
#if 0
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(gpu->sim.paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
}
static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
(void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
#endif
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Compute mutual induce field Compute mutual induce field
...@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
amoebaGpu->psWorkArray_3_3->Download(); amoebaGpu->psWorkArray_3_3->Download();
amoebaGpu->psWorkArray_3_4->Download(); amoebaGpu->psWorkArray_3_4->Download();
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printMiFieldAtomBuffers( amoebaGpu, 100 );
//printMiFieldBuffer( amoebaGpu, 0 );
//printMiFieldBuffer( amoebaGpu, 1 );
//printMiFieldBuffer( amoebaGpu, 37 );
//printMiFieldBuffer( amoebaGpu, 38 );
if( amoebaGpu->log && iteration == 1 ){ if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
...@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
int iteration; int iteration;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log && timestep == 1 ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability // set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>( kInitializeMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psGk_Field->_pDevData, amoebaGpu->psGk_Field->_pDevData,
...@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe ...@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psPolarizability->_pDevData, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData ); amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1"); LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1");
kSorUpdateMutualInducedAndGkFieldS_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedAndGkFieldS_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psPolarizability->_pDevData, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData, amoebaGpu->psInducedDipolePolarS->_pDevData, amoebaGpu->psInducedDipoleS->_pDevData, amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
...@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel( ...@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel(
float* inducedDipolePolar ) float* inducedDipolePolar )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfAtoms )return; while( pos < 3*cSim.atoms )
{
fixedEField[threadId] *= polarizability[threadId]; fixedEField[pos] *= polarizability[pos];
inducedDipole[threadId] = fixedEField[threadId]; inducedDipole[pos] = fixedEField[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId]; fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId]; inducedDipolePolar[pos] = fixedEFieldPolar[pos];
pos += blockDim.x*gridDim.x;
}
} }
...@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel( ...@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel(
{ {
float polarSOR = 0.70f; float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfEntries )return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole ); float previousDipole = inducedDipole[pos];
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP ); float previousDipoleP = inducedDipoleP[pos];
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
} }
...@@ -469,29 +477,14 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -469,29 +477,14 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
int done; int done;
int iteration; int iteration;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability // set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>( kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, gpu->natoms,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
...@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData, gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
...@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel( ...@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel(
float* inducedDipolePolar ) float* inducedDipolePolar )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfAtoms )return; while( pos < 3*cSim.atoms )
{
fixedEField[threadId] *= polarizability[threadId]; fixedEField[pos] *= polarizability[pos];
inducedDipole[threadId] = fixedEField[threadId]; inducedDipole[pos] = fixedEField[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId]; fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId]; inducedDipolePolar[pos] = fixedEFieldPolar[pos];
pos += blockDim.x*gridDim.x;
}
} }
...@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel( ...@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel(
float* matrixProduct, float* matrixProductP ) float* matrixProduct, float* matrixProductP )
{ {
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfEntries )return; while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
// add self terms to fields
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
matrixProduct[threadId] += term*previousDipole;
matrixProductP[threadId] += term*previousDipoleP;
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
const float polarSOR = 0.70f; float previousDipole = inducedDipole[pos];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole ); float previousDipoleP = inducedDipoleP[pos];
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP );
// add self terms to fields
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
matrixProduct[pos] += term*previousDipole;
matrixProductP[pos] += term*previousDipoleP;
inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
const float polarSOR = 0.70f;
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole ); pos += blockDim.x*gridDim.x;
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP ); }
} }
...@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
int iteration; int iteration;
gpuContext gpu = amoebaGpu->gpuContext; gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability // set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>( kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, gpu->natoms,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
...@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba ...@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData, gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include <stdio.h>
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#define AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
void cudaReduceN2ToN( float *N2Array, int Nsz, float *NArray, int includeDiagonal, int offset )
{
int numThreads = min(THREADS_PER_BLOCK, (Nsz));
int numBlocksPerAtom = (Nsz / numThreads);
if( Nsz % numThreads ){
numBlocksPerAtom++;
}
int numBlocks = numBlocksPerAtom*Nsz;
float *partialSum1_d;
// allocate GPU memory
cudaMalloc( (void**) &partialSum1_d, numBlocks*offset*sizeof(float) );
if( includeDiagonal ){
if( offset == 3 ){
kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel1");
} else if( offset == 1 ){
kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel2");
}
} else {
if( offset == 3 ){
kCalculateAmoebaReduceExcludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel3");
} else if( offset == 1 ){
kCalculateAmoebaReduceExcludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel4");
}
}
int numBlocks2 = numBlocks;
numBlocks = numBlocks2*Nsz/numThreads;
if( (numBlocks2*Nsz) % numThreads ){
numBlocks++;
}
if( offset == 3 ){
kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal3");
} else if( offset == 1 ){
kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal1");
}
//Free memory
cudaFree(partialSum1_d);
}
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
typedef unsigned int uint;
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNBlockLevel)( float *N2Array, float *partialSum, int num,int numberOfBlocksPerAtom )
{
uint tid = threadIdx.x;
__shared__ float asx[THREADS_PER_BLOCK];
asx[tid] = 0.0f;
#ifdef AMOEBA_OFFSET_3
__shared__ float asy[THREADS_PER_BLOCK];
__shared__ float asz[THREADS_PER_BLOCK];
asx[tid] = 0.0f;
asy[tid] = asz[tid] = 0.0f;
int offset = 3;
#else
int offset = 1;
#endif
int atomI = blockIdx.x / numberOfBlocksPerAtom;
int atomJ = (blockIdx.x % numberOfBlocksPerAtom)*blockDim.x+tid;
#ifdef AMOEBA_INCLUDE_DIAGONAL
if( atomJ < num && atomI < num ){
#else
if( atomJ < num && atomJ != atomI ){
#endif
int index = offset*(atomI*num + atomJ);
asx[tid] = N2Array[index];
#ifdef AMOEBA_OFFSET_3
asy[tid] = N2Array[index+1];
asz[tid] = N2Array[index+2];
#endif
}
__syncthreads(); //to make sure all the elements are loaded
for( uint s = (blockDim.x)/2; s != 0; s >>= 1 ){
if( tid < s ){
asx[tid] += asx[tid+s];
#ifdef AMOEBA_OFFSET_3
asy[tid] += asy[tid+s];
asz[tid] += asz[tid+s];
#endif
}
__syncthreads();
}
if( tid == 0 ){
partialSum[blockIdx.x*offset] = asx[0];
#ifdef AMOEBA_OFFSET_3
partialSum[blockIdx.x*3+1] = asy[0];
partialSum[blockIdx.x*3+2] = asz[0];
#endif
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNFinal)( float *partialSum, float *final,int num,int numberOfBlocksPerAtom )
{
uint thread_id = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( thread_id > num )return;
float3 sum;
#ifdef AMOEBA_OFFSET_3
int offset = 3;
sum.x = sum.y = sum.z = 0.0f;
#else
int offset = 1;
sum.x = 0.0f;
#endif
int index = thread_id*offset*numberOfBlocksPerAtom;
for( int i=0; i < numberOfBlocksPerAtom; i++ ){
sum.x += partialSum[index + i*offset];
#ifdef AMOEBA_OFFSET_3
sum.y += partialSum[index + i*offset+1];
sum.z += partialSum[index + i*offset+2];
#endif
}
final[thread_id*offset ] = sum.x;
#ifdef AMOEBA_OFFSET_3
final[thread_id*3+1 ] = sum.y;
final[thread_id*3+2 ] = sum.z;
#endif
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment