Commit 6bad9d44 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Removal of several arrays no longer needed

parent 1beac75d
......@@ -30,20 +30,10 @@
#include "kernels/gputypes.h"
#include "amoebaCudaTypes.h"
#define THREADS_PER_BLOCK 256
#include <map>
typedef std::map<int,float> MapIntFloat;
typedef MapIntFloat::const_iterator MapIntFloatCI;
/*
* Remove
* pMapArray, dMapArray, paddedNumberOfAtoms, nonbondBlocks, nonbondThreadsPerBlock, nonbondOutputBuffers
* allocation of torqueMapForce psCovalentDegree psPolarizationDegree
*
THREADS_PER_BLOCK
*/
struct _amoebaGpuContext {
_gpuContext* gpuContext;
......@@ -112,7 +102,6 @@ struct _amoebaGpuContext {
// multipole parameters
CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
CUDAStream<int>* psMultipoleAxisOffset;
// buffer indices used for mapping torques onto forces
......@@ -133,10 +122,10 @@ struct _amoebaGpuContext {
CUDAStream<float2>* psDampingFactorAndThole;
// slated for removal -- no longer used
// used to setup scaling constants
CUDAStream<int>* psCovalentDegree;
CUDAStream<int>* psPolarizationDegree;
std::vector<int> covalentDegree;
std::vector<int> polarizationDegree;
// fixed-E field
......
......@@ -255,18 +255,22 @@ void kInitializeMutualInducedAndGkField_kernel(
float* inducedDipolePolarS )
{
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms )return;
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
inducedDipole[pos] = fixedEField[pos];
fixedEField[threadId] *= polarizability[threadId];
inducedDipole[threadId] = fixedEField[threadId];
fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipolePolar[pos] = fixedEFieldPolar[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId];
fixedGkField[pos] *= polarizability[pos];
inducedDipoleS[pos] = fixedEField[pos] + fixedGkField[pos];
inducedDipolePolarS[pos] = fixedEFieldPolar[pos] + fixedGkField[pos];
fixedGkField[threadId] *= polarizability[threadId];
inducedDipoleS[threadId] = fixedEField[threadId] + fixedGkField[threadId];
inducedDipolePolarS[threadId] = fixedEFieldPolar[threadId] + fixedGkField[threadId];
pos += blockDim.x*gridDim.x;
}
}
......@@ -355,21 +359,24 @@ void kSorUpdateMutualInducedAndGkField_kernel(
{
float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms)return;
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[pos];
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP );
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
}
__global__
......@@ -389,21 +396,23 @@ void kSorUpdateMutualInducedAndGkFieldS_kernel(
{
float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*cSim.atoms)return;
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[pos];
inducedDipole[threadId] = fixedGkField[threadId] + fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedGkField[threadId] + fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
inducedDipole[pos] = fixedGkField[pos] + fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedGkField[pos] + fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP );
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
}
// reduce psWorkArray_3_1 -> outputArray
......@@ -437,46 +446,6 @@ static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
LAUNCHERROR("kReduceMutualInducedAndGkFields4");
}
#ifdef AMOEBA_DEBUG
#if 0
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(gpu->sim.paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[ii],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
amoebaGpu->psWorkArray_3_2->_pSysData[ii],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
}
}
static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
(void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
}
}
#endif
#endif
/**---------------------------------------------------------------------------------------
Compute mutual induce field
......@@ -576,14 +545,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
amoebaGpu->psWorkArray_3_3->Download();
amoebaGpu->psWorkArray_3_4->Download();
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printMiFieldAtomBuffers( amoebaGpu, 100 );
//printMiFieldBuffer( amoebaGpu, 0 );
//printMiFieldBuffer( amoebaGpu, 1 );
//printMiFieldBuffer( amoebaGpu, 37 );
//printMiFieldBuffer( amoebaGpu, 38 );
if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
......@@ -711,28 +672,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
int iteration;
gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log && timestep == 1 ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
kInitializeMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psGk_Field->_pDevData,
......@@ -812,14 +758,14 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
// post matrix multiply
kSorUpdateMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
kSorUpdateMutualInducedAndGkField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1");
kSorUpdateMutualInducedAndGkFieldS_kernel<<< numBlocks, numThreads >>>(
kSorUpdateMutualInducedAndGkFieldS_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipoleS->_pDevData, amoebaGpu->psInducedDipolePolarS->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
......@@ -120,14 +120,18 @@ void kInitializeMutualInducedField_kernel(
float* inducedDipolePolar )
{
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfAtoms )return;
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
inducedDipole[pos] = fixedEField[pos];
fixedEField[threadId] *= polarizability[threadId];
inducedDipole[threadId] = fixedEField[threadId];
fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipolePolar[pos] = fixedEFieldPolar[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId];
pos += blockDim.x*gridDim.x;
}
}
......@@ -195,20 +199,24 @@ void kSorUpdateMutualInducedField_kernel(
{
float polarSOR = 0.70f;
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfEntries )return;
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[pos];
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
}
......@@ -470,28 +478,13 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
int iteration;
gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms,
amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData,
......@@ -555,7 +548,7 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
......@@ -242,14 +242,16 @@ static void kInitializeMutualInducedField_kernel(
float* inducedDipolePolar )
{
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfAtoms )return;
fixedEField[threadId] *= polarizability[threadId];
inducedDipole[threadId] = fixedEField[threadId];
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
inducedDipole[pos] = fixedEField[pos];
fixedEFieldPolar[threadId] *= polarizability[threadId];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId];
fixedEFieldPolar[pos] *= polarizability[pos];
inducedDipolePolar[pos] = fixedEFieldPolar[pos];
pos += blockDim.x*gridDim.x;
}
}
......@@ -325,27 +327,31 @@ static void kSorUpdateMutualInducedField_kernel(
float* matrixProduct, float* matrixProductP )
{
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfEntries )return;
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[pos];
// add self terms to fields
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
matrixProduct[threadId] += term*previousDipole;
matrixProductP[threadId] += term*previousDipoleP;
matrixProduct[pos] += term*previousDipole;
matrixProductP[pos] += term*previousDipoleP;
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
const float polarSOR = 0.70f;
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP );
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
}
......@@ -539,28 +545,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
int iteration;
gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms,
amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData,
......@@ -607,7 +598,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include <stdio.h>
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##ExcludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#define AMOEBA_INCLUDE_DIAGONAL
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset1##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#define AMOEBA_OFFSET_3
#define METHOD_NAME(a, b) a##IncludeDiagonalOffset3##b
#include "kCalculateAmoebaCudaReduce.h"
#undef METHOD_NAME
#undef AMOEBA_OFFSET_3
#undef AMOEBA_INCLUDE_DIAGONAL
void cudaReduceN2ToN( float *N2Array, int Nsz, float *NArray, int includeDiagonal, int offset )
{
int numThreads = min(THREADS_PER_BLOCK, (Nsz));
int numBlocksPerAtom = (Nsz / numThreads);
if( Nsz % numThreads ){
numBlocksPerAtom++;
}
int numBlocks = numBlocksPerAtom*Nsz;
float *partialSum1_d;
// allocate GPU memory
cudaMalloc( (void**) &partialSum1_d, numBlocks*offset*sizeof(float) );
if( includeDiagonal ){
if( offset == 3 ){
kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel1");
} else if( offset == 1 ){
kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel2");
}
} else {
if( offset == 3 ){
kCalculateAmoebaReduceExcludeDiagonalOffset3N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel3");
} else if( offset == 1 ){
kCalculateAmoebaReduceExcludeDiagonalOffset1N2ToNBlockLevel<<< numBlocks, numThreads >>>( N2Array, partialSum1_d, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNBlockLevel4");
}
}
int numBlocks2 = numBlocks;
numBlocks = numBlocks2*Nsz/numThreads;
if( (numBlocks2*Nsz) % numThreads ){
numBlocks++;
}
if( offset == 3 ){
kCalculateAmoebaReduceIncludeDiagonalOffset3N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal3");
} else if( offset == 1 ){
kCalculateAmoebaReduceIncludeDiagonalOffset1N2ToNFinal<<< numBlocks, numThreads >>>(partialSum1_d, NArray, Nsz, numBlocksPerAtom );
LAUNCHERROR("kCalculateAmoebaReduceN2ToNFinal1");
}
//Free memory
cudaFree(partialSum1_d);
}
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
typedef unsigned int uint;
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNBlockLevel)( float *N2Array, float *partialSum, int num,int numberOfBlocksPerAtom )
{
uint tid = threadIdx.x;
__shared__ float asx[THREADS_PER_BLOCK];
asx[tid] = 0.0f;
#ifdef AMOEBA_OFFSET_3
__shared__ float asy[THREADS_PER_BLOCK];
__shared__ float asz[THREADS_PER_BLOCK];
asx[tid] = 0.0f;
asy[tid] = asz[tid] = 0.0f;
int offset = 3;
#else
int offset = 1;
#endif
int atomI = blockIdx.x / numberOfBlocksPerAtom;
int atomJ = (blockIdx.x % numberOfBlocksPerAtom)*blockDim.x+tid;
#ifdef AMOEBA_INCLUDE_DIAGONAL
if( atomJ < num && atomI < num ){
#else
if( atomJ < num && atomJ != atomI ){
#endif
int index = offset*(atomI*num + atomJ);
asx[tid] = N2Array[index];
#ifdef AMOEBA_OFFSET_3
asy[tid] = N2Array[index+1];
asz[tid] = N2Array[index+2];
#endif
}
__syncthreads(); //to make sure all the elements are loaded
for( uint s = (blockDim.x)/2; s != 0; s >>= 1 ){
if( tid < s ){
asx[tid] += asx[tid+s];
#ifdef AMOEBA_OFFSET_3
asy[tid] += asy[tid+s];
asz[tid] += asz[tid+s];
#endif
}
__syncthreads();
}
if( tid == 0 ){
partialSum[blockIdx.x*offset] = asx[0];
#ifdef AMOEBA_OFFSET_3
partialSum[blockIdx.x*3+1] = asy[0];
partialSum[blockIdx.x*3+2] = asz[0];
#endif
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaReduce, N2ToNFinal)( float *partialSum, float *final,int num,int numberOfBlocksPerAtom )
{
uint thread_id = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( thread_id > num )return;
float3 sum;
#ifdef AMOEBA_OFFSET_3
int offset = 3;
sum.x = sum.y = sum.z = 0.0f;
#else
int offset = 1;
sum.x = 0.0f;
#endif
int index = thread_id*offset*numberOfBlocksPerAtom;
for( int i=0; i < numberOfBlocksPerAtom; i++ ){
sum.x += partialSum[index + i*offset];
#ifdef AMOEBA_OFFSET_3
sum.y += partialSum[index + i*offset+1];
sum.z += partialSum[index + i*offset+2];
#endif
}
final[thread_id*offset ] = sum.x;
#ifdef AMOEBA_OFFSET_3
final[thread_id*3+1 ] = sum.y;
final[thread_id*3+2 ] = sum.z;
#endif
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment