Commit dc677fac authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to PME.

parent 24148147
...@@ -308,10 +308,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -308,10 +308,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
double my = boxVectors[1][1]/force.getCutoffDistance(); double my = boxVectors[1][1]/force.getCutoffDistance();
double mz = boxVectors[2][2]/force.getCutoffDistance(); double mz = boxVectors[2][2]/force.getCutoffDistance();
double pi = 3.1415926535897932385; double pi = 3.1415926535897932385;
int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
if (force.getNonbondedMethod() == NonbondedForce::Ewald) { if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
if (kmaxx%2 == 0) if (kmaxx%2 == 0)
kmaxx++; kmaxx++;
if (kmaxy%2 == 0) if (kmaxy%2 == 0)
...@@ -322,7 +322,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -322,7 +322,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
method = EWALD; method = EWALD;
} }
else { else {
gpuSetPMEParameters(gpu, (float) alpha); int gridSizeX = kmaxx*3;
int gridSizeY = kmaxy*3;
int gridSizeZ = kmaxz*3;
gridSizeX = ((gridSizeX+3)/4)*4;
gridSizeY = ((gridSizeY+3)/4)*4;
gridSizeZ = ((gridSizeZ+3)/4)*4;
gpuSetPMEParameters(gpu, (float) alpha, gridSizeX, gridSizeY, gridSizeZ);
method = PARTICLE_MESH_EWALD; method = PARTICLE_MESH_EWALD;
} }
} }
......
...@@ -358,6 +358,9 @@ struct cudaGmxSimulation { ...@@ -358,6 +358,9 @@ struct cudaGmxSimulation {
float4* pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function float4* pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
float4* pTabulatedFunctionParams; // The min, max, and spacing for each tabulated function float4* pTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald) float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald)
float* pTabulatedErfc; // Tabulated values for erfc()
int tabulatedErfcSize; // The number of tabulated values for erfc()
float tabulatedErfcScale; // Scale factor for the argument to erfc()
int3 pmeGridSize; // The dimensions of the grid for particle mesh Ewald int3 pmeGridSize; // The dimensions of the grid for particle mesh Ewald
int3 pmeGroupSize; // The dimensions of the groups used in charge spreading for PME int3 pmeGroupSize; // The dimensions of the groups used in charge spreading for PME
cufftComplex* pPmeGrid; // Grid points for particle mesh Ewald cufftComplex* pPmeGrid; // Grid points for particle mesh Ewald
...@@ -366,7 +369,6 @@ struct cudaGmxSimulation { ...@@ -366,7 +369,6 @@ struct cudaGmxSimulation {
float4* pPmeBsplineDtheta; float4* pPmeBsplineDtheta;
int4* pPmeParticleIndex; // The grid indices for each atom int4* pPmeParticleIndex; // The grid indices for each atom
float4* pPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions. float4* pPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
int* pPmeInteractionFlags; // Flags for which groups of grid points interact with which atoms
int* pPmeAtomRange; // The range of sorted atoms at each grid point int* pPmeAtomRange; // The range of sorted atoms at each grid point
float2* pPmeAtomGridIndex; // The grid point each atom is at float2* pPmeAtomGridIndex; // The grid point each atom is at
unsigned int bonds; // Number of bonds unsigned int bonds; // Number of bonds
......
...@@ -747,6 +747,18 @@ void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double> ...@@ -747,6 +747,18 @@ void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double>
delete fp; delete fp;
} }
static void tabulateErfc(gpuContext gpu)
{
int tableSize = 2048;
gpu->sim.tabulatedErfcSize = tableSize;
gpu->sim.tabulatedErfcScale = tableSize/(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff);
gpu->psTabulatedErfc = new CUDAStream<float>(tableSize, 1, "TabulatedErfc");
gpu->sim.pTabulatedErfc = gpu->psTabulatedErfc->_pDevData;
for (int i = 0; i < tableSize; ++i)
(*gpu->psTabulatedErfc)[i] = erfc(i*(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff)/tableSize);
gpu->psTabulatedErfc->Upload();
}
extern "C" extern "C"
void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz) void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz)
{ {
...@@ -757,13 +769,14 @@ void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, in ...@@ -757,13 +769,14 @@ void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, in
gpu->sim.kmaxZ = kmaxz; gpu->sim.kmaxZ = kmaxz;
gpu->psEwaldCosSinSum = new CUDAStream<float2>((gpu->sim.kmaxX*2-1) * (gpu->sim.kmaxY*2-1) * (gpu->sim.kmaxZ*2-1), 1, "EwaldCosSinSum"); gpu->psEwaldCosSinSum = new CUDAStream<float2>((gpu->sim.kmaxX*2-1) * (gpu->sim.kmaxY*2-1) * (gpu->sim.kmaxZ*2-1), 1, "EwaldCosSinSum");
gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0]; gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];
tabulateErfc(gpu);
} }
extern "C" extern "C"
void gpuSetPMEParameters(gpuContext gpu, float alpha) void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
{ {
gpu->sim.alphaEwald = alpha; gpu->sim.alphaEwald = alpha;
int3 gridSize = make_int3(32, 32, 32); int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
gpu->sim.pmeGridSize = gridSize; gpu->sim.pmeGridSize = gridSize;
int3 groupSize = make_int3(2, 4, 4); int3 groupSize = make_int3(2, 4, 4);
gpu->sim.pmeGroupSize = groupSize; gpu->sim.pmeGroupSize = groupSize;
...@@ -786,12 +799,11 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha) ...@@ -786,12 +799,11 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha)
gpu->sim.pPmeParticleIndex = gpu->psPmeParticleIndex->_pDevData; gpu->sim.pPmeParticleIndex = gpu->psPmeParticleIndex->_pDevData;
gpu->psPmeParticleFraction = new CUDAStream<float4>(gpu->natoms, 1, "PmeParticleFraction"); gpu->psPmeParticleFraction = new CUDAStream<float4>(gpu->natoms, 1, "PmeParticleFraction");
gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData; gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData;
gpu->psPmeInteractionFlags = new CUDAStream<int>(totalGroups*(gpu->sim.paddedNumberOfAtoms/32), 1, "PmeInteractionFlags");
gpu->sim.pPmeInteractionFlags = gpu->psPmeInteractionFlags->_pDevData;
gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange"); gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData; gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
gpu->psPmeAtomGridIndex = new CUDAStream<float2>(gpu->natoms, 1, "PmeAtomGridIndex"); gpu->psPmeAtomGridIndex = new CUDAStream<float2>(gpu->natoms, 1, "PmeAtomGridIndex");
gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData; gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
tabulateErfc(gpu);
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
...@@ -1684,6 +1696,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1684,6 +1696,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psCustomExceptionID = NULL; gpu->psCustomExceptionID = NULL;
gpu->psCustomExceptionParams = NULL; gpu->psCustomExceptionParams = NULL;
gpu->psEwaldCosSinSum = NULL; gpu->psEwaldCosSinSum = NULL;
gpu->psTabulatedErfc = NULL;
gpu->psPmeGrid = NULL; gpu->psPmeGrid = NULL;
gpu->psPmeBsplineModuli[0] = NULL; gpu->psPmeBsplineModuli[0] = NULL;
gpu->psPmeBsplineModuli[1] = NULL; gpu->psPmeBsplineModuli[1] = NULL;
...@@ -1863,6 +1876,7 @@ void gpuShutDown(gpuContext gpu) ...@@ -1863,6 +1876,7 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psPmeParticleFraction; delete gpu->psPmeParticleFraction;
delete gpu->psPmeAtomRange; delete gpu->psPmeAtomRange;
delete gpu->psPmeAtomGridIndex; delete gpu->psPmeAtomGridIndex;
delete gpu->psTabulatedErfc;
cufftDestroy(gpu->fftplan); cufftDestroy(gpu->fftplan);
} }
delete gpu->psObcData; delete gpu->psObcData;
......
...@@ -106,13 +106,13 @@ struct _gpuContext { ...@@ -106,13 +106,13 @@ struct _gpuContext {
CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions
CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
CUDAStream<float2>* psEwaldCosSinSum; CUDAStream<float2>* psEwaldCosSinSum;
CUDAStream<float>* psTabulatedErfc; // Tabulated values for erfc()
CUDAStream<cufftComplex>* psPmeGrid; // Grid points for particle mesh Ewald CUDAStream<cufftComplex>* psPmeGrid; // Grid points for particle mesh Ewald
CUDAStream<float>* psPmeBsplineModuli[3]; CUDAStream<float>* psPmeBsplineModuli[3];
CUDAStream<float4>* psPmeBsplineTheta; CUDAStream<float4>* psPmeBsplineTheta;
CUDAStream<float4>* psPmeBsplineDtheta; CUDAStream<float4>* psPmeBsplineDtheta;
CUDAStream<int4>* psPmeParticleIndex; // The grid indices for each atom CUDAStream<int4>* psPmeParticleIndex; // The grid indices for each atom
CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions. CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
CUDAStream<int>* psPmeInteractionFlags; // Flags for which groups of grid points interact with which atoms
CUDAStream<int>* psPmeAtomRange; // The range of sorted atoms at each grid point CUDAStream<int>* psPmeAtomRange; // The range of sorted atoms at each grid point
CUDAStream<float2>* psPmeAtomGridIndex; // The grid point each atom is at CUDAStream<float2>* psPmeAtomGridIndex; // The grid point each atom is at
CUDAStream<float2>* psObcData; CUDAStream<float2>* psObcData;
...@@ -217,7 +217,7 @@ extern "C" ...@@ -217,7 +217,7 @@ extern "C"
void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz); void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz);
extern "C" extern "C"
void gpuSetPMEParameters(gpuContext gpu, float alpha); void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);
extern "C" extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize); void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
......
...@@ -67,6 +67,17 @@ void GetCalculateCDLJForcesSim(gpuContext gpu) ...@@ -67,6 +67,17 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
__device__ float fastErfc(float r)
{
float normalized = cSim.tabulatedErfcScale*r;
int index = (int) normalized;
float fract2 = normalized-index;
float fract1 = 1.0f-fract2;
return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
}
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b #define METHOD_NAME(a, b) a##N2##b
...@@ -167,27 +178,6 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -167,27 +178,6 @@ void kCalculateCDLJForces(gpuContext gpu)
LAUNCHERROR("kCalculateCDLJPeriodicForces"); LAUNCHERROR("kCalculateCDLJPeriodicForces");
break; break;
case EWALD: case EWALD:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldForces");
// Ewald summation
kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces");
break;
case PARTICLE_MESH_EWALD: case PARTICLE_MESH_EWALD:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>(); kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic"); LAUNCHERROR("kFindBlockBoundsPeriodic");
...@@ -197,6 +187,8 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -197,6 +187,8 @@ void kCalculateCDLJForces(gpuContext gpu)
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic"); LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
...@@ -204,6 +196,14 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -204,6 +196,14 @@ void kCalculateCDLJForces(gpuContext gpu)
kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldForces"); LAUNCHERROR("kCalculateCDLJEwaldForces");
kCalculatePME(gpu); if (gpu->sim.nonbondedMethod == EWALD)
{
kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces");
}
else
kCalculatePME(gpu);
} }
} }
...@@ -118,7 +118,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -118,7 +118,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI ); dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI );
/* E */ /* E */
CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR; CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR;
...@@ -180,7 +180,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -180,7 +180,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */ /* E */
CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR; CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR;
...@@ -303,7 +303,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -303,7 +303,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */ /* E */
CDLJ_energy += apos.w * psA[tj].q * invR * erfcAlphaR; CDLJ_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
...@@ -371,7 +371,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -371,7 +371,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR; CDLJ_energy += apos.w * psA[j].q * invR * erfcAlphaR;
#else #else
...@@ -474,7 +474,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -474,7 +474,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */ /* E */
CDLJ_energy += apos.w * psA[tj].q * invR * erfcAlphaR; CDLJ_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
......
...@@ -63,6 +63,16 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu) ...@@ -63,6 +63,16 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
__device__ float fastErfc(float r)
{
float normalized = cSim.tabulatedErfcScale*r;
int index = (int) normalized;
float fract2 = normalized-index;
float fract1 = 1.0f-fract2;
return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
}
// Include versions of the kernel for N^2 calculations. // Include versions of the kernel for N^2 calculations.
...@@ -186,6 +196,7 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -186,6 +196,7 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break; break;
case EWALD: case EWALD:
case PARTICLE_MESH_EWALD:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>(); kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic"); LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>(); kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
...@@ -199,6 +210,8 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -199,6 +210,8 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
kCalculateObcGbsaBornSum(gpu); kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu); kReduceObcGbsaBornSum(gpu);
} }
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaEwaldByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaEwaldByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
...@@ -206,33 +219,15 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -206,33 +219,15 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
kCalculateCDLJObcGbsaEwaldForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaEwaldForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaEwaldForces"); LAUNCHERROR("kCalculateCDLJObcGbsaEwaldForces");
// Ewald summation if (gpu->sim.nonbondedMethod == EWALD)
kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces");
break;
case PARTICLE_MESH_EWALD:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
if (gpu->bRecalculateBornRadii)
{ {
kCalculateObcGbsaBornSum(gpu); // Ewald summation
kReduceObcGbsaBornSum(gpu); kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces");
} }
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaEwaldByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateCDLJObcGbsaEwaldForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculatePME(gpu);
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaEwaldForces");
kCalculatePME(gpu);
} }
} }
...@@ -113,7 +113,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -113,7 +113,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */ /* E */
CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR; CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR;
...@@ -195,7 +195,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -195,7 +195,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */ /* E */
CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR; CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR;
...@@ -350,7 +350,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -350,7 +350,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */ /* E */
CDLJObcGbsa_energy += apos.w * psA[tj].q * invR * erfcAlphaR; CDLJObcGbsa_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
...@@ -439,7 +439,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -439,7 +439,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR; CDLJObcGbsa_energy += apos.w * psA[j].q * invR * erfcAlphaR;
#else #else
...@@ -572,7 +572,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -572,7 +572,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
float erfcAlphaR = erfc(alphaR); float erfcAlphaR = fastErfc(alphaR);
dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI); dEdR += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
/* E */ /* E */
CDLJObcGbsa_energy += apos.w * psA[tj].q * invR * erfcAlphaR; CDLJObcGbsa_energy += apos.w * psA[tj].q * invR * erfcAlphaR;
......
...@@ -49,6 +49,8 @@ void GetCalculatePMESim(gpuContext gpu) ...@@ -49,6 +49,8 @@ void GetCalculatePMESim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
texture<float4, 1, cudaReadModeElementType> bsplineThetaRef;
inline __host__ __device__ int fast_mod(int a, int b) inline __host__ __device__ int fast_mod(int a, int b)
{ {
return (b & (b - 1)) ? a % b : a & (b - 1); return (b & (b - 1)) ? a % b : a & (b - 1);
...@@ -112,48 +114,6 @@ __global__ void kUpdateGridIndexAndFraction_kernel() ...@@ -112,48 +114,6 @@ __global__ void kUpdateGridIndexAndFraction_kernel()
cSim.pPmeParticleIndex[i] = itmp; cSim.pPmeParticleIndex[i] = itmp;
cSim.pPmeAtomGridIndex[i] = make_float2(i, itmp.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+itmp.y*cSim.pmeGridSize.z+itmp.z); cSim.pPmeAtomGridIndex[i] = make_float2(i, itmp.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+itmp.y*cSim.pmeGridSize.z+itmp.z);
} }
//
// // Compute flags for which atoms affect which groups of grid points.
//
// const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
// const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
// const float3 gridScale = make_float3(cSim.pmeGridSize.x/cSim.periodicBoxSizeX, cSim.pmeGridSize.y/cSim.periodicBoxSizeY, cSim.pmeGridSize.z/cSim.periodicBoxSizeZ);
// for (int group = tid; group < totalGroups; group += tnb)
// {
// int3 gridBase;
// gridBase.x = group/(numGroups.y*numGroups.z);
// int remainder = group-gridBase.x*numGroups.y*numGroups.z;
// gridBase.y = remainder/numGroups.z;
// gridBase.z = remainder-gridBase.y*numGroups.z;
// gridBase.x *= cSim.pmeGroupSize.x;
// gridBase.y *= cSim.pmeGroupSize.y;
// gridBase.z *= cSim.pmeGroupSize.z;
// unsigned int flags = 0;
// unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
// for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
// {
// // Decide if this block actually needs to be processed.
//
// int flagIndex = atomBlock%32;
// if (flagIndex == 0)
// flags = 0;
// float4 boxSize = cSim.pGridBoundingBox[atomBlock];
// float4 center = cSim.pGridCenter[atomBlock];
// int maxx = (int) ceil((center.x+boxSize.x)*gridScale.x)+cSim.pmeGroupSize.x+PME_ORDER;
// int maxy = (int) ceil((center.y+boxSize.y)*gridScale.y)+cSim.pmeGroupSize.y+PME_ORDER;
// int maxz = (int) ceil((center.z+boxSize.z)*gridScale.z)+cSim.pmeGroupSize.z+PME_ORDER;
// int minx = (int) floor((center.x-boxSize.x)*gridScale.x);
// int miny = (int) floor((center.y-boxSize.y)*gridScale.y);
// int minz = (int) floor((center.z-boxSize.z)*gridScale.z);
// int x = minx+(gridBase.x-minx)%cSim.pmeGridSize.x;
// int y = miny+(gridBase.y-miny)%cSim.pmeGridSize.y;
// int z = minz+(gridBase.z-minz)%cSim.pmeGridSize.z;
// if (maxx < x || maxy < y || maxz < z)
// flags += 1<<flagIndex;
// if (flagIndex == 31 || atomBlock == cSim.paddedNumberOfAtoms>>GRIDBITS)
// cSim.pPmeInteractionFlags[baseIndex+atomBlock/32] = flags;
// }
// }
} }
/** /**
...@@ -259,82 +219,6 @@ __global__ void kUpdateBsplines_kernel() ...@@ -259,82 +219,6 @@ __global__ void kUpdateBsplines_kernel()
} }
} }
//__global__ void kGridSpreadCharge_kernel()
//{
// extern __shared__ float atomCharge[];
// int4* atomGridIndex = (int4*) &atomCharge[blockDim.x];
// const unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
// const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
// const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
// const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
// unsigned int group = warp*totalGroups/totalWarps;
// const unsigned int end = (warp+1)*totalGroups/totalWarps;
// const unsigned int index = threadIdx.x & (GRID - 1);
//
// while (group < end)
// {
// // Process a group of grid points of size cSim.pmeGroupSize. First figure out the base index for the group,
// // and the index of the specific point this thread will handle.
//
// int3 gridBase;
// gridBase.x = group/(numGroups.y*numGroups.z);
// int remainder = group-gridBase.x*numGroups.y*numGroups.z;
// gridBase.y = remainder/numGroups.z;
// gridBase.z = remainder-gridBase.y*numGroups.z;
// gridBase.x *= cSim.pmeGroupSize.x;
// gridBase.y *= cSim.pmeGroupSize.y;
// gridBase.z *= cSim.pmeGroupSize.z;
// int3 gridPoint;
// gridPoint.x = index/(cSim.pmeGroupSize.y*cSim.pmeGroupSize.z);
// remainder = index-gridPoint.x*cSim.pmeGroupSize.y*cSim.pmeGroupSize.z;
// gridPoint.y = remainder/cSim.pmeGroupSize.z;
// gridPoint.z = remainder-gridPoint.y*cSim.pmeGroupSize.z;
// gridPoint.x += gridBase.x;
// gridPoint.y += gridBase.y;
// gridPoint.z += gridBase.z;
//
// // Loop over blocks of atoms.
//
// float result = 0.0f;
// int flags = 0;
// unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
// for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
// {
// // Decide if this block actually needs to be processed.
//
// int flagIndex = atomBlock%32;
// if (flagIndex == 0)
// flags = cSim.pPmeInteractionFlags[baseIndex+atomBlock/32];
// if ((flags & (1<<flagIndex)) != 0)
// continue;
// int atomIndex = (atomBlock<<GRIDBITS)+index;
// if (atomIndex < cSim.atoms)
// {
// atomCharge[threadIdx.x] = cSim.pPosq[atomIndex].w;
// atomGridIndex[threadIdx.x] = cSim.pPmeParticleIndex[atomIndex];
// }
// int maxAtoms = min(GRID, cSim.atoms-(atomBlock<<GRIDBITS));
// for (int i = 0; i < maxAtoms; i++)
// {
// int localIndex = threadIdx.x-index+i;
// int atomIndex = (atomBlock<<GRIDBITS)+i;
// int ix = gridPoint.x-atomGridIndex[localIndex].x;
// int iy = gridPoint.y-atomGridIndex[localIndex].y;
// int iz = gridPoint.z-atomGridIndex[localIndex].z;
// ix += (ix < 0 ? cSim.pmeGridSize.x : 0);
// iy += (iy < 0 ? cSim.pmeGridSize.y : 0);
// iz += (iz < 0 ? cSim.pmeGridSize.z : 0);
// if (ix < PME_ORDER && iy < PME_ORDER && iz < PME_ORDER)
// result += atomCharge[threadIdx.x-index+i]*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
// }
// }
// unsigned int gridIndex = gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridPoint.y*cSim.pmeGridSize.z+gridPoint.z;
// if (gridIndex < cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z)
// cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
// group++;
// }
//}
__global__ void kGridSpreadCharge_kernel() __global__ void kGridSpreadCharge_kernel()
{ {
unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z; unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
...@@ -365,7 +249,7 @@ __global__ void kGridSpreadCharge_kernel() ...@@ -365,7 +249,7 @@ __global__ void kGridSpreadCharge_kernel()
float2 atomData = cSim.pPmeAtomGridIndex[i]; float2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x; int atomIndex = atomData.x;
float atomCharge = atomData.y; float atomCharge = atomData.y;
result += atomCharge*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z; result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
} }
} }
cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f); cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
...@@ -447,6 +331,8 @@ __global__ void kGridInterpolateForce_kernel() ...@@ -447,6 +331,8 @@ __global__ void kGridInterpolateForce_kernel()
void kCalculatePME(gpuContext gpu) void kCalculatePME(gpuContext gpu)
{ {
// printf("kCalculatePME\n"); // printf("kCalculatePME\n");
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
cudaBindTexture(NULL, &bsplineThetaRef, gpu->psPmeBsplineTheta->_pDevData, &channelDesc, gpu->psPmeBsplineTheta->_length*sizeof(float4));
kUpdateGridIndexAndFraction_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kUpdateGridIndexAndFraction_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kUpdateGridIndexAndFraction"); LAUNCHERROR("kUpdateGridIndexAndFraction");
bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms); bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
...@@ -455,11 +341,8 @@ void kCalculatePME(gpuContext gpu) ...@@ -455,11 +341,8 @@ void kCalculatePME(gpuContext gpu)
unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4)); unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>(); kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
LAUNCHERROR("kUpdateBsplines"); LAUNCHERROR("kUpdateBsplines");
kGridSpreadCharge_kernel<<<gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>(); kGridSpreadCharge_kernel<<<8*gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
LAUNCHERROR("kGridSpreadCharge"); LAUNCHERROR("kGridSpreadCharge");
// gpu->psPmeGrid->Download();
// for (int i = 0; i < gpu->psPmeGrid->_length; i += 100)
// printf("%d %f %f\n", i, (*gpu->psPmeGrid)[i].x, (*gpu->psPmeGrid)[i].y);
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD); cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>(); kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kReciprocalConvolution"); LAUNCHERROR("kReciprocalConvolution");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment