Commit 353529b1 authored by Rossen Apostolov's avatar Rossen Apostolov
Browse files

Preliminary implementation of faster (N^3/2) Ewald kernels in Cuda

parent 52c04559
...@@ -335,6 +335,9 @@ struct cudaGmxSimulation { ...@@ -335,6 +335,9 @@ struct cudaGmxSimulation {
float collisionProbability; // Collision probability for Andersen thermostat float collisionProbability; // Collision probability for Andersen thermostat
float2* pObcData; // Pointer to fixed Born data float2* pObcData; // Pointer to fixed Born data
float2* pAttr; // Pointer to additional atom attributes (sig, eps) float2* pAttr; // Pointer to additional atom attributes (sig, eps)
float2* pEikr; // Pointer to exponents of reciprocal vectors and atom coordinates (ewald)
float2* pStructureFactor; // Pointer to the structure factors (ewald)
float2* pCosSinSum; // Pointer to the cos/sin sums (ewald)
unsigned int bonds; // Number of bonds unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters float2* pBondParameter; // Bond parameters
......
...@@ -1087,6 +1087,12 @@ int gpuAllocateInitialBuffers(gpuContext gpu) ...@@ -1087,6 +1087,12 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0]; gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0];
gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2"); gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2");
gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0]; gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0];
gpu->psEwaldEikr = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldEikr");
gpu->sim.pEikr = gpu->psEwaldEikr->_pDevStream[0];
gpu->psEwaldStructureFactor = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldStructureFactor");
gpu->sim.pStructureFactor = gpu->psEwaldStructureFactor->_pDevStream[0];
gpu->psEwaldCosSinSum = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldCosSinSum");
gpu->sim.pCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];
gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData"); gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
gpu->sim.pObcData = gpu->psObcData->_pDevStream[0]; gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];
gpu->pAtomSymbol = new unsigned char[gpu->natoms]; gpu->pAtomSymbol = new unsigned char[gpu->natoms];
...@@ -1486,6 +1492,9 @@ void gpuShutDown(gpuContext gpu) ...@@ -1486,6 +1492,9 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psxVector4; delete gpu->psxVector4;
delete gpu->psvVector4; delete gpu->psvVector4;
delete gpu->psSigEps2; delete gpu->psSigEps2;
delete gpu->psEwaldEikr;
delete gpu->psEwaldStructureFactor;
delete gpu->psEwaldCosSinSum;
delete gpu->psObcData; delete gpu->psObcData;
delete gpu->psObcChain; delete gpu->psObcChain;
delete gpu->psBornForce; delete gpu->psBornForce;
......
...@@ -86,6 +86,9 @@ struct _gpuContext { ...@@ -86,6 +86,9 @@ struct _gpuContext {
CUDAStream<float4>* psxVector4; CUDAStream<float4>* psxVector4;
CUDAStream<float4>* psvVector4; CUDAStream<float4>* psvVector4;
CUDAStream<float2>* psSigEps2; CUDAStream<float2>* psSigEps2;
CUDAStream<float2>* psEwaldEikr;
CUDAStream<float2>* psEwaldStructureFactor;
CUDAStream<float2>* psEwaldCosSinSum;
CUDAStream<float2>* psObcData; CUDAStream<float2>* psObcData;
CUDAStream<float>* psObcChain; CUDAStream<float>* psObcChain;
CUDAStream<float>* psBornForce; CUDAStream<float>* psBornForce;
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Rossen P. Apostolov, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/* Define multiply operations for floats */
__device__ float2 MultofFloat2(float2 a, float2 b)
{
float2 c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
__device__ float2 ConjMultofFloat2(float2 a, float2 b)
{
float2 c;
c.x = a.x*b.x + a.y*b.y;
c.y = a.y*b.x - a.x*b.y;
return c;
}
__device__ float2 FloatMultFloat2(float r, float2 a)
{
float2 b;
b.x = r*a.x;
b.y = r*a.y;
return b;
}
__device__ float2 FloatMultConjFloat2(float r, float2 a)
{
float2 b;
b.x = r*a.y;
b.y = r*a.x;
return b;
}
__global__ void kCalculateEwaldFastEikr_kernel()
{
int kmax = cSim.kmax;
float4 apos;
unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
while (atom < cSim.atoms)
{
//generic form of the array
// pEikr[ atomID*kmax*3 + k*3 + m]
for(unsigned int m = 0; (m < 3); m++) {
// k = 0, explicitly
cSim.pEikr[atom*kmax*3 + 0 + m].x = 1;
cSim.pEikr[atom*kmax*3 + 0 + m].y = 0;
// k = 1, explicitly
cSim.pEikr[atom*kmax*3 + 3 + m].x = cos(apos.x*cSim.recipBoxSizeX);
cSim.pEikr[atom*kmax*3 + 3 + m].y = sin(apos.x*cSim.recipBoxSizeX);
cSim.pEikr[atom*kmax*3 + 4 + m].x = cos(apos.y*cSim.recipBoxSizeY);
cSim.pEikr[atom*kmax*3 + 4 + m].y = sin(apos.y*cSim.recipBoxSizeY);
cSim.pEikr[atom*kmax*3 + 5 + m].x = cos(apos.z*cSim.recipBoxSizeZ);
cSim.pEikr[atom*kmax*3 + 5 + m].y = sin(apos.z*cSim.recipBoxSizeZ);
}
// k > 1, by recursion
for(unsigned int k=2; (k<kmax); k++) {
for(unsigned int m=0; (m<3); m++) {
cSim.pEikr[atom*kmax*3 + k*3 + m] = MultofFloat2 (cSim.pEikr[atom*kmax*3 + (k-1)*3 + m] , cSim.pEikr[atom*kmax*3 + 3 + m]);
}
}
atom += blockDim.x * gridDim.x;
}
}
__global__ void kCalculateEwaldFastCosSinSums_kernel()
{
// hard-coded maximum k-vectors, no interface yet
int kmax = cSim.kmax;
// float2 eikr;
float4 apos;
int lowry = 0;
int lowrz = 1;
int numRx = 20+1;
int numRy = 20+1;
int numRz = 20+1;
unsigned int totalK = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
float2 tab_xy;
int index;
unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
while (atom < cSim.atoms)
{
apos = cSim.pPosq[atom];
// **********************************************************************
// cSim.pEikr[atom*kmax*3 + k*3 + m]
for(int rx = 0; rx < numRx; rx++) {
for(int ry = lowry; ry < numRy; ry++) {
if(ry >= 0) {
// tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
tab_xy = MultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 + ry*3 + 1]);
}
else {
// tab_xy[n] = EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
tab_xy = ConjMultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 - ry*3 + 1]);
}
for (int rz = lowrz; rz < numRz; rz++) {
// next one is scary!
index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
if( rz >= 0) {
//tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * EIR(rz, n, 2));
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , MultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 + rz*3 + 2] ));
}
else {
// tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , ConjMultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 - rz*3 + 2] ));
}
cSim.pCosSinSum[index].x += cSim.pStructureFactor[atom*totalK + index].x;
cSim.pCosSinSum[index].y += cSim.pStructureFactor[atom*totalK + index].y;
lowrz = 1 - numRz;
}
lowry = 1 - numRy;
}
}
// **********************************************************************
atom += blockDim.x * gridDim.x;
}
}
__global__ void kCalculateEwaldFastForces_kernel()
{
float PI = 3.14159265358979323846f;
// hard-coded maximum k-vectors, no interface yet
// int kmax = cSim.kmax;
const float epsilon = 1.0;
float recipCoeff = (4*PI/cSim.V/epsilon);
// float2 eikr;
// float4 apos;
int lowry = 0;
int lowrz = 1;
int numRx = 20+1;
int numRy = 20+1;
int numRz = 20+1;
unsigned int totalK = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
int index;
unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
while (atom < cSim.atoms)
{
// apos = cSim.pPosq[atom];
// **********************************************************************
// cSim.pEikr[atom*kmax*3 + k*3 + m]
for(int rx = 0; rx < numRx; rx++) {
float kx = rx * cSim.recipBoxSizeX;
for(int ry = lowry; ry < numRy; ry++) {
float ky = ry * cSim.recipBoxSizeY;
for (int rz = lowrz; rz < numRz; rz++) {
float kz = rz * cSim.recipBoxSizeZ;
// next one is scary!
index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
float k2 = kx * kx + ky * ky + kz * kz;
float ak = exp(k2*cSim.factorEwald) / k2;
float dEdR = ak * (cSim.pCosSinSum[index].x * cSim.pStructureFactor[atom*totalK + index].y - cSim.pCosSinSum[index].y * cSim.pStructureFactor[atom*totalK + index].x);
cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx ;
cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky ;
cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz ;
lowrz = 1 - numRz;
}
lowry = 1 - numRy;
}
}
// **********************************************************************
atom += blockDim.x * gridDim.x;
}
}
...@@ -32,7 +32,6 @@ ...@@ -32,7 +32,6 @@
__global__ void kCalculateCDLJEwaldReciprocalForces_kernel() __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
{ {
float alphaEwald = cSim.alphaEwald;
float eps0 = 5.72765E-4; float eps0 = 5.72765E-4;
int numRx = 20+1; int numRx = 20+1;
...@@ -46,13 +45,15 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel() ...@@ -46,13 +45,15 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
float V = cSim.cellVolume; float V = cSim.cellVolume;
float4 apos1, apos2 ; float4 apos1, apos2 ; // i nteracting atoms
float4 af; // atomic force
unsigned int atomID1 = threadIdx.x + blockIdx.x * blockDim.x; unsigned int atomID1 = threadIdx.x + blockIdx.x * blockDim.x;
while (atomID1 < cSim.stride * cSim.outputBuffers) while (atomID1 < cSim.atoms)
{ {
apos1 = cSim.pPosq[atomID1]; apos1 = cSim.pPosq[atomID1];
af = cSim.pForce4[atomID1];
unsigned int atomID2 = 0; unsigned int atomID2 = 0;
while (atomID2 < cSim.atoms) while (atomID2 < cSim.atoms)
...@@ -85,9 +86,9 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel() ...@@ -85,9 +86,9 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
CosI = cos ( kx * apos1.x + ky * apos1.y + kz * apos1.z ); CosI = cos ( kx * apos1.x + ky * apos1.y + kz * apos1.z );
CosJ = cos ( kx * apos2.x + ky * apos2.y + kz * apos2.z ); CosJ = cos ( kx * apos2.x + ky * apos2.y + kz * apos2.z );
cSim.pForce4[atomID1].x -= (2.0 / (V * eps0 )) * Qi * ( kx/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ); af.x -= (2.0 / (V * eps0 )) * Qi * ( kx/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
cSim.pForce4[atomID1].y -= (2.0 / (V * eps0 )) * Qi * ( ky/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ); af.y -= (2.0 / (V * eps0 )) * Qi * ( ky/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
cSim.pForce4[atomID1].z -= (2.0 / (V * eps0 )) * Qi * ( kz/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ); af.z -= (2.0 / (V * eps0 )) * Qi * ( kz/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ);
lowrz = 1 - numRz; lowrz = 1 - numRz;
} }
...@@ -98,7 +99,8 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel() ...@@ -98,7 +99,8 @@ __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
atomID2++; atomID2++;
} }
cSim.pForce4[atomID1] = af;
atomID1 += blockDim.x * gridDim.x; atomID1 += blockDim.x * gridDim.x;
} }
......
...@@ -119,6 +119,7 @@ void GetCalculateCDLJForcesSim(gpuContext gpu) ...@@ -119,6 +119,7 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
// Reciprocal Space Ewald summation is in a separate kernel // Reciprocal Space Ewald summation is in a separate kernel
#include "kCalculateCDLJEwaldReciprocal.h" #include "kCalculateCDLJEwaldReciprocal.h"
#include "kCalculateCDLJEwaldFastReciprocal.h"
__global__ extern void kCalculateCDLJCutoffForces_12_kernel(); __global__ extern void kCalculateCDLJCutoffForces_12_kernel();
...@@ -198,6 +199,7 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -198,6 +199,7 @@ void kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
{ {
// Vanilla (N2) Ewald
kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
...@@ -205,11 +207,20 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -205,11 +207,20 @@ void kCalculateCDLJForces(gpuContext gpu)
} }
else else
{ {
// Vanilla (N2) Ewald
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldDirectForces"); LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces"); LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
// If using Fast Ewald, uncomment the lines below
// kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastEikr");
// kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastCosSinSums");
// kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastForces");
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment