Commit fdc7cc07 authored by Kai Kohlhoff's avatar Kai Kohlhoff
Browse files

All energies existing in Preview Release 3 now run on the GPU, new Ewald...

All energies existing in Preview Release 3 now run on the GPU, new Ewald energy terms still use Reference implementation
parent 559cf7c6
...@@ -41,6 +41,7 @@ using namespace std; ...@@ -41,6 +41,7 @@ using namespace std;
static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) { static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) {
_gpuContext* gpu = data.gpu; _gpuContext* gpu = data.gpu;
gpu->bReduceEnergies = false;
if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0) if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0)
gpuReorderAtoms(gpu); gpuReorderAtoms(gpu);
data.computeForceCount++; data.computeForceCount++;
...@@ -59,22 +60,67 @@ static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) { ...@@ -59,22 +60,67 @@ static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) {
kReduceForces(gpu); kReduceForces(gpu);
} }
static double calcEnergy(ContextImpl& context, System& system) { //static double calcEnergy(ContextImpl& context, System& system) {
// We don't currently have GPU kernels to calculate energy, so instead we have the reference static double calcEnergy(ContextImpl& context, CudaPlatform::PlatformData& data, System& system) {
// platform do it. This is VERY slow.
LangevinIntegrator integrator(0.0, 1.0, 0.0); // New section 2009-09-03: calculate energies and forces, then return reduced energies
ReferencePlatform platform;
Context refContext(system, integrator, platform); _gpuContext* gpu = data.gpu;
const Stream& positions = context.getPositions();
double* posData = new double[positions.getSize()*3]; if (gpu->sim.nonbondedMethod == EWALD)
positions.saveToArray(posData); {
vector<Vec3> pos(positions.getSize()); // We don't currently have GPU kernels to calculate energy, so instead we have the reference
for (int i = 0; i < (int)pos.size(); i++) // platform do it. This is VERY slow.
pos[i] = Vec3(posData[3*i], posData[3*i+1], posData[3*i+2]);
delete[] posData; LangevinIntegrator integrator(0.0, 1.0, 0.0);
refContext.setPositions(pos); ReferencePlatform platform;
return refContext.getState(State::Energy).getPotentialEnergy(); Context refContext(system, integrator, platform);
const Stream& positions = context.getPositions();
double* posData = new double[positions.getSize()*3];
positions.saveToArray(posData);
vector<Vec3> pos(positions.getSize());
for (int i = 0; i < (int)pos.size(); i++)
pos[i] = Vec3(posData[3*i], posData[3*i+1], posData[3*i+2]);
delete[] posData;
refContext.setPositions(pos);
// printf("Total CPU energies: %f\n", refContext.getState(State::Energy).getPotentialEnergy());
return refContext.getState(State::Energy).getPotentialEnergy();
}
else
{
double totalEnergy = 0.0f;
double energy;
gpu->bReduceEnergies = true;
if (data.nonbondedMethod != NO_CUTOFF && data.stepCount%100 == 0)
gpuReorderAtoms(gpu);
data.stepCount++;
kClearForces(gpu);
if (gpu->bIncludeGBSA) {
gpu->bRecalculateBornRadii = true;
energy = kCalculateCDLJObcGbsaForces1(gpu);
// printf("Calculated CDLJObcGbsa energy: %f\n", energy);
totalEnergy += energy;
energy = kReduceObcGbsaBornForces(gpu);
// printf("Calculated reduced GbsaBorn surface area energy: %f\n", energy);
totalEnergy += energy;
kCalculateObcGbsaForces2(gpu);
}
else if (data.hasNonbonded) {
energy = kCalculateCDLJForces(gpu);
// printf("Calculated CDLJ energy: %f\n", energy);
totalEnergy += energy;
}
energy = kCalculateLocalForces(gpu);
// printf("Calculated local interactions energy: %f\n", energy);
totalEnergy += energy;
if (gpu->bIncludeGBSA)
kReduceBornSumAndForces(gpu);
else
kReduceForces(gpu);
// printf("Total GPU energies: %f\n", totalEnergy);
return totalEnergy;
}
return 0.0f;
} }
void CudaInitializeForcesKernel::initialize(const System& system) { void CudaInitializeForcesKernel::initialize(const System& system) {
...@@ -122,7 +168,7 @@ void CudaCalcHarmonicBondForceKernel::executeForces(ContextImpl& context) { ...@@ -122,7 +168,7 @@ void CudaCalcHarmonicBondForceKernel::executeForces(ContextImpl& context) {
double CudaCalcHarmonicBondForceKernel::executeEnergy(ContextImpl& context) { double CudaCalcHarmonicBondForceKernel::executeEnergy(ContextImpl& context) {
if (data.primaryKernel == this) if (data.primaryKernel == this)
return calcEnergy(context, system); return calcEnergy(context, data, system);
return 0.0; return 0.0;
} }
...@@ -156,7 +202,7 @@ void CudaCalcHarmonicAngleForceKernel::executeForces(ContextImpl& context) { ...@@ -156,7 +202,7 @@ void CudaCalcHarmonicAngleForceKernel::executeForces(ContextImpl& context) {
double CudaCalcHarmonicAngleForceKernel::executeEnergy(ContextImpl& context) { double CudaCalcHarmonicAngleForceKernel::executeEnergy(ContextImpl& context) {
if (data.primaryKernel == this) if (data.primaryKernel == this)
return calcEnergy(context, system); return calcEnergy(context, data, system);
return 0.0; return 0.0;
} }
...@@ -192,7 +238,7 @@ void CudaCalcPeriodicTorsionForceKernel::executeForces(ContextImpl& context) { ...@@ -192,7 +238,7 @@ void CudaCalcPeriodicTorsionForceKernel::executeForces(ContextImpl& context) {
double CudaCalcPeriodicTorsionForceKernel::executeEnergy(ContextImpl& context) { double CudaCalcPeriodicTorsionForceKernel::executeEnergy(ContextImpl& context) {
if (data.primaryKernel == this) if (data.primaryKernel == this)
return calcEnergy(context, system); return calcEnergy(context, data, system);
return 0.0; return 0.0;
} }
...@@ -234,7 +280,7 @@ void CudaCalcRBTorsionForceKernel::executeForces(ContextImpl& context) { ...@@ -234,7 +280,7 @@ void CudaCalcRBTorsionForceKernel::executeForces(ContextImpl& context) {
double CudaCalcRBTorsionForceKernel::executeEnergy(ContextImpl& context) { double CudaCalcRBTorsionForceKernel::executeEnergy(ContextImpl& context) {
if (data.primaryKernel == this) if (data.primaryKernel == this)
return calcEnergy(context, system); return calcEnergy(context, data, system);
return 0.0; return 0.0;
} }
...@@ -350,7 +396,7 @@ void CudaCalcNonbondedForceKernel::executeForces(ContextImpl& context) { ...@@ -350,7 +396,7 @@ void CudaCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
double CudaCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) { double CudaCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) {
if (data.primaryKernel == this) if (data.primaryKernel == this)
return calcEnergy(context, system); return calcEnergy(context, data, system);
return 0.0; return 0.0;
} }
...@@ -432,7 +478,7 @@ void CudaCalcCustomNonbondedForceKernel::executeForces(ContextImpl& context) { ...@@ -432,7 +478,7 @@ void CudaCalcCustomNonbondedForceKernel::executeForces(ContextImpl& context) {
double CudaCalcCustomNonbondedForceKernel::executeEnergy(ContextImpl& context) { double CudaCalcCustomNonbondedForceKernel::executeEnergy(ContextImpl& context) {
if (data.primaryKernel == this) if (data.primaryKernel == this)
return calcEnergy(context, system); return calcEnergy(context, data, system);
return 0.0; return 0.0;
} }
......
...@@ -33,12 +33,12 @@ extern void kReduceObcGbsaBornSum(gpuContext gpu); ...@@ -33,12 +33,12 @@ extern void kReduceObcGbsaBornSum(gpuContext gpu);
extern void kGenerateRandoms(gpuContext gpu); extern void kGenerateRandoms(gpuContext gpu);
// Main loop // Main loop
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu); extern double kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu); extern double kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid); extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
extern void kReduceObcGbsaBornForces(gpuContext gpu); extern double kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu); extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu); extern double kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu); extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu); extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu); extern void kApplyFirstShake(gpuContext gpu);
...@@ -58,6 +58,8 @@ extern void kBrownianUpdatePart2(gpuContext gpu); ...@@ -58,6 +58,8 @@ extern void kBrownianUpdatePart2(gpuContext gpu);
// Extras // Extras
extern void kReduceForces(gpuContext gpu); extern void kReduceForces(gpuContext gpu);
extern void kReduceLocalEnergies(gpuContext gpu, float * gpu_energies);
extern double kReduceNonbondEnergies(gpuContext gpu, float * gpu_energies);
extern void kClearBornForces(gpuContext gpu); extern void kClearBornForces(gpuContext gpu);
// Initializers // Initializers
......
...@@ -315,10 +315,12 @@ struct cudaGmxSimulation { ...@@ -315,10 +315,12 @@ struct cudaGmxSimulation {
float cellVolume; // Ewald parameter alpha (a.k.a. kappa) float cellVolume; // Ewald parameter alpha (a.k.a. kappa)
float alphaEwald; // Ewald parameter alpha (a.k.a. kappa) float alphaEwald; // Ewald parameter alpha (a.k.a. kappa)
float factorEwald; // - 1 ( 4 * alphaEwald * alphaEwald) float factorEwald; // - 1 ( 4 * alphaEwald * alphaEwald)
float selfEnergyEwald; //
int kmaxX; // Maximum number of reciprocal vectors in the X direction int kmaxX; // Maximum number of reciprocal vectors in the X direction
int kmaxY; // Maximum number of reciprocal vectors in the Y direction int kmaxY; // Maximum number of reciprocal vectors in the Y direction
int kmaxZ; // Maximum number of reciprocal vectors in the Z direction int kmaxZ; // Maximum number of reciprocal vectors in the Z direction
float reactionFieldK; // Constant for reaction field correction float reactionFieldK; // Constant for reaction field correction
float reactionFieldC; // Constant for reaction field correction
float probeRadius; // SASA probe radius float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor float surfaceAreaFactor; // ACE approximation surface area factor
float electricConstant; // ACE approximation electric constant float electricConstant; // ACE approximation electric constant
......
...@@ -555,6 +555,7 @@ void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDi ...@@ -555,6 +555,7 @@ void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDi
gpu->sim.nonbondedCutoff = cutoffDistance; gpu->sim.nonbondedCutoff = cutoffDistance;
gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance; gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f); gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
gpu->sim.reactionFieldC = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f);
} }
extern "C" extern "C"
...@@ -1440,6 +1441,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1440,6 +1441,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->bRemoveCM = false; gpu->bRemoveCM = false;
gpu->bRecalculateBornRadii = true; gpu->bRecalculateBornRadii = true;
gpu->bIncludeGBSA = false; gpu->bIncludeGBSA = false;
gpu->bReduceEnergies = true;
gpuInitializeRandoms(gpu); gpuInitializeRandoms(gpu);
// To be determined later // To be determined later
......
...@@ -77,6 +77,7 @@ struct _gpuContext { ...@@ -77,6 +77,7 @@ struct _gpuContext {
bool bRecalculateBornRadii; bool bRecalculateBornRadii;
bool bOutputBufferPerWarp; bool bOutputBufferPerWarp;
bool bIncludeGBSA; bool bIncludeGBSA;
bool bReduceEnergies;
unsigned long seed; unsigned long seed;
SM_VERSION sm_version; SM_VERSION sm_version;
CUDPPHandle cudpp; CUDPPHandle cudpp;
......
...@@ -124,19 +124,23 @@ void GetCalculateCDLJForcesSim(gpuContext gpu) ...@@ -124,19 +124,23 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
__global__ extern void kCalculateCDLJCutoffForces_12_kernel(); __global__ extern void kCalculateCDLJCutoffForces_12_kernel();
void kCalculateCDLJForces(gpuContext gpu) double kCalculateCDLJForces(gpuContext gpu)
{ {
// printf("kCalculateCDLJCutoffForces\n"); // printf("kCalculateCDLJCutoffForces\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
CUDPPResult result; CUDPPResult result;
switch (gpu->sim.nonbondedMethod) switch (gpu->sim.nonbondedMethod)
{ {
case NO_CUTOFF: case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
else else
kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
LAUNCHERROR("kCalculateCDLJN2Forces"); LAUNCHERROR("kCalculateCDLJN2Forces");
break; break;
case CUTOFF: case CUTOFF:
...@@ -155,10 +159,10 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -155,10 +159,10 @@ void kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
else else
kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
LAUNCHERROR("kCalculateCDLJCutoffForces"); LAUNCHERROR("kCalculateCDLJCutoffForces");
break; break;
case PERIODIC: case PERIODIC:
...@@ -177,10 +181,10 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -177,10 +181,10 @@ void kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
else else
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
LAUNCHERROR("kCalculateCDLJPeriodicForces"); LAUNCHERROR("kCalculateCDLJPeriodicForces");
break; break;
case EWALD: case EWALD:
...@@ -199,17 +203,17 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -199,17 +203,17 @@ void kCalculateCDLJForces(gpuContext gpu)
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
else else
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
LAUNCHERROR("kCalculateCDLJEwaldDirectForces"); LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
if (false) if (false)
{ {
// O(N2) Ewald summation // O(N2) Ewald summation
kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces"); LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
} }
else else
{ {
// Fast Ewald summation // Fast Ewald summation
...@@ -220,4 +224,19 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -220,4 +224,19 @@ void kCalculateCDLJForces(gpuContext gpu)
} }
} }
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
float *energies = (float*) malloc(sizeof(float) * N);
memset(energies, 0, sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
/* printf("\n");
for (int i = 0; i < N; i++) printf("\t%f", energies[i]);
printf("\n");
*/
}
cudaFree(energies_dev);
return sum;
} }
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit) __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, float * gpu_energies)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
...@@ -38,6 +38,9 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -38,6 +38,9 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int energyIndex = blockIdx.x * blockDim.x + threadIdx.x;
float CDLJ_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block]; float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
...@@ -108,24 +111,36 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -108,24 +111,36 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
sig6 = sig2 * sig2 * sig2; sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps; eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ); dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
/* E */
CDLJ_energy += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else #else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif #endif
#else #else
dEdR += apos.w * psA[j].q * invR; dEdR += apos.w * psA[j].q * invR;
/* E */
CDLJ_energy += apos.w * psA[j].q * invR;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
} }
#endif #endif
/* E */
energy += CDLJ_energy / 2.0;
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
dz *= dEdR; dz *= dEdR;
...@@ -157,16 +172,35 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -157,16 +172,35 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
sig6 = sig2 * sig2 * sig2; sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps; eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ); dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else /* E */
//CDLJ_energy += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
// direct space sum
// CDLJ_energy += 0.5f * apos.w * psA[j].q * erfc(alphaR) * invR;
// reciprocal space sum
CDLJ_energy = 0.5f * apos.w * psA[j].q / PI;
CDLJ_energy *= exp ( - alphaR * alphaR) / SQRT_PI;
// self Ewald Energy
// factor 2 necessary here, because energy is divided by 2 below
// CDLJ_energy += -cSim.epsfac * (psA[i].q * psA[i].q) * cSim.alphaEwald / SQRT_PI;
// if (r != 0) energy = cSim.alphaEwald;
// if (r != 0) energy = apos.w;
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif #endif
#else #else
dEdR += apos.w * psA[j].q * invR; dEdR += apos.w * psA[j].q * invR;
/* E */
CDLJ_energy += apos.w * psA[j].q * invR;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -176,7 +210,11 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -176,7 +210,11 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
#endif #endif
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
} }
/* E */
energy += CDLJ_energy / 2.0;
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
dz *= dEdR; dz *= dEdR;
...@@ -255,24 +293,36 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -255,24 +293,36 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
sig6 = sig2 * sig2 * sig2; sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps; eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ); dEdR += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else #else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif #endif
#else #else
dEdR += apos.w * psA[tj].q * invR; dEdR += apos.w * psA[tj].q * invR;
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
} }
#endif #endif
/* E */
energy += CDLJ_energy;
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
dz *= dEdR; dz *= dEdR;
...@@ -310,24 +360,35 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -310,24 +360,35 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
sig6 = sig2 * sig2 * sig2; sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps; eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ); dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
CDLJ_energy += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else #else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif #endif
#else #else
dEdR += apos.w * psA[j].q * invR; dEdR += apos.w * psA[j].q * invR;
/* E */
CDLJ_energy += apos.w * psA[j].q * invR;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
} }
#endif #endif
/* E */
energy += CDLJ_energy;
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
dz *= dEdR; dz *= dEdR;
...@@ -401,16 +462,24 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -401,16 +462,24 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
sig6 = sig2 * sig2 * sig2; sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps; eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
#ifdef USE_EWALD #ifdef USE_EWALD
float r = sqrt(r2); float r = sqrt(r2);
float alphaR = cSim.alphaEwald * r; float alphaR = cSim.alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ); dEdR += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else #else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJ_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#endif #endif
#else #else
dEdR += apos.w * psA[tj].q * invR; dEdR += apos.w * psA[tj].q * invR;
/* E */
CDLJ_energy += apos.w * psA[tj].q * invR;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -419,8 +488,12 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -419,8 +488,12 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
if (!(excl & 0x1)) if (!(excl & 0x1))
#endif #endif
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJ_energy = 0.0f;
} }
/* E */
energy += CDLJ_energy;
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
dz *= dEdR; dz *= dEdR;
...@@ -468,4 +541,5 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni ...@@ -468,4 +541,5 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
pos++; pos++;
} }
gpu_energies[energyIndex] = energy;
} }
...@@ -104,10 +104,14 @@ extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel(); ...@@ -104,10 +104,14 @@ extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*); extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*); extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
void kCalculateCDLJObcGbsaForces1(gpuContext gpu) double kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{ {
// printf("kCalculateCDLJObcGbsaForces1\n"); // printf("kCalculateCDLJObcGbsaForces1\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
// check if Born radii need to be calculated // check if Born radii need to be calculated
kClearBornForces(gpu); kClearBornForces(gpu);
...@@ -122,10 +126,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -122,10 +126,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
else else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, energies_dev);
LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1"); LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
break; break;
case CUTOFF: case CUTOFF:
...@@ -149,10 +153,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -149,10 +153,10 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
else else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
break; break;
case PERIODIC: case PERIODIC:
...@@ -176,11 +180,24 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu) ...@@ -176,11 +180,24 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
} }
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
else else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, energies_dev);
LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
break; break;
} }
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
float *energies = (float*) malloc(sizeof(float) * N);
// double sum = kReduceNonbondEnergies(gpu, energies_dev);
// printf("GPU____Sum: %f\n", sum);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
// printf("ControlSum: %f\n", sum);
}
cudaFree(energies_dev);
return sum;
} }
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, float * gpu_energies)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
...@@ -38,6 +38,9 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -38,6 +38,9 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int energyIndex = blockIdx.x * blockDim.x + threadIdx.x;
float CDLJObcGbsa_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block]; float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif #endif
...@@ -98,10 +101,17 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -98,10 +101,17 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps; float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else #else
dEdR += apos.w * psA[j].q * invR; float factorX = apos.w * psA[j].q * invR;
dEdR += factorX;
/* E */
CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
...@@ -115,13 +125,21 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -115,13 +125,21 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij); float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br; af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
/* E */
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
} /* E */
CDLJObcGbsa_energy = 0.0f;
}
#endif #endif
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy / 2.0f;
}
// Add Forces // Add Forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -156,15 +174,24 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -156,15 +174,24 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps; float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else #else
dEdR += apos.w * psA[j].q * invR; float factorX = apos.w * psA[j].q * invR;
dEdR += factorX;
/* E */
CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
if (!(excl & 0x1)) if (!(excl & 0x1))
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
// ObcGbsaForce1 part // ObcGbsaForce1 part
...@@ -177,18 +204,28 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -177,18 +204,28 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij); float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br; af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
/* E */
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#if defined USE_PERIODIC #if defined USE_PERIODIC
if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr) if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
#elif defined USE_CUTOFF #elif defined USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
#endif #endif
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy / 2.0f;
}
// Add Forces // Add Forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -271,10 +308,17 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -271,10 +308,17 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps; float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else #else
dEdR += apos.w * psA[tj].q * invR; float factorX = apos.w * psA[tj].q * invR;
dEdR += factorX;
/* E */
CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
...@@ -289,13 +333,21 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -289,13 +333,21 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
af.w += dGpol_dalpha2_ij * psA[tj].br; af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br; psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
/* E */
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
#endif #endif
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy;
}
// Add forces // Add forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -336,10 +388,17 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -336,10 +388,17 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps; float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJObcGbsa_energy += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else #else
dEdR += apos.w * psA[j].q * invR; float factorX = apos.w * psA[j].q * invR;
dEdR += factorX;
/* E */
CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
...@@ -353,6 +412,8 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -353,6 +412,8 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij); float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br; af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
/* E */
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
// Sum the Born forces. // Sum the Born forces.
...@@ -371,9 +432,15 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -371,9 +432,15 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
#endif #endif
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy;
}
// Add forces // Add forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -446,15 +513,24 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -446,15 +513,24 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps; float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6; float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
/* E */
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2); dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
/* E */
CDLJObcGbsa_energy += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
#else #else
dEdR += apos.w * psA[tj].q * invR; float factorX = apos.w * psA[tj].q * invR;
dEdR += factorX;
/* E */
CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
if (!(excl & 0x1)) if (!(excl & 0x1))
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
// ObcGbsaForce1 part // ObcGbsaForce1 part
...@@ -468,18 +544,28 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -468,18 +544,28 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
af.w += dGpol_dalpha2_ij * psA[tj].br; af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br; psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm); dEdR += Gpol * (1.0f - 0.25f * expTerm);
/* E */
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#if defined USE_PERIODIC #if defined USE_PERIODIC
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr) if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
#elif defined USE_CUTOFF #elif defined USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
CDLJObcGbsa_energy = 0.0f;
} }
#endif #endif
/* E */
if (i < cSim.atoms)
{
energy += CDLJObcGbsa_energy;
}
// Add forces // Add forces
dx *= dEdR; dx *= dEdR;
dy *= dEdR; dy *= dEdR;
...@@ -529,4 +615,5 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* ...@@ -529,4 +615,5 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
} }
pos++; pos++;
} }
gpu_energies[energyIndex] = energy;
} }
...@@ -63,6 +63,13 @@ static __constant__ cudaGmxSimulation cSim; ...@@ -63,6 +63,13 @@ static __constant__ cudaGmxSimulation cSim;
dEdR = param.y * deltaIdeal; \ dEdR = param.y * deltaIdeal; \
} }
#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
float angle = acos(cosine); \
float deltaIdeal = angle - (param.x * (3.14159265f / 180.0f)); \
dEdR = param.y * deltaIdeal * deltaIdeal; \
}
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \ #define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \ { \
float dp; \ float dp; \
...@@ -109,11 +116,14 @@ void GetCalculateLocalForcesSim(gpuContext gpu) ...@@ -109,11 +116,14 @@ void GetCalculateLocalForcesSim(gpuContext gpu)
} }
__global__ void kCalculateLocalForces_kernel() __global__ void kCalculateLocalForces_kernel(float * gpu_energies)
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int energyIndex = pos;
Vectors* A = &sV[threadIdx.x]; Vectors* A = &sV[threadIdx.x];
float energy = 0.0f;
while (pos < cSim.bond_offset) while (pos < cSim.bond_offset)
{ {
if (pos < cSim.bonds) if (pos < cSim.bonds)
...@@ -128,6 +138,7 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -128,6 +138,7 @@ __global__ void kCalculateLocalForces_kernel()
float r2 = dx * dx + dy * dy + dz * dz; float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2); float r = sqrt(r2);
float deltaIdeal = r - bond.x; float deltaIdeal = r - bond.x;
/* E */ energy += bond.y * deltaIdeal * deltaIdeal / 2.0;
float dEdR = bond.y * deltaIdeal; float dEdR = bond.y * deltaIdeal;
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f; dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR); // printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
...@@ -149,7 +160,7 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -149,7 +160,7 @@ __global__ void kCalculateLocalForces_kernel()
} }
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
while (pos < cSim.bond_angle_offset) while (pos < cSim.bond_angle_offset)
{ {
unsigned int pos1 = pos - cSim.bond_offset; unsigned int pos1 = pos - cSim.bond_offset;
...@@ -174,6 +185,11 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -174,6 +185,11 @@ __global__ void kCalculateLocalForces_kernel()
float r23 = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2; float r23 = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
float dot = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2; float dot = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
float cosine = dot / sqrt(r21 * r23); float cosine = dot / sqrt(r21 * r23);
float angle_energy;
/* E */ GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy);
energy += angle_energy / 2.0;
float dEdR; float dEdR;
GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR); GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
//printf("%11.4f %11.4f\n", cosine, dEdR); //printf("%11.4f %11.4f\n", cosine, dEdR);
...@@ -211,7 +227,7 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -211,7 +227,7 @@ __global__ void kCalculateLocalForces_kernel()
} }
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
while (pos < cSim.dihedral_offset) while (pos < cSim.dihedral_offset)
{ {
unsigned int pos1 = pos - cSim.bond_angle_offset; unsigned int pos1 = pos - cSim.bond_angle_offset;
...@@ -236,6 +252,19 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -236,6 +252,19 @@ __global__ void kCalculateLocalForces_kernel()
GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle); GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
float4 dihedral = cSim.pDihedralParameter[pos1]; float4 dihedral = cSim.pDihedralParameter[pos1];
float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f); float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
// ATTENTION: This section leads to a divergent deltaAngle values wrt
// forces and energies. We separate the case dihedral.z = n = 0, which
// is treated by the calculation of energies via a harmonic potential
/* E */ if (dihedral.z) energy += dihedral.x * (1.0f + cos(deltaAngle));
/* E */ else
{
float deltaAngle = dihedralAngle - dihedral.y;
if (deltaAngle < -M_PI) deltaAngle += 2.0 * M_PI;
else if (deltaAngle > M_PI) deltaAngle -= 2.0 * M_PI;
energy += dihedral.x * deltaAngle * deltaAngle;
}
float sinDeltaAngle = sin(deltaAngle); float sinDeltaAngle = sin(deltaAngle);
float dEdAngle = -dihedral.x * dihedral.z * sinDeltaAngle; float dEdAngle = -dihedral.x * dihedral.z * sinDeltaAngle;
float normCross1 = DOT3(cp0, cp0); float normCross1 = DOT3(cp0, cp0);
...@@ -293,10 +322,11 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -293,10 +322,11 @@ __global__ void kCalculateLocalForces_kernel()
force.z += -internalF3.z - s.z; force.z += -internalF3.z - s.z;
cSim.pForce4[offset] = force; cSim.pForce4[offset] = force;
//printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]); //printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
} }
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
// Ryckaert Bellemans dihedrals
while (pos < cSim.rb_dihedral_offset) while (pos < cSim.rb_dihedral_offset)
{ {
unsigned int pos1 = pos - cSim.dihedral_offset; unsigned int pos1 = pos - cSim.dihedral_offset;
...@@ -336,17 +366,25 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -336,17 +366,25 @@ __global__ void kCalculateLocalForces_kernel()
float2 dihedral2 = cSim.pRbDihedralParameter2[pos1]; float2 dihedral2 = cSim.pRbDihedralParameter2[pos1];
float cosFactor = cosPhi; float cosFactor = cosPhi;
float dEdAngle = -dihedral1.y; float dEdAngle = -dihedral1.y;
/* E */ float rb_energy = dihedral1.x;
rb_energy += dihedral1.y * cosFactor;
// printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f); // printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
dEdAngle -= 2.0f * dihedral1.z * cosFactor; dEdAngle -= 2.0f * dihedral1.z * cosFactor;
// printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor); // printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi; cosFactor *= cosPhi;
dEdAngle -= 3.0f * dihedral1.w * cosFactor; dEdAngle -= 3.0f * dihedral1.w * cosFactor;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor); rb_energy += dihedral1.z * cosFactor;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi; cosFactor *= cosPhi;
dEdAngle -= 4.0f * dihedral2.x * cosFactor; dEdAngle -= 4.0f * dihedral2.x * cosFactor;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor); rb_energy += dihedral1.w * cosFactor;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi; cosFactor *= cosPhi;
dEdAngle -= 5.0f * dihedral2.y * cosFactor; dEdAngle -= 5.0f * dihedral2.y * cosFactor;
rb_energy += dihedral2.x * cosFactor;
rb_energy += dihedral2.y * cosFactor * cosPhi;
/* E */ energy += rb_energy;
// printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor); // printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
dEdAngle *= sin(dihedralAngle); dEdAngle *= sin(dihedralAngle);
// printf("%4d - f: %9.4f\n", pos1, dEdAngle); // printf("%4d - f: %9.4f\n", pos1, dEdAngle);
...@@ -430,6 +468,10 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -430,6 +468,10 @@ __global__ void kCalculateLocalForces_kernel()
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6; float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
/* E */
energy = LJ14.x * (sig6 - 1.0f) * sig6;
energy += LJ14.z * inverseR;
dEdR += LJ14.z * inverseR; dEdR += LJ14.z * inverseR;
dEdR *= inverseR * inverseR; dEdR *= inverseR * inverseR;
unsigned int offsetA = atom.x + atom.z * cSim.stride; unsigned int offsetA = atom.x + atom.z * cSim.stride;
...@@ -453,6 +495,7 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -453,6 +495,7 @@ __global__ void kCalculateLocalForces_kernel()
} }
else if (cSim.nonbondedMethod == CUTOFF) else if (cSim.nonbondedMethod == CUTOFF)
{ {
float LJ14_energy;
while (pos < cSim.LJ14_offset) while (pos < cSim.LJ14_offset)
{ {
unsigned int pos1 = pos - cSim.rb_dihedral_offset; unsigned int pos1 = pos - cSim.rb_dihedral_offset;
...@@ -471,13 +514,21 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -471,13 +514,21 @@ __global__ void kCalculateLocalForces_kernel()
float sig2 = inverseR * LJ14.y; float sig2 = inverseR * LJ14.y;
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6; float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
/* E */
LJ14_energy = LJ14.x * (sig6 - 1.0f) * sig6;
LJ14_energy += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2); dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR; dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
LJ14_energy = 0.0f;
} }
/* E */
energy += LJ14_energy;
unsigned int offsetA = atom.x + atom.z * cSim.stride; unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride; unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA]; float4 forceA = cSim.pForce4[offsetA];
...@@ -499,6 +550,7 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -499,6 +550,7 @@ __global__ void kCalculateLocalForces_kernel()
} }
else if (cSim.nonbondedMethod == PERIODIC) else if (cSim.nonbondedMethod == PERIODIC)
{ {
float LJ14_energy;
while (pos < cSim.LJ14_offset) while (pos < cSim.LJ14_offset)
{ {
unsigned int pos1 = pos - cSim.rb_dihedral_offset; unsigned int pos1 = pos - cSim.rb_dihedral_offset;
...@@ -521,12 +573,21 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -521,12 +573,21 @@ __global__ void kCalculateLocalForces_kernel()
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2; float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6; float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
/* E */
LJ14_energy = LJ14.x * (sig6 - 1.0f) * sig6;
LJ14_energy += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2); dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR; dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr) if (r2 > cSim.nonbondedCutoffSqr)
{ {
dEdR = 0.0f; dEdR = 0.0f;
/* E */
LJ14_energy = 0.0f;
} }
/* E */
energy += LJ14_energy;
unsigned int offsetA = atom.x + atom.z * cSim.stride; unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride; unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA]; float4 forceA = cSim.pForce4[offsetA];
...@@ -546,13 +607,30 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -546,13 +607,30 @@ __global__ void kCalculateLocalForces_kernel()
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
} }
gpu_energies[energyIndex] = energy;
} }
void kCalculateLocalForces(gpuContext gpu)
double kCalculateLocalForces(gpuContext gpu)
{ {
// printf("kCalculateLocalForces\n"); // printf("kCalculateLocalForces\n");
kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>(); unsigned int N = gpu->sim.blocks * gpu->sim.localForces_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>(energies_dev);
LAUNCHERROR("kCalculateLocalForces"); LAUNCHERROR("kCalculateLocalForces");
double sum = 0.0f;
if (gpu->bReduceEnergies)
{
// sum = kReduceLocalEnergies(gpu, energies_dev);
float *energies = (float*) malloc(sizeof(float) * N);
memset(energies, 0, sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
}
cudaFree(energies_dev);
return sum;
} }
...@@ -254,10 +254,57 @@ void kReduceForces(gpuContext gpu) ...@@ -254,10 +254,57 @@ void kReduceForces(gpuContext gpu)
LAUNCHERROR("kReduceForces"); LAUNCHERROR("kReduceForces");
} }
__global__ void kReduceEnergies_kernel(float * gpu_energies)
{
// GPU energy reduction algorithm goes here
// unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
}
double kReduceLocalEnergies(gpuContext gpu, float * gpu_energies)
{
// printf("kReduceLocalEnergies\n");
unsigned int N = gpu->sim.blocks * gpu->sim.localForces_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceLocalEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
return sum;
}
double kReduceNonbondEnergies(gpuContext gpu, float * gpu_energies)
{
// printf("kReduceNonbondEnergies\n");
unsigned int N = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceNonbondEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
return sum;
}
__global__ void kReduceObcGbsaBornForces_kernel() double kReduceObcGbsaBornEnergies(gpuContext gpu, float * gpu_energies)
{
// printf("kReduceObcGbsaBornEnergies\n");
unsigned int N = gpu->sim.blocks * gpu->sim.bf_reduce_threads_per_block;
float *energies = (float*) malloc(sizeof(float) * N);
//kReduceEnergies_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>(gpu_energies);
LAUNCHERROR("kReduceObcGbsaBornEnergies");
cudaMemcpy(energies, gpu_energies, sizeof(float) * N, cudaMemcpyDeviceToHost);
double sum = 0.0f;
for (int i = 0; i < N; i++) sum += energies[i];
return sum;
}
__global__ void kReduceObcGbsaBornForces_kernel(float * gpu_energies)
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
unsigned int energyIndex = pos;
float energy = 0.0f;
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
float bornRadius = cSim.pBornRadii[pos]; float bornRadius = cSim.pBornRadii[pos];
...@@ -299,19 +346,40 @@ __global__ void kReduceObcGbsaBornForces_kernel() ...@@ -299,19 +346,40 @@ __global__ void kReduceObcGbsaBornForces_kernel()
float saTerm = cSim.surfaceAreaFactor * r * r * ratio6; float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;
totalForce += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX totalForce += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX
/* E */
energy += saTerm;
totalForce *= bornRadius * bornRadius * obcChain; totalForce *= bornRadius * bornRadius * obcChain;
pFt = cSim.pBornForce + pos; pFt = cSim.pBornForce + pos;
*pFt = totalForce; *pFt = totalForce;
pos += gridDim.x * blockDim.x; pos += gridDim.x * blockDim.x;
} }
/* E */
// correct for surface area factor of -6
gpu_energies[energyIndex] = energy / -6.0f;
} }
void kReduceObcGbsaBornForces(gpuContext gpu) double kReduceObcGbsaBornForces(gpuContext gpu)
{ {
unsigned int N = gpu->sim.blocks * gpu->sim.bf_reduce_threads_per_block;
float *energies_dev;
cudaMalloc((void**) &energies_dev, sizeof(float) * N);
//printf("kReduceObcGbsaBornForces\n"); //printf("kReduceObcGbsaBornForces\n");
kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>(); kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>(energies_dev);
LAUNCHERROR("kReduceObcGbsaBornForces"); LAUNCHERROR("kReduceObcGbsaBornForces");
float sum = 0.0f;
if (gpu->bReduceEnergies)
{
// double sum = kReduceObcGbsaBornEnergies(gpu, energies_dev);
float *energies = (float*) malloc(sizeof(float) * N);
cudaMemcpy(energies, energies_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) sum += energies[i];
}
cudaFree(energies_dev);
return sum;
} }
...@@ -257,6 +257,7 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -257,6 +257,7 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
if( totalEnergy ) if( totalEnergy )
*totalEnergy -= selfEwaldEnergy; *totalEnergy -= selfEwaldEnergy;
// printf("- selfEwaldEnergy: %f aEwald %f p1 %f\n", -selfEwaldEnergy, alphaEwald, atomParameters[atomID][QIndex]);
if( energyByAtom ){ if( energyByAtom ){
energyByAtom[atomID] -= selfEwaldEnergy; energyByAtom[atomID] -= selfEwaldEnergy;
...@@ -266,9 +267,9 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -266,9 +267,9 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// ************************************************************************************** // **************************************************************************************
// RECIPROCAL SPACE EWALD ENERGY AND FORCES // RECIPROCAL SPACE EWALD ENERGY AND FORCES
// ************************************************************************************** // **************************************************************************************
RealOpenMM tempStore = *totalEnergy;
// PME // PME
if (pme) { if (pme) {
pme_t pmedata; /* abstract handle for PME data */ pme_t pmedata; /* abstract handle for PME data */
int ngrid[3]; int ngrid[3];
...@@ -409,6 +410,8 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -409,6 +410,8 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
SimTKOpenMMLog::printError( message ); SimTKOpenMMLog::printError( message );
} }
// printf("recipEnergy: %f\n", *totalEnergy - tempStore);
// ************************************************************************************** // **************************************************************************************
// SHORT-RANGE ENERGY AND FORCES // SHORT-RANGE ENERGY AND FORCES
// ************************************************************************************** // **************************************************************************************
...@@ -448,10 +451,12 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -448,10 +451,12 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// accumulate energies // accumulate energies
realSpaceEwaldEnergy = atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR); realSpaceEwaldEnergy = atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR);
// printf("realSpaceEwald: %f p1 %f p2 %f invR %f erfc(aR) %f aR %f\n", realSpaceEwaldEnergy, atomParameters[ii][QIndex], atomParameters[jj][QIndex], inverseR, erfc(alphaR), alphaR);
vdwEnergy = eps*(sig6-one)*sig6; vdwEnergy = eps*(sig6-one)*sig6;
if( totalEnergy ) if( totalEnergy )
*totalEnergy += realSpaceEwaldEnergy + vdwEnergy; *totalEnergy += realSpaceEwaldEnergy + vdwEnergy;
if( energyByAtom ){ if( energyByAtom ){
energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy; energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy;
......
...@@ -416,7 +416,7 @@ int CpuObc::computeBornEnergyForces( RealOpenMM* bornRadii, RealOpenMM** atomCoo ...@@ -416,7 +416,7 @@ int CpuObc::computeBornEnergyForces( RealOpenMM* bornRadii, RealOpenMM** atomCoo
if( includeAceApproximation() ){ if( includeAceApproximation() ){
computeAceNonPolarForce( obcParameters, bornRadii, &obcEnergy, bornForces ); computeAceNonPolarForce( obcParameters, bornRadii, &obcEnergy, bornForces );
} }
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
// first main loop // first main loop
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment