Commit 09c1460d authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Threads per block for GB/VI kernels take into account required shared memory

Padded atom entries in Born radii memory now initialized to zero
launch bounds guards now included for several kernels
parent dbea5152
......@@ -255,6 +255,29 @@ void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){
}
/**---------------------------------------------------------------------------------------
Get threads/block
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerBlock shared memory/block
@return threadsPerBlock
--------------------------------------------------------------------------------------- */
extern "C"
unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
{
unsigned int grid = freeEnergyGpu->gpuContext->grid;
unsigned int threadsPerBlock = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
threadsPerBlock = threadsPerBlock < 1 ? 1 : threadsPerBlock;
threadsPerBlock *= grid;
return threadsPerBlock;
}
static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){
*x = cellCode >> 17;
*y = (cellCode >> 2 ) & 0x7FFF;
......@@ -275,8 +298,9 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin
//unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int numWorkUnits = gpu->psInteractionCount->_pSysData[0];
(void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u\n",
totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID, numWorkUnits );
(void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u warpOutput=%d\n",
totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID,
numWorkUnits, gpu->bOutputBufferPerWarp );
unsigned int maxPrint = 3;
std::stringstream message;
......@@ -309,8 +333,13 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin
x *= GRID;
y *= GRID;
unsigned int bufferX = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (x >> GRIDBITS) * gpu->sim.stride;
unsigned int bufferY = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (y >> GRIDBITS) * gpu->sim.stride;
bufferX /= gpu->sim.paddedNumberOfAtoms;
bufferY /= gpu->sim.paddedNumberOfAtoms;
if( jj == 1 ){
(void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] ", ii, jj, warp, pos, end );
//if( bufferX >= 62 || bufferY >= 62 ){
(void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] bufXY[%4u %4u]", ii, jj, warp, pos, end, bufferX, bufferY );
message << buffer;
(void) sprintf( buffer, " x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion );
message << buffer;
......
......@@ -56,5 +56,6 @@ typedef struct _freeEnergyGpuContext *freeEnergyGpuContext;
extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu );
extern "C" void freeEnergyGpuShutDown(freeEnergyGpuContext gpu);
extern "C" void freeEnergyGpuSetConstants(freeEnergyGpuContext gpu);
extern "C" unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );
#endif // __FREE_ENERGY_GPUTYPES_H__
......@@ -28,6 +28,7 @@
#include "kernels/cudatypes.h"
#include "kernels/cudaKernels.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "openmm/OpenMMException.h"
#include <stdio.h>
#include <cuda.h>
......@@ -36,6 +37,7 @@
#include <sstream>
#define USE_SOFTCORE_LJ
//#define DEBUG
struct Atom {
float x;
......@@ -104,6 +106,7 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
/**
*
* Calculate Born radii and first GBSA loop forces/energy
......@@ -113,16 +116,29 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu
*/
void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext freeEnergyGpu )
{
// printf("kCalculateCDLJObcGbsaForces1\n");
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateCDLJObcGbsaSoftcoreForces1 method index invalid." );
}
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
//fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e blks=%u thread/.block=%u nbMethod==%d warp=%u\n",
// gpu->sim.nonbondedCutoffSqr, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
// freeEnergyGpu->freeEnergySim.nonbondedMethod, gpu->bOutputBufferPerWarp);
//#define DEBUG
#ifdef DEBUG
fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr );
fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 blks=%u thread/block=%u %u shMem=%u nbMethod==%d warp=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->sim.nonbond_threads_per_block, sizeof(Atom)*threadsPerBlock,
freeEnergyGpu->freeEnergySim.nonbondedMethod, gpu->bOutputBufferPerWarp);
int psize = gpu->sim.paddedNumberOfAtoms;
CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
......@@ -133,12 +149,12 @@ float atomicRadii;
showWorkUnitsFreeEnergy( freeEnergyGpu, 1 );
for( int ii = 0; ii < psize; ii++ ){
pdE1->_pSysData[ii].x = 0.001f;
pdE1->_pSysData[ii].x = 0.0f;
pdE1->_pSysData[ii].y = 0.001f;
pdE1->_pSysData[ii].z = 0.001f;
pdE1->_pSysData[ii].w = 0.001f;
pdE2->_pSysData[ii].x = 0.001f;
pdE2->_pSysData[ii].x = 0.0f;
pdE2->_pSysData[ii].y = 0.001f;
pdE2->_pSysData[ii].z = 0.001f;
pdE2->_pSysData[ii].w = 0.001f;
......@@ -153,19 +169,23 @@ pdE2->Upload();
// use softcore LJ potential
#ifdef DEBUG
(void) fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 ver=%u blks=%u threadsPerBlock=%u shMem=%u %u wrp=%u\n", gpu->sm_version,
gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Atom)*threadsPerBlock, gpu->sharedMemoryPerBlock, gpu->bOutputBufferPerWarp ); fflush( stderr );
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit,pdE1->_pDevData, pdE2->_pDevData);
kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit,pdE1->_pDevData, pdE2->_pDevData);
else
kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
#else
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
else
kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
#endif
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreForces1");
......@@ -174,52 +194,102 @@ pdE2->Upload();
case FREE_ENERGY_CUTOFF:
#ifdef DEBUG
(void) fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreCutoffForces1 %6d blks=%u nonbond_threads_per_block=%5u shMem=%5u\n",
gpu->natoms, gpu->sim.nonbond_blocks, threadsPerBlock, (sizeof(Atom)+sizeof(float))*threadsPerBlock);
(void) fflush( stderr );
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
else
kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
#else
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
#endif
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreCutoffForces1");
break;
case FREE_ENERGY_PERIODIC:
#ifdef DEBUG
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
else
kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
#else
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
#endif
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcorePeriodicForces1");
break;
}
#ifdef DEBUG
/*
gpu->psBornForce->Download();
freeEnergyGpu->psSwitchDerivative->Download();
gpu->psBornRadii->Download();
fprintf( stderr, "XX bR=%15.7e swd=%15.7e\n", gpu->psBornRadii->_pSysData[0], freeEnergyGpu->psSwitchDerivative->_pSysData[0] );
for( int ii = 0; ii < gpu->sim.nonbondOutputBuffers; ii++ ){
fprintf( stderr, "strx %4d %15.7e %15.7e %15.7e %15.7e\n", ii,
gpu->psBornForce->_pSysStream[ii][0],
gpu->psBornForce->_pSysStream[ii][1],
gpu->psBornForce->_pSysStream[ii][2],
gpu->psBornForce->_pSysStream[ii][3] );
if( gpu->natoms > 1984 ){
int idx = 1983;
fprintf( stderr, "stry %4d %15.7e %15.7e %15.7e %15.7e %5d\n", ii,
gpu->psBornForce->_pSysStream[ii][idx+0],
gpu->psBornForce->_pSysStream[ii][idx+1],
gpu->psBornForce->_pSysStream[ii][idx+2],
gpu->psBornForce->_pSysStream[ii][idx+3], idx );
}
}
int bufferI = 62;
if( bufferI < gpu->sim.nonbondOutputBuffers ){
fprintf( stderr, "BufferI %4d \n", bufferI );
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
fprintf( stderr, "strz %4d %15.7e\n", ii, gpu->psBornForce->_pSysStream[bufferI][ii] );
}
}
*/
pdE1->Download();
pdE2->Download();
gpu->psPosq4->Download();
gpu->psGBVIData->Download();
gpu->psBornRadii->Download();
gpu->psBornForce->Download();
freeEnergyGpu->psSwitchDerivative->Download();
fprintf( stderr, "PdeCud %d\n", TARGET );
bF = 0.0;
for( int ii = 0; ii < gpu->natoms; ii++ ){
int count =0;
for( int ii = 0; ii < psize; ii++ ){
bF += pdE1->_pSysData[ii].x;
if( fabs( pdE1->_pSysData[ii].w ) > 1.0e-03 ){
fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e\n", ii,
if( fabs( pdE1->_pSysData[ii].w ) > 1.0e-03 && fabs( pdE1->_pSysData[ii].x ) > 0.0 ){
count++;
fprintf( stderr, "%4d %4d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e bF=%15.7e swd=%15.7e\n", count, ii,
pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w,
pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w );
pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w, gpu->psBornForce->_pSysData[ii],
freeEnergyGpu->psSwitchDerivative->_pSysData[ii] );
}
}
bR = gpu->psBornRadii->_pSysData[TARGET];
......@@ -228,7 +298,7 @@ ratio = (atomicRadii/bR);
bF1 = bF + (3.0f*gpu->psGBVIData->_pSysData[TARGET].z*ratio*ratio*ratio)/bR;
b2 = bR*bR;
bF1 *= (1.0f/3.0f)*b2*b2;
fprintf( stderr, "sumbF Cud %6d %15.7e %15.7e %15.7e\n", TARGET, bF, bF1, bR);
fprintf( stderr, "sumbF Cud %6d count=%d %15.7e %15.7e %15.7e\n", TARGET, count, bF, bF1, bR);
#endif
}
......@@ -37,7 +37,7 @@
#endif
#undef TARGET
//#define TARGET 0
#define TARGET 1926
__global__
#if (__CUDA_ARCH__ >= 200)
......@@ -47,12 +47,18 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
#ifdef DEBUG
void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
#else
void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit )
#endif
{
//void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
//unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
//unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
......@@ -166,23 +172,22 @@ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* wo
af.w += dGpol_dalpha2_ij * psA[j].br;
/*
#ifdef DEBUG
int jIdx = j;
if( i == TARGET ){
int tjj = y+jIdx;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = (q2 * psA[j].q) / denominator;
pdE1[tjj].w = 1.0f;
}
if( (y+jIdx) == TARGET ){
int tjj = i;
pdE1[tjj].x = dEdR - Gpol * (1.0f - 0.25f * expTerm);
pdE1[tjj].y = r2;
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].w = -1.0f;
} */
}
#endif
energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces
......@@ -248,7 +253,6 @@ pdE1[tjj].w = -1.0f;
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
//float dEdRx = dEdR;
// ObcGbsaForce1 part
......@@ -271,28 +275,23 @@ pdE1[tjj].w = -1.0f;
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
//dEdRx = 0.0f;
}
/*
#ifdef DEBUG
int jIdx = j;
if( i == TARGET ){
int tjj = (y+jIdx);
pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = (q2 * psA[j].q) / denominator;
pdE1[tjj].x = dEdRx;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
pdE1[tjj].w = 2.0f;
}
if( (y+jIdx) == TARGET ){
int tjj = i;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = (q2 * psA[j].q) / denominator;
pdE1[tjj].w = -2.0f;
} */
}
#endif
af.w += dGpol_dalpha2_ij * psA[j].br;
energy += 0.5f*CDLJObcGbsa_energy;
......@@ -361,7 +360,8 @@ pdE1[tjj].w = -2.0f;
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
//else if (flags == 0xFFFFFFFF)
else if (flags)
#endif
{
// Compute all interactions within this block.
......@@ -401,7 +401,6 @@ pdE1[tjj].w = -2.0f;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
//float dEdRx = dEdR;
// ObcGbsaForce1 part
......@@ -423,7 +422,6 @@ pdE1[tjj].w = -2.0f;
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
//dEdRx = 0.0f;
}
psA[tj].fb += dGpol_dalpha2_ij * br;
af.w += dGpol_dalpha2_ij * psA[tj].br;
......@@ -431,22 +429,21 @@ pdE1[tjj].w = -2.0f;
/*
#ifdef DEBUG
int jIdx = tj;
if( i == TARGET ){
int tjj = y+jIdx;
pdE1[tjj].x = dEdRx;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = -dEdRx*dz;
pdE1[tjj].w = 3.0f;
}
if( (y+jIdx) == TARGET ){
int tjj = i;
pdE1[tjj].x = dEdRx;
pdE1[tjj].x = dGpol_dalpha2_ij * br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = dEdRx*dz;
pdE1[tjj].w = -3.0f;
} */
}
#endif
......@@ -547,22 +544,21 @@ pdE1[tjj].w = -3.0f;
if (tgx == 0)
psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
/*
#ifdef DEBUG
int jIdx = j;
if( i == TARGET ){
int tjj = y+jIdx;
pdE1[tjj].x = dEdR;
pdE1[tjj].y = r2;
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
pdE1[tjj].w = -4.7f;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].w = 4.0f;
}
if( (y+jIdx) == TARGET ){
int tjj = i;
pdE1[tjj].x = dEdR;
pdE1[tjj].y = r2;
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
pdE1[tjj].w = -4.7f;
} */
pdE1[tjj].x = tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].w = -4.0f;
}
#endif
......@@ -658,7 +654,7 @@ pdE1[tjj].w = -4.7f;
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
//float dEdRx = dEdR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
......@@ -679,25 +675,25 @@ pdE1[tjj].w = -4.7f;
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
//dEdRx = 0.0;
}
/*
#ifdef DEBUG
int jIdx = tj;
if( i == TARGET ){
int tjj = y+jIdx;
pdE1[tjj].x = dEdRx;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[tj].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = dEdRx*dz;
pdE1[tjj].z = dGpol_dalpha2_ij;
pdE1[tjj].w = 6.0f;
}
if( (y+jIdx) == TARGET ){
int tjj = i;
pdE1[tjj].x = dEdRx;
pdE1[tjj].x = dGpol_dalpha2_ij * br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = dEdRx*dz;
pdE1[tjj].z = dGpol_dalpha2_ij;
pdE1[tjj].w = -6.0f;
} */
}
#endif
......
......@@ -67,7 +67,7 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
static const float electricConstant = -166.02691f;
double tau = ((1.0f/innerDielectric)-(1.0f/solventDielectric));
freeEnergyGpu->psSwitchDerivative = new CUDAStream<float>( numberOfParticles, 1, "SwitchDerivative");
freeEnergyGpu->psSwitchDerivative = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "SwitchDerivative");
freeEnergyGpu->freeEnergySim.pSwitchDerivative = freeEnergyGpu->psSwitchDerivative->_pDevData;
// create gpuGBVISoftcore, load parameters, and track minimum softcore value
......@@ -92,6 +92,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psGBVIData)[ii].y = scaledRadii[ii];
(*gpu->psGBVIData)[ii].z = tau*gamma[ii];
(*gpu->psGBVIData)[ii].w = bornRadiusScaleFactors[ii];
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
}
// Dummy out extra atom data
......@@ -101,6 +103,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psGBVIData)[ii].y = 0.01f;
(*gpu->psGBVIData)[ii].z = 0.0f;
(*gpu->psGBVIData)[ii].w = 0.0f;
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
}
gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
......@@ -141,6 +145,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
}
gpu->psGBVIData->Upload();
gpu->psBornRadii->Upload();
freeEnergyGpu->psSwitchDerivative->Upload();
return;
}
......@@ -170,7 +176,15 @@ void kClearGBVISoftcoreBornSum(gpuContext gpu) {
kClearGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, 384>>>();
}
__global__ void kReduceGBVISoftcoreBornForces_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVISoftcoreBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
......@@ -233,7 +247,15 @@ void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext freeEnergyGpu )
}
__global__ void kReduceGBVISoftcoreBornSum_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVISoftcoreBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......@@ -362,7 +384,15 @@ void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel");
}
__global__ void kReduceGBVIBornForcesQuinticScaling_kernel()
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVIBornForcesQuinticScaling_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
......@@ -418,9 +448,12 @@ __global__ void kReduceGBVIBornForcesQuinticScaling_kernel()
void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
{
//printf("kReduceObcGbsaBornForces\n");
gpuContext gpu = freeEnergyGpu->gpuContext;
kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
//(void) fprintf( stderr, "kReduceObcGbsaBornForces %6d blks=%u bsf_reduce_threads_per_block=%5u %5u %5u %5u %5u\n",
// gpu->natoms, gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block, gpu->sim.bf_reduce_threads_per_block,
// GF1XX_THREADS_PER_BLOCK, GT2XX_THREADS_PER_BLOCK, G8X_THREADS_PER_BLOCK); fflush( stderr );
kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling");
}
......@@ -441,7 +474,7 @@ void kPrintGBVISoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId,
sigEps4->Download();
(void) fprintf( log, "kPrintGBViSoftcore Cuda comp bR bF swd prm sigeps4\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n",
ii,
gpu->psBornRadii->_pSysData[ii],
......@@ -474,10 +507,28 @@ extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int
void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
{
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreBornSum method index invalid." );
}
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
#ifdef DEBUG
fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr );
fprintf( stderr, "kCalculateGBVISoftcoreBornSum blks=%u threadsPerBlock=%u %u shMem=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->sim.nonbond_threads_per_block, (sizeof(Atom)+sizeof(float))*threadsPerBlock ); fflush( stderr );
int psize = gpu->sim.paddedNumberOfAtoms;
CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
......@@ -511,19 +562,19 @@ pdE2->Upload();
#ifdef DEBUG
if (gpu->bOutputBufferPerWarp){
kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
} else {
kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
}
#else
if (gpu->bOutputBufferPerWarp){
kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
} else {
kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
}
#endif
......@@ -536,31 +587,31 @@ pdE2->Upload();
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
#ifdef DEBUG
(void) fprintf( stderr, "kCalculateGBVISoftcoreBornSum cutoff=%15.7e warp=%u GridBoundingBox.length=%u interaction_blocks=%u interaction_threads_per_block=%u nonbond_blocks=%u nonbond_threads_per_block=%u\n",
gpu->sim.nonbondedCutoffSqr, gpu->bOutputBufferPerWarp, gpu->psGridBoundingBox->_length, gpu->sim.interaction_blocks,
gpu->sim.interaction_threads_per_block, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block ); fflush( stderr );
gpu->sim.interaction_threads_per_block, gpu->sim.nonbond_blocks, threadsPerBlock ); fflush( stderr );
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
else
kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
#else
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
break;
kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
#endif
break;
case FREE_ENERGY_PERIODIC:
......@@ -569,23 +620,23 @@ pdE2->Upload();
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
#ifdef DEBUG
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
else
kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
#else
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
#endif
break;
......
......@@ -38,7 +38,7 @@
#include "kCalculateGBVIAux.h"
#undef TARGET
//#define TARGET 5443
//#define TARGET 39
__global__
#if (__CUDA_ARCH__ >= 200)
......
......@@ -29,6 +29,10 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "openmm/OpenMMException.h"
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
......@@ -36,10 +40,8 @@
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "GpuFreeEnergyCudaKernels.h"
using namespace std;
struct Atom {
float x;
......@@ -126,8 +128,6 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()
}
#define TARGET 0
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
......@@ -163,18 +163,28 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()
void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
{
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
/*fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%d bornForce2_threads_per_block=%d\n",
freeEnergyGpu->freeEnergySim.nonbondedMethod,
gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, gpu->psInteractionCount->_pSysData[0] ); fflush( stderr );
*/
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreForces2 method index invalid." );
}
switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
{
case FREE_ENERGY_NO_CUTOFF:
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float3);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
#ifdef DEBUG
fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%u threadsPerBlock=%u shMem=%u\n",
freeEnergyGpu->freeEnergySim.nonbondedMethod,
gpu->sim.bornForce2_blocks, threadsPerBlock, (sizeof(Atom)+sizeof(float3))*threadsPerBlock ); fflush( stderr );
int psize = 64;
CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
......@@ -194,11 +204,11 @@ pdE1->Upload();
pdE2->Upload();
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
else
kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
pdE1->Download();
pdE2->Download();
fprintf( stderr, "Pde\n" );
......@@ -210,32 +220,37 @@ fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e
break;
#endif
switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
{
case FREE_ENERGY_NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
else
kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
break;
case FREE_ENERGY_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break;
case FREE_ENERGY_PERIODIC:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break;
}
......
......@@ -49,8 +49,11 @@ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit
{
//METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
//unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
//unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
......@@ -191,7 +194,8 @@ if( (x+j) == TARGET ){
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
//else if (flags == 0xFFFFFFFF)
else if (flags)
#endif
{
// Compute all interactions within this block.
......
......@@ -275,6 +275,7 @@ void gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psObcData)[ii].x = radius[ii] - dielectricOffset;
(*gpu->psObcData)[ii].y = scale[ii] * (*gpu->psObcData)[ii].x;
(*gpu->psPosq4)[ii].w = charge[ii];
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psNonPolarScalingFactors)[ii] = nonPolarScalingFactors[ii];
}
......@@ -302,12 +303,14 @@ void gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psObcData)[ii].x = 0.01f;
(*gpu->psObcData)[ii].y = 0.01f;
(*freeEnergyGpu->psNonPolarScalingFactors)[ii] = 0.0f;
(*gpu->psBornRadii)[ii] = 0.0f;
}
// load data to board
gpu->psObcData->Upload();
gpu->psPosq4->Upload();
gpu->psBornRadii->Upload();
freeEnergyGpu->psNonPolarScalingFactors->Upload();
return;
......@@ -330,7 +333,7 @@ void kPrintObcGbsaSoftcore( freeEnergyGpuContext freeEnergyGpu, std::string call
sigEps4->Download();
(void) fprintf( log, "BornSum Born radii & params\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
//(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e swDrv=%3.1f x[%8.3f %8.3f %8.3f %15.7f]\n",
(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e] sig/eps4[%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e\n",
ii,
......
......@@ -799,8 +799,8 @@ int main() {
#ifdef USE_SOFTCORE
DoubleVector lamda2;
lamda2.push_back( 1.0 );
lamda2.push_back( 0.5 );
lamda2.push_back( 0.0 );
//lamda2.push_back( 0.5 );
//lamda2.push_back( 0.0 );
if( lamda2.size() > 0 ){
generativeArgumentMaps["lambda2"] = lamda2;
inputArgumentMap["lambda2"] = lamda2[0];
......
......@@ -23,6 +23,7 @@
*/
#include <string.h>
#include <stdio.h>
#include <math.h>
#include <sstream>
#include <vector>
......@@ -534,7 +535,6 @@ RealOpenMM CpuGBVISoftcore::computeBornEnergy( const vector<RealOpenMM>& bornRad
--------------------------------------------------------------------------------------- */
void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, vector<RealVec>& atomCoordinates,
const RealOpenMM* partialCharges, vector<RealVec>& inputForces ){
......@@ -569,6 +569,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
// first main loop
#undef TARGET
#define TARGET -1926
RealOpenMMVector bornForcesT( numberOfAtoms, 0.0 );
for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
// partial of polar term wrt Born radius
......@@ -606,6 +610,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
bornForces[atomJ] += dGpol_dalpha2_ij*bornRadii[atomI];
if( atomJ == TARGET ){
bornForcesT[atomI] = dGpol_dalpha2_ij*bornRadii[atomI];
}
deltaX *= dGpol_dr;
deltaY *= dGpol_dr;
deltaZ *= dGpol_dr;
......@@ -620,6 +628,9 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
}
bornForces[atomI] += dGpol_dalpha2_ij*bornRadii[atomJ];
if( atomI == TARGET ){
bornForcesT[atomJ] = dGpol_dalpha2_ij*bornRadii[atomJ];
}
}
}
......@@ -634,6 +645,25 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
RealOpenMMVector& switchDeriviative = getSwitchDeriviative();
const RealOpenMMVector& bornRadiusScaleFactors = gbviParameters->getBornRadiusScaleFactors();
if( 0 ){
double bF = 0.0;
int count = 0;
fprintf( stderr, "PdeRef %d\n", TARGET );
RealOpenMM tau = static_cast<RealOpenMM>(gbviParameters->getTau());
for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
bF += bornForcesT[atomI];
if( fabs( bornForcesT[atomI] ) > 0.0 ){
count++;
fprintf( stderr, "%6d %6d bFAdd=%15.7e bR=%15.7e\n", atomI, count, tau*bornForcesT[atomI], bornRadii[atomI] );
}
}
RealOpenMM ratio = (atomicRadii[TARGET]/bornRadii[TARGET]);
RealOpenMM bF1 = bF + (three*gammaParameters[TARGET]*ratio*ratio*ratio)/bornRadii[TARGET];
RealOpenMM b2 = bornRadii[TARGET]*bornRadii[TARGET];
bF1 *= switchDeriviative[TARGET]*oneThird*b2*b2;
fprintf( stderr, "sumbF Ref %6d %15.7e %15.7e %15.7e %15.7e\n", TARGET, bF, tau*bF1, bornRadii[TARGET], switchDeriviative[TARGET] );
}
// ---------------------------------------------------------------------------------------
for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
......@@ -708,7 +738,7 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
}
//printGbvi( atomCoordinates, partialCharges, bornRadii,bornForces, forces, "GBVI softcore post loop2", stderr );
//printGbvi( atomCoordinates, partialCharges, bornRadii,bornForces, forces, "Reference: GBVI softcore post loop2", stderr );
// apply prefactor tau = (1/diel_solute - 1/diel_solvent)
......@@ -735,7 +765,8 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
--------------------------------------------------------------------------------------- */
void CpuGBVISoftcore::printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMMVector& partialCharges,
void CpuGBVISoftcore::printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates,
const RealOpenMM* partialCharges,
const RealOpenMMVector& bornRadii,
const RealOpenMMVector& bornForces,
const std::vector<OpenMM::RealVec>& forces,
......
......@@ -317,7 +317,7 @@ class CpuGBVISoftcore {
--------------------------------------------------------------------------------------- */
void printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMMVector& partialCharges,
void printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMM* partialCharges,
const RealOpenMMVector& bornRadii,
const RealOpenMMVector& bornForces,
const std::vector<OpenMM::RealVec>& forces,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment