Commit 09c1460d authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Threads per block for GB/VI kernels take into account required shared memory

Padded atom entries in Born radii memory now initialized to zero
launch bounds guards now included for several kernels
parent dbea5152
...@@ -255,6 +255,29 @@ void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){ ...@@ -255,6 +255,29 @@ void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){
} }
/**---------------------------------------------------------------------------------------
Get threads/block
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerBlock shared memory/block
@return threadsPerBlock
--------------------------------------------------------------------------------------- */
extern "C"
unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
{
unsigned int grid = freeEnergyGpu->gpuContext->grid;
unsigned int threadsPerBlock = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
threadsPerBlock = threadsPerBlock < 1 ? 1 : threadsPerBlock;
threadsPerBlock *= grid;
return threadsPerBlock;
}
static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){ static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){
*x = cellCode >> 17; *x = cellCode >> 17;
*y = (cellCode >> 2 ) & 0x7FFF; *y = (cellCode >> 2 ) & 0x7FFF;
...@@ -275,8 +298,9 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin ...@@ -275,8 +298,9 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin
//unsigned int numWorkUnits = cSim.pInteractionCount[0]; //unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int numWorkUnits = gpu->psInteractionCount->_pSysData[0]; unsigned int numWorkUnits = gpu->psInteractionCount->_pSysData[0];
(void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u\n", (void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u warpOutput=%d\n",
totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID, numWorkUnits ); totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID,
numWorkUnits, gpu->bOutputBufferPerWarp );
unsigned int maxPrint = 3; unsigned int maxPrint = 3;
std::stringstream message; std::stringstream message;
...@@ -309,8 +333,13 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin ...@@ -309,8 +333,13 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin
x *= GRID; x *= GRID;
y *= GRID; y *= GRID;
unsigned int bufferX = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (x >> GRIDBITS) * gpu->sim.stride;
unsigned int bufferY = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (y >> GRIDBITS) * gpu->sim.stride;
bufferX /= gpu->sim.paddedNumberOfAtoms;
bufferY /= gpu->sim.paddedNumberOfAtoms;
if( jj == 1 ){ if( jj == 1 ){
(void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] ", ii, jj, warp, pos, end ); //if( bufferX >= 62 || bufferY >= 62 ){
(void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] bufXY[%4u %4u]", ii, jj, warp, pos, end, bufferX, bufferY );
message << buffer; message << buffer;
(void) sprintf( buffer, " x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion ); (void) sprintf( buffer, " x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion );
message << buffer; message << buffer;
......
...@@ -56,5 +56,6 @@ typedef struct _freeEnergyGpuContext *freeEnergyGpuContext; ...@@ -56,5 +56,6 @@ typedef struct _freeEnergyGpuContext *freeEnergyGpuContext;
extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu ); extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu );
extern "C" void freeEnergyGpuShutDown(freeEnergyGpuContext gpu); extern "C" void freeEnergyGpuShutDown(freeEnergyGpuContext gpu);
extern "C" void freeEnergyGpuSetConstants(freeEnergyGpuContext gpu); extern "C" void freeEnergyGpuSetConstants(freeEnergyGpuContext gpu);
extern "C" unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );
#endif // __FREE_ENERGY_GPUTYPES_H__ #endif // __FREE_ENERGY_GPUTYPES_H__
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "kernels/cudatypes.h" #include "kernels/cudatypes.h"
#include "kernels/cudaKernels.h" #include "kernels/cudaKernels.h"
#include "GpuFreeEnergyCudaKernels.h" #include "GpuFreeEnergyCudaKernels.h"
#include "openmm/OpenMMException.h"
#include <stdio.h> #include <stdio.h>
#include <cuda.h> #include <cuda.h>
...@@ -36,6 +37,7 @@ ...@@ -36,6 +37,7 @@
#include <sstream> #include <sstream>
#define USE_SOFTCORE_LJ #define USE_SOFTCORE_LJ
//#define DEBUG
struct Atom { struct Atom {
float x; float x;
...@@ -104,6 +106,7 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu ...@@ -104,6 +106,7 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu
#define METHOD_NAME(a, b) a##PeriodicByWarp##b #define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h" #include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
/** /**
* *
* Calculate Born radii and first GBSA loop forces/energy * Calculate Born radii and first GBSA loop forces/energy
...@@ -113,16 +116,29 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu ...@@ -113,16 +116,29 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu
*/ */
void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext freeEnergyGpu ) void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext freeEnergyGpu )
{ {
// printf("kCalculateCDLJObcGbsaForces1\n"); unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext; gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateCDLJObcGbsaSoftcoreForces1 method index invalid." );
}
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
//fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e blks=%u thread/.block=%u nbMethod==%d warp=%u\n",
// gpu->sim.nonbondedCutoffSqr, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
// freeEnergyGpu->freeEnergySim.nonbondedMethod, gpu->bOutputBufferPerWarp);
//#define DEBUG
#ifdef DEBUG #ifdef DEBUG
fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr ); fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 blks=%u thread/block=%u %u shMem=%u nbMethod==%d warp=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->sim.nonbond_threads_per_block, sizeof(Atom)*threadsPerBlock,
freeEnergyGpu->freeEnergySim.nonbondedMethod, gpu->bOutputBufferPerWarp);
int psize = gpu->sim.paddedNumberOfAtoms; int psize = gpu->sim.paddedNumberOfAtoms;
CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE"); CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE"); CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
...@@ -133,12 +149,12 @@ float atomicRadii; ...@@ -133,12 +149,12 @@ float atomicRadii;
showWorkUnitsFreeEnergy( freeEnergyGpu, 1 ); showWorkUnitsFreeEnergy( freeEnergyGpu, 1 );
for( int ii = 0; ii < psize; ii++ ){ for( int ii = 0; ii < psize; ii++ ){
pdE1->_pSysData[ii].x = 0.001f; pdE1->_pSysData[ii].x = 0.0f;
pdE1->_pSysData[ii].y = 0.001f; pdE1->_pSysData[ii].y = 0.001f;
pdE1->_pSysData[ii].z = 0.001f; pdE1->_pSysData[ii].z = 0.001f;
pdE1->_pSysData[ii].w = 0.001f; pdE1->_pSysData[ii].w = 0.001f;
pdE2->_pSysData[ii].x = 0.001f; pdE2->_pSysData[ii].x = 0.0f;
pdE2->_pSysData[ii].y = 0.001f; pdE2->_pSysData[ii].y = 0.001f;
pdE2->_pSysData[ii].z = 0.001f; pdE2->_pSysData[ii].z = 0.001f;
pdE2->_pSysData[ii].w = 0.001f; pdE2->_pSysData[ii].w = 0.001f;
...@@ -153,19 +169,23 @@ pdE2->Upload(); ...@@ -153,19 +169,23 @@ pdE2->Upload();
// use softcore LJ potential // use softcore LJ potential
#ifdef DEBUG #ifdef DEBUG
(void) fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 ver=%u blks=%u threadsPerBlock=%u shMem=%u %u wrp=%u\n", gpu->sm_version,
gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Atom)*threadsPerBlock, gpu->sharedMemoryPerBlock, gpu->bOutputBufferPerWarp ); fflush( stderr );
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit,pdE1->_pDevData, pdE2->_pDevData); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit,pdE1->_pDevData, pdE2->_pDevData);
else else
kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
#else #else
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit ); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
else else
kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit ); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
#endif #endif
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreForces1");
...@@ -174,52 +194,102 @@ pdE2->Upload(); ...@@ -174,52 +194,102 @@ pdE2->Upload();
case FREE_ENERGY_CUTOFF: case FREE_ENERGY_CUTOFF:
#ifdef DEBUG #ifdef DEBUG
(void) fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreCutoffForces1 %6d blks=%u nonbond_threads_per_block=%5u shMem=%5u\n",
gpu->natoms, gpu->sim.nonbond_blocks, threadsPerBlock, (sizeof(Atom)+sizeof(float))*threadsPerBlock);
(void) fflush( stderr );
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
else else
kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
#else #else
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else else
kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
#endif #endif
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreCutoffForces1");
break; break;
case FREE_ENERGY_PERIODIC: case FREE_ENERGY_PERIODIC:
#ifdef DEBUG
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
else else
kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
#else
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
#endif
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcorePeriodicForces1"); LAUNCHERROR("kCalculateCDLJObcGbsaSoftcorePeriodicForces1");
break; break;
} }
#ifdef DEBUG #ifdef DEBUG
/*
gpu->psBornForce->Download();
freeEnergyGpu->psSwitchDerivative->Download();
gpu->psBornRadii->Download();
fprintf( stderr, "XX bR=%15.7e swd=%15.7e\n", gpu->psBornRadii->_pSysData[0], freeEnergyGpu->psSwitchDerivative->_pSysData[0] );
for( int ii = 0; ii < gpu->sim.nonbondOutputBuffers; ii++ ){
fprintf( stderr, "strx %4d %15.7e %15.7e %15.7e %15.7e\n", ii,
gpu->psBornForce->_pSysStream[ii][0],
gpu->psBornForce->_pSysStream[ii][1],
gpu->psBornForce->_pSysStream[ii][2],
gpu->psBornForce->_pSysStream[ii][3] );
if( gpu->natoms > 1984 ){
int idx = 1983;
fprintf( stderr, "stry %4d %15.7e %15.7e %15.7e %15.7e %5d\n", ii,
gpu->psBornForce->_pSysStream[ii][idx+0],
gpu->psBornForce->_pSysStream[ii][idx+1],
gpu->psBornForce->_pSysStream[ii][idx+2],
gpu->psBornForce->_pSysStream[ii][idx+3], idx );
}
}
int bufferI = 62;
if( bufferI < gpu->sim.nonbondOutputBuffers ){
fprintf( stderr, "BufferI %4d \n", bufferI );
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
fprintf( stderr, "strz %4d %15.7e\n", ii, gpu->psBornForce->_pSysStream[bufferI][ii] );
}
}
*/
pdE1->Download(); pdE1->Download();
pdE2->Download(); pdE2->Download();
gpu->psPosq4->Download(); gpu->psPosq4->Download();
gpu->psGBVIData->Download(); gpu->psGBVIData->Download();
gpu->psBornRadii->Download(); gpu->psBornRadii->Download();
gpu->psBornForce->Download();
freeEnergyGpu->psSwitchDerivative->Download(); freeEnergyGpu->psSwitchDerivative->Download();
fprintf( stderr, "PdeCud %d\n", TARGET ); fprintf( stderr, "PdeCud %d\n", TARGET );
bF = 0.0; bF = 0.0;
for( int ii = 0; ii < gpu->natoms; ii++ ){ int count =0;
for( int ii = 0; ii < psize; ii++ ){
bF += pdE1->_pSysData[ii].x; bF += pdE1->_pSysData[ii].x;
if( fabs( pdE1->_pSysData[ii].w ) > 1.0e-03 ){ if( fabs( pdE1->_pSysData[ii].w ) > 1.0e-03 && fabs( pdE1->_pSysData[ii].x ) > 0.0 ){
fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e\n", ii, count++;
fprintf( stderr, "%4d %4d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e bF=%15.7e swd=%15.7e\n", count, ii,
pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w, pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w,
pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w ); pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w, gpu->psBornForce->_pSysData[ii],
freeEnergyGpu->psSwitchDerivative->_pSysData[ii] );
} }
} }
bR = gpu->psBornRadii->_pSysData[TARGET]; bR = gpu->psBornRadii->_pSysData[TARGET];
...@@ -228,7 +298,7 @@ ratio = (atomicRadii/bR); ...@@ -228,7 +298,7 @@ ratio = (atomicRadii/bR);
bF1 = bF + (3.0f*gpu->psGBVIData->_pSysData[TARGET].z*ratio*ratio*ratio)/bR; bF1 = bF + (3.0f*gpu->psGBVIData->_pSysData[TARGET].z*ratio*ratio*ratio)/bR;
b2 = bR*bR; b2 = bR*bR;
bF1 *= (1.0f/3.0f)*b2*b2; bF1 *= (1.0f/3.0f)*b2*b2;
fprintf( stderr, "sumbF Cud %6d %15.7e %15.7e %15.7e\n", TARGET, bF, bF1, bR); fprintf( stderr, "sumbF Cud %6d count=%d %15.7e %15.7e %15.7e\n", TARGET, count, bF, bF1, bR);
#endif #endif
} }
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
#endif #endif
#undef TARGET #undef TARGET
//#define TARGET 0 #define TARGET 1926
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
...@@ -47,12 +47,18 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -47,12 +47,18 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
#ifdef DEBUG
void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
#else
void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit ) void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit )
#endif
{ {
//void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
//unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
//unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
...@@ -166,23 +172,22 @@ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* wo ...@@ -166,23 +172,22 @@ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* wo
af.w += dGpol_dalpha2_ij * psA[j].br; af.w += dGpol_dalpha2_ij * psA[j].br;
/* #ifdef DEBUG
int jIdx = j; int jIdx = j;
if( i == TARGET ){ if( i == TARGET ){
int tjj = y+jIdx; int tjj = y+jIdx;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br; pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2); pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = (q2 * psA[j].q) / denominator;
pdE1[tjj].w = 1.0f; pdE1[tjj].w = 1.0f;
} }
if( (y+jIdx) == TARGET ){ if( (y+jIdx) == TARGET ){
int tjj = i; int tjj = i;
pdE1[tjj].x = dEdR - Gpol * (1.0f - 0.25f * expTerm); pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = r2; pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
pdE1[tjj].w = -1.0f; pdE1[tjj].w = -1.0f;
} */ }
#endif
energy += 0.5f*CDLJObcGbsa_energy; energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces // Add Forces
...@@ -248,7 +253,6 @@ pdE1[tjj].w = -1.0f; ...@@ -248,7 +253,6 @@ pdE1[tjj].w = -1.0f;
dEdR = 0.0f; dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
} }
//float dEdRx = dEdR;
// ObcGbsaForce1 part // ObcGbsaForce1 part
...@@ -271,28 +275,23 @@ pdE1[tjj].w = -1.0f; ...@@ -271,28 +275,23 @@ pdE1[tjj].w = -1.0f;
dEdR = 0.0f; dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
//dEdRx = 0.0f;
} }
/* #ifdef DEBUG
int jIdx = j; int jIdx = j;
if( i == TARGET ){ if( i == TARGET ){
int tjj = (y+jIdx); int tjj = (y+jIdx);
pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br; pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = (q2 * psA[j].q) / denominator;
pdE1[tjj].x = dEdRx;
pdE1[tjj].y = sqrt(r2); pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
pdE1[tjj].w = 2.0f; pdE1[tjj].w = 2.0f;
} }
if( (y+jIdx) == TARGET ){ if( (y+jIdx) == TARGET ){
int tjj = i; int tjj = i;
pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br; pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2); pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = (q2 * psA[j].q) / denominator;
pdE1[tjj].w = -2.0f; pdE1[tjj].w = -2.0f;
} */ }
#endif
af.w += dGpol_dalpha2_ij * psA[j].br; af.w += dGpol_dalpha2_ij * psA[j].br;
energy += 0.5f*CDLJObcGbsa_energy; energy += 0.5f*CDLJObcGbsa_energy;
...@@ -361,7 +360,8 @@ pdE1[tjj].w = -2.0f; ...@@ -361,7 +360,8 @@ pdE1[tjj].w = -2.0f;
{ {
// No interactions in this block. // No interactions in this block.
} }
else if (flags == 0xFFFFFFFF) //else if (flags == 0xFFFFFFFF)
else if (flags)
#endif #endif
{ {
// Compute all interactions within this block. // Compute all interactions within this block.
...@@ -401,7 +401,6 @@ pdE1[tjj].w = -2.0f; ...@@ -401,7 +401,6 @@ pdE1[tjj].w = -2.0f;
CDLJObcGbsa_energy += factorX; CDLJObcGbsa_energy += factorX;
#endif #endif
dEdR *= invR * invR; dEdR *= invR * invR;
//float dEdRx = dEdR;
// ObcGbsaForce1 part // ObcGbsaForce1 part
...@@ -423,7 +422,6 @@ pdE1[tjj].w = -2.0f; ...@@ -423,7 +422,6 @@ pdE1[tjj].w = -2.0f;
dEdR = 0.0f; dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
//dEdRx = 0.0f;
} }
psA[tj].fb += dGpol_dalpha2_ij * br; psA[tj].fb += dGpol_dalpha2_ij * br;
af.w += dGpol_dalpha2_ij * psA[tj].br; af.w += dGpol_dalpha2_ij * psA[tj].br;
...@@ -431,22 +429,21 @@ pdE1[tjj].w = -2.0f; ...@@ -431,22 +429,21 @@ pdE1[tjj].w = -2.0f;
/* #ifdef DEBUG
int jIdx = tj; int jIdx = tj;
if( i == TARGET ){ if( i == TARGET ){
int tjj = y+jIdx; int tjj = y+jIdx;
pdE1[tjj].x = dEdRx; pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
pdE1[tjj].y = sqrt(r2); pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = -dEdRx*dz;
pdE1[tjj].w = 3.0f; pdE1[tjj].w = 3.0f;
} }
if( (y+jIdx) == TARGET ){ if( (y+jIdx) == TARGET ){
int tjj = i; int tjj = i;
pdE1[tjj].x = dEdRx; pdE1[tjj].x = dGpol_dalpha2_ij * br;
pdE1[tjj].y = sqrt(r2); pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = dEdRx*dz;
pdE1[tjj].w = -3.0f; pdE1[tjj].w = -3.0f;
} */ }
#endif
...@@ -547,22 +544,21 @@ pdE1[tjj].w = -3.0f; ...@@ -547,22 +544,21 @@ pdE1[tjj].w = -3.0f;
if (tgx == 0) if (tgx == 0)
psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16]; psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
/* #ifdef DEBUG
int jIdx = j; int jIdx = j;
if( i == TARGET ){ if( i == TARGET ){
int tjj = y+jIdx; int tjj = y+jIdx;
pdE1[tjj].x = dEdR; pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
pdE1[tjj].y = r2; pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator; pdE1[tjj].w = 4.0f;
pdE1[tjj].w = -4.7f;
} }
if( (y+jIdx) == TARGET ){ if( (y+jIdx) == TARGET ){
int tjj = i; int tjj = i;
pdE1[tjj].x = dEdR; pdE1[tjj].x = tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
pdE1[tjj].y = r2; pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator; pdE1[tjj].w = -4.0f;
pdE1[tjj].w = -4.7f; }
} */ #endif
...@@ -658,7 +654,7 @@ pdE1[tjj].w = -4.7f; ...@@ -658,7 +654,7 @@ pdE1[tjj].w = -4.7f;
dEdR = 0.0f; dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
} }
//float dEdRx = dEdR;
// ObcGbsaForce1 part // ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br; float alpha2_ij = br * psA[tj].br;
...@@ -679,25 +675,25 @@ pdE1[tjj].w = -4.7f; ...@@ -679,25 +675,25 @@ pdE1[tjj].w = -4.7f;
dEdR = 0.0f; dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f; CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
//dEdRx = 0.0;
} }
/* #ifdef DEBUG
int jIdx = tj; int jIdx = tj;
if( i == TARGET ){ if( i == TARGET ){
int tjj = y+jIdx; int tjj = y+jIdx;
pdE1[tjj].x = dEdRx; pdE1[tjj].x = dGpol_dalpha2_ij * psA[tj].br;
pdE1[tjj].y = sqrt(r2); pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = dEdRx*dz; pdE1[tjj].z = dGpol_dalpha2_ij;
pdE1[tjj].w = 6.0f; pdE1[tjj].w = 6.0f;
} }
if( (y+jIdx) == TARGET ){ if( (y+jIdx) == TARGET ){
int tjj = i; int tjj = i;
pdE1[tjj].x = dEdRx; pdE1[tjj].x = dGpol_dalpha2_ij * br;
pdE1[tjj].y = sqrt(r2); pdE1[tjj].y = sqrt(r2);
pdE1[tjj].z = dEdRx*dz; pdE1[tjj].z = dGpol_dalpha2_ij;
pdE1[tjj].w = -6.0f; pdE1[tjj].w = -6.0f;
} */ }
#endif
......
...@@ -67,7 +67,7 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn ...@@ -67,7 +67,7 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
static const float electricConstant = -166.02691f; static const float electricConstant = -166.02691f;
double tau = ((1.0f/innerDielectric)-(1.0f/solventDielectric)); double tau = ((1.0f/innerDielectric)-(1.0f/solventDielectric));
freeEnergyGpu->psSwitchDerivative = new CUDAStream<float>( numberOfParticles, 1, "SwitchDerivative"); freeEnergyGpu->psSwitchDerivative = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "SwitchDerivative");
freeEnergyGpu->freeEnergySim.pSwitchDerivative = freeEnergyGpu->psSwitchDerivative->_pDevData; freeEnergyGpu->freeEnergySim.pSwitchDerivative = freeEnergyGpu->psSwitchDerivative->_pDevData;
// create gpuGBVISoftcore, load parameters, and track minimum softcore value // create gpuGBVISoftcore, load parameters, and track minimum softcore value
...@@ -92,6 +92,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn ...@@ -92,6 +92,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psGBVIData)[ii].y = scaledRadii[ii]; (*gpu->psGBVIData)[ii].y = scaledRadii[ii];
(*gpu->psGBVIData)[ii].z = tau*gamma[ii]; (*gpu->psGBVIData)[ii].z = tau*gamma[ii];
(*gpu->psGBVIData)[ii].w = bornRadiusScaleFactors[ii]; (*gpu->psGBVIData)[ii].w = bornRadiusScaleFactors[ii];
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
} }
// Dummy out extra atom data // Dummy out extra atom data
...@@ -101,6 +103,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn ...@@ -101,6 +103,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psGBVIData)[ii].y = 0.01f; (*gpu->psGBVIData)[ii].y = 0.01f;
(*gpu->psGBVIData)[ii].z = 0.0f; (*gpu->psGBVIData)[ii].z = 0.0f;
(*gpu->psGBVIData)[ii].w = 0.0f; (*gpu->psGBVIData)[ii].w = 0.0f;
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
} }
gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor; gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
...@@ -141,6 +145,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn ...@@ -141,6 +145,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
} }
gpu->psGBVIData->Upload(); gpu->psGBVIData->Upload();
gpu->psBornRadii->Upload();
freeEnergyGpu->psSwitchDerivative->Upload();
return; return;
} }
...@@ -170,7 +176,15 @@ void kClearGBVISoftcoreBornSum(gpuContext gpu) { ...@@ -170,7 +176,15 @@ void kClearGBVISoftcoreBornSum(gpuContext gpu) {
kClearGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, 384>>>(); kClearGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, 384>>>();
} }
__global__ void kReduceGBVISoftcoreBornForces_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVISoftcoreBornForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f; float energy = 0.0f;
...@@ -233,7 +247,15 @@ void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext freeEnergyGpu ) ...@@ -233,7 +247,15 @@ void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext freeEnergyGpu )
} }
__global__ void kReduceGBVISoftcoreBornSum_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVISoftcoreBornSum_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -362,7 +384,15 @@ void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext freeEnergyGpu ) ...@@ -362,7 +384,15 @@ void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel"); LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel");
} }
__global__ void kReduceGBVIBornForcesQuinticScaling_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVIBornForcesQuinticScaling_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f; float energy = 0.0f;
...@@ -418,9 +448,12 @@ __global__ void kReduceGBVIBornForcesQuinticScaling_kernel() ...@@ -418,9 +448,12 @@ __global__ void kReduceGBVIBornForcesQuinticScaling_kernel()
void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext freeEnergyGpu ) void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
{ {
//printf("kReduceObcGbsaBornForces\n");
gpuContext gpu = freeEnergyGpu->gpuContext; gpuContext gpu = freeEnergyGpu->gpuContext;
kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
//(void) fprintf( stderr, "kReduceObcGbsaBornForces %6d blks=%u bsf_reduce_threads_per_block=%5u %5u %5u %5u %5u\n",
// gpu->natoms, gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block, gpu->sim.bf_reduce_threads_per_block,
// GF1XX_THREADS_PER_BLOCK, GT2XX_THREADS_PER_BLOCK, G8X_THREADS_PER_BLOCK); fflush( stderr );
kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling"); LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling");
} }
...@@ -441,7 +474,7 @@ void kPrintGBVISoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId, ...@@ -441,7 +474,7 @@ void kPrintGBVISoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId,
sigEps4->Download(); sigEps4->Download();
(void) fprintf( log, "kPrintGBViSoftcore Cuda comp bR bF swd prm sigeps4\n" ); (void) fprintf( log, "kPrintGBViSoftcore Cuda comp bR bF swd prm sigeps4\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n", (void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n",
ii, ii,
gpu->psBornRadii->_pSysData[ii], gpu->psBornRadii->_pSysData[ii],
...@@ -474,10 +507,28 @@ extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int ...@@ -474,10 +507,28 @@ extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int
void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu ) void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
{ {
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext; gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreBornSum method index invalid." );
}
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
#ifdef DEBUG #ifdef DEBUG
fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr ); fprintf( stderr, "kCalculateGBVISoftcoreBornSum blks=%u threadsPerBlock=%u %u shMem=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->sim.nonbond_threads_per_block, (sizeof(Atom)+sizeof(float))*threadsPerBlock ); fflush( stderr );
int psize = gpu->sim.paddedNumberOfAtoms; int psize = gpu->sim.paddedNumberOfAtoms;
CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE"); CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE"); CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
...@@ -511,19 +562,19 @@ pdE2->Upload(); ...@@ -511,19 +562,19 @@ pdE2->Upload();
#ifdef DEBUG #ifdef DEBUG
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
} else { } else {
kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
} }
#else #else
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
} else { } else {
kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
} }
#endif #endif
...@@ -536,31 +587,31 @@ pdE2->Upload(); ...@@ -536,31 +587,31 @@ pdE2->Upload();
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>(); kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff"); LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount); compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
#ifdef DEBUG #ifdef DEBUG
(void) fprintf( stderr, "kCalculateGBVISoftcoreBornSum cutoff=%15.7e warp=%u GridBoundingBox.length=%u interaction_blocks=%u interaction_threads_per_block=%u nonbond_blocks=%u nonbond_threads_per_block=%u\n", (void) fprintf( stderr, "kCalculateGBVISoftcoreBornSum cutoff=%15.7e warp=%u GridBoundingBox.length=%u interaction_blocks=%u interaction_threads_per_block=%u nonbond_blocks=%u nonbond_threads_per_block=%u\n",
gpu->sim.nonbondedCutoffSqr, gpu->bOutputBufferPerWarp, gpu->psGridBoundingBox->_length, gpu->sim.interaction_blocks, gpu->sim.nonbondedCutoffSqr, gpu->bOutputBufferPerWarp, gpu->psGridBoundingBox->_length, gpu->sim.interaction_blocks,
gpu->sim.interaction_threads_per_block, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block ); fflush( stderr ); gpu->sim.interaction_threads_per_block, gpu->sim.nonbond_blocks, threadsPerBlock ); fflush( stderr );
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
else else
kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
#else #else
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
else else
kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break;
#endif #endif
break;
case FREE_ENERGY_PERIODIC: case FREE_ENERGY_PERIODIC:
...@@ -569,23 +620,23 @@ pdE2->Upload(); ...@@ -569,23 +620,23 @@ pdE2->Upload();
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>(); kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic"); LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount); compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit); sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
#ifdef DEBUG #ifdef DEBUG
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
else else
kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
#else #else
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else else
kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
#endif #endif
break; break;
......
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
#include "kCalculateGBVIAux.h" #include "kCalculateGBVIAux.h"
#undef TARGET #undef TARGET
//#define TARGET 5443 //#define TARGET 39
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
......
...@@ -29,6 +29,10 @@ ...@@ -29,6 +29,10 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "openmm/OpenMMException.h"
#include <stdio.h> #include <stdio.h>
#include <cuda.h> #include <cuda.h>
#include <vector_functions.h> #include <vector_functions.h>
...@@ -36,10 +40,8 @@ ...@@ -36,10 +40,8 @@
#include <string> #include <string>
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
using namespace std;
#include "gputypes.h" using namespace std;
#include "GpuFreeEnergyCudaKernels.h"
struct Atom { struct Atom {
float x; float x;
...@@ -126,8 +128,6 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel() ...@@ -126,8 +128,6 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()
} }
#define TARGET 0
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b #define METHOD_NAME(a, b) a##N2##b
...@@ -163,18 +163,28 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel() ...@@ -163,18 +163,28 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()
void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu ) void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
{ {
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext; gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
/*fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%d bornForce2_threads_per_block=%d\n", if( methodIndex > 2 ){
freeEnergyGpu->freeEnergySim.nonbondedMethod, throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreForces2 method index invalid." );
gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, gpu->psInteractionCount->_pSysData[0] ); fflush( stderr ); }
*/
switch (freeEnergyGpu->freeEnergySim.nonbondedMethod) if( natoms[methodIndex] != gpu->natoms ){
{ unsigned int extra = methodIndex == 0 ? 0 : sizeof(float3);
case FREE_ENERGY_NO_CUTOFF: threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
#ifdef DEBUG #ifdef DEBUG
fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%u threadsPerBlock=%u shMem=%u\n",
freeEnergyGpu->freeEnergySim.nonbondedMethod,
gpu->sim.bornForce2_blocks, threadsPerBlock, (sizeof(Atom)+sizeof(float3))*threadsPerBlock ); fflush( stderr );
int psize = 64; int psize = 64;
CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE"); CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE"); CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
...@@ -194,11 +204,11 @@ pdE1->Upload(); ...@@ -194,11 +204,11 @@ pdE1->Upload();
pdE2->Upload(); pdE2->Upload();
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
else else
kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
pdE1->Download(); pdE1->Download();
pdE2->Download(); pdE2->Download();
fprintf( stderr, "Pde\n" ); fprintf( stderr, "Pde\n" );
...@@ -210,32 +220,37 @@ fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e ...@@ -210,32 +220,37 @@ fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e
break; break;
#endif #endif
switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
{
case FREE_ENERGY_NO_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
else else
kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit); sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
break; break;
case FREE_ENERGY_CUTOFF: case FREE_ENERGY_CUTOFF:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else else
kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break; break;
case FREE_ENERGY_PERIODIC: case FREE_ENERGY_PERIODIC:
if (gpu->bOutputBufferPerWarp) if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else else
kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit ); (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break; break;
} }
......
...@@ -49,8 +49,11 @@ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit ...@@ -49,8 +49,11 @@ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit
{ {
//METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 ) //METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
//unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
//unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
...@@ -191,7 +194,8 @@ if( (x+j) == TARGET ){ ...@@ -191,7 +194,8 @@ if( (x+j) == TARGET ){
{ {
// No interactions in this block. // No interactions in this block.
} }
else if (flags == 0xFFFFFFFF) //else if (flags == 0xFFFFFFFF)
else if (flags)
#endif #endif
{ {
// Compute all interactions within this block. // Compute all interactions within this block.
......
...@@ -275,6 +275,7 @@ void gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn ...@@ -275,6 +275,7 @@ void gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psObcData)[ii].x = radius[ii] - dielectricOffset; (*gpu->psObcData)[ii].x = radius[ii] - dielectricOffset;
(*gpu->psObcData)[ii].y = scale[ii] * (*gpu->psObcData)[ii].x; (*gpu->psObcData)[ii].y = scale[ii] * (*gpu->psObcData)[ii].x;
(*gpu->psPosq4)[ii].w = charge[ii]; (*gpu->psPosq4)[ii].w = charge[ii];
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psNonPolarScalingFactors)[ii] = nonPolarScalingFactors[ii]; (*freeEnergyGpu->psNonPolarScalingFactors)[ii] = nonPolarScalingFactors[ii];
} }
...@@ -302,12 +303,14 @@ void gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn ...@@ -302,12 +303,14 @@ void gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
(*gpu->psObcData)[ii].x = 0.01f; (*gpu->psObcData)[ii].x = 0.01f;
(*gpu->psObcData)[ii].y = 0.01f; (*gpu->psObcData)[ii].y = 0.01f;
(*freeEnergyGpu->psNonPolarScalingFactors)[ii] = 0.0f; (*freeEnergyGpu->psNonPolarScalingFactors)[ii] = 0.0f;
(*gpu->psBornRadii)[ii] = 0.0f;
} }
// load data to board // load data to board
gpu->psObcData->Upload(); gpu->psObcData->Upload();
gpu->psPosq4->Upload(); gpu->psPosq4->Upload();
gpu->psBornRadii->Upload();
freeEnergyGpu->psNonPolarScalingFactors->Upload(); freeEnergyGpu->psNonPolarScalingFactors->Upload();
return; return;
...@@ -330,7 +333,7 @@ void kPrintObcGbsaSoftcore( freeEnergyGpuContext freeEnergyGpu, std::string call ...@@ -330,7 +333,7 @@ void kPrintObcGbsaSoftcore( freeEnergyGpuContext freeEnergyGpu, std::string call
sigEps4->Download(); sigEps4->Download();
(void) fprintf( log, "BornSum Born radii & params\n" ); (void) fprintf( log, "BornSum Born radii & params\n" );
for( int ii = 0; ii < gpu->natoms; ii++ ){ for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
//(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e swDrv=%3.1f x[%8.3f %8.3f %8.3f %15.7f]\n", //(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e swDrv=%3.1f x[%8.3f %8.3f %8.3f %15.7f]\n",
(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e] sig/eps4[%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e\n", (void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e] sig/eps4[%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e\n",
ii, ii,
......
...@@ -799,8 +799,8 @@ int main() { ...@@ -799,8 +799,8 @@ int main() {
#ifdef USE_SOFTCORE #ifdef USE_SOFTCORE
DoubleVector lamda2; DoubleVector lamda2;
lamda2.push_back( 1.0 ); lamda2.push_back( 1.0 );
lamda2.push_back( 0.5 ); //lamda2.push_back( 0.5 );
lamda2.push_back( 0.0 ); //lamda2.push_back( 0.0 );
if( lamda2.size() > 0 ){ if( lamda2.size() > 0 ){
generativeArgumentMaps["lambda2"] = lamda2; generativeArgumentMaps["lambda2"] = lamda2;
inputArgumentMap["lambda2"] = lamda2[0]; inputArgumentMap["lambda2"] = lamda2[0];
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
*/ */
#include <string.h> #include <string.h>
#include <stdio.h>
#include <math.h> #include <math.h>
#include <sstream> #include <sstream>
#include <vector> #include <vector>
...@@ -534,7 +535,6 @@ RealOpenMM CpuGBVISoftcore::computeBornEnergy( const vector<RealOpenMM>& bornRad ...@@ -534,7 +535,6 @@ RealOpenMM CpuGBVISoftcore::computeBornEnergy( const vector<RealOpenMM>& bornRad
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, vector<RealVec>& atomCoordinates, void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, vector<RealVec>& atomCoordinates,
const RealOpenMM* partialCharges, vector<RealVec>& inputForces ){ const RealOpenMM* partialCharges, vector<RealVec>& inputForces ){
...@@ -569,6 +569,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve ...@@ -569,6 +569,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
// first main loop // first main loop
#undef TARGET
#define TARGET -1926
RealOpenMMVector bornForcesT( numberOfAtoms, 0.0 );
for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){ for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
// partial of polar term wrt Born radius // partial of polar term wrt Born radius
...@@ -606,6 +610,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve ...@@ -606,6 +610,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
bornForces[atomJ] += dGpol_dalpha2_ij*bornRadii[atomI]; bornForces[atomJ] += dGpol_dalpha2_ij*bornRadii[atomI];
if( atomJ == TARGET ){
bornForcesT[atomI] = dGpol_dalpha2_ij*bornRadii[atomI];
}
deltaX *= dGpol_dr; deltaX *= dGpol_dr;
deltaY *= dGpol_dr; deltaY *= dGpol_dr;
deltaZ *= dGpol_dr; deltaZ *= dGpol_dr;
...@@ -620,6 +628,9 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve ...@@ -620,6 +628,9 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
} }
bornForces[atomI] += dGpol_dalpha2_ij*bornRadii[atomJ]; bornForces[atomI] += dGpol_dalpha2_ij*bornRadii[atomJ];
if( atomI == TARGET ){
bornForcesT[atomJ] = dGpol_dalpha2_ij*bornRadii[atomJ];
}
} }
} }
...@@ -634,6 +645,25 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve ...@@ -634,6 +645,25 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
RealOpenMMVector& switchDeriviative = getSwitchDeriviative(); RealOpenMMVector& switchDeriviative = getSwitchDeriviative();
const RealOpenMMVector& bornRadiusScaleFactors = gbviParameters->getBornRadiusScaleFactors(); const RealOpenMMVector& bornRadiusScaleFactors = gbviParameters->getBornRadiusScaleFactors();
if( 0 ){
double bF = 0.0;
int count = 0;
fprintf( stderr, "PdeRef %d\n", TARGET );
RealOpenMM tau = static_cast<RealOpenMM>(gbviParameters->getTau());
for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
bF += bornForcesT[atomI];
if( fabs( bornForcesT[atomI] ) > 0.0 ){
count++;
fprintf( stderr, "%6d %6d bFAdd=%15.7e bR=%15.7e\n", atomI, count, tau*bornForcesT[atomI], bornRadii[atomI] );
}
}
RealOpenMM ratio = (atomicRadii[TARGET]/bornRadii[TARGET]);
RealOpenMM bF1 = bF + (three*gammaParameters[TARGET]*ratio*ratio*ratio)/bornRadii[TARGET];
RealOpenMM b2 = bornRadii[TARGET]*bornRadii[TARGET];
bF1 *= switchDeriviative[TARGET]*oneThird*b2*b2;
fprintf( stderr, "sumbF Ref %6d %15.7e %15.7e %15.7e %15.7e\n", TARGET, bF, tau*bF1, bornRadii[TARGET], switchDeriviative[TARGET] );
}
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){ for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
...@@ -708,7 +738,7 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve ...@@ -708,7 +738,7 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
} }
//printGbvi( atomCoordinates, partialCharges, bornRadii,bornForces, forces, "GBVI softcore post loop2", stderr ); //printGbvi( atomCoordinates, partialCharges, bornRadii,bornForces, forces, "Reference: GBVI softcore post loop2", stderr );
// apply prefactor tau = (1/diel_solute - 1/diel_solvent) // apply prefactor tau = (1/diel_solute - 1/diel_solvent)
...@@ -735,7 +765,8 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve ...@@ -735,7 +765,8 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void CpuGBVISoftcore::printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMMVector& partialCharges, void CpuGBVISoftcore::printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates,
const RealOpenMM* partialCharges,
const RealOpenMMVector& bornRadii, const RealOpenMMVector& bornRadii,
const RealOpenMMVector& bornForces, const RealOpenMMVector& bornForces,
const std::vector<OpenMM::RealVec>& forces, const std::vector<OpenMM::RealVec>& forces,
......
...@@ -317,7 +317,7 @@ class CpuGBVISoftcore { ...@@ -317,7 +317,7 @@ class CpuGBVISoftcore {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMMVector& partialCharges, void printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMM* partialCharges,
const RealOpenMMVector& bornRadii, const RealOpenMMVector& bornRadii,
const RealOpenMMVector& bornForces, const RealOpenMMVector& bornForces,
const std::vector<OpenMM::RealVec>& forces, const std::vector<OpenMM::RealVec>& forces,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment