/* -------------------------------------------------------------------------- * * OpenMMFreeEnergy * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Scott Le Grand, Peter Eastman, Mark Friedrichs * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ #ifdef WIN32 #define _USE_MATH_DEFINES /* M_PI */ #endif #define PARAMETER_PRINT 1 #define MAX_PARAMETER_PRINT 10 #include "openmm/OpenMMException.h" #include "cudaKernels.h" #include "GpuFreeEnergyCudaKernels.h" #include "freeEnergyGpuTypes.h" // for some reason, these are not being included w/ cudaKernels.h on Windows //extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu); #include #include #include #include #include #include #include #ifdef WIN32 #include #else #include #endif using std::vector; extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu ){ // allocate and zero block freeEnergyGpuContext freeEnergyGpu = new _freeEnergyGpuContext; memset( freeEnergyGpu, 0, sizeof( struct _freeEnergyGpuContext ) ); freeEnergyGpu->gpuContext = gpu; return freeEnergyGpu; } extern "C" void gpuPrintCudaStream( std::string name, unsigned int length, unsigned int subStreams, unsigned int stride, unsigned int memoryFootprint, void* pSysStream, void* pDevStream, void* pSysData, void* pDevData, FILE* log) { (void) fprintf( log, " %-35s [%8u %5u %8u %8u] Stream[%p %p] Data[%16p %16p]\n", name.c_str(), length, subStreams, stride, memoryFootprint, pSysStream, pDevStream, pSysData, pDevData ); } extern "C" int gpuPrintCudaStreamFloat( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ); } extern "C" int gpuPrintCudaStreamFloat2( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( float ); } extern "C" int gpuPrintCudaStreamFloat4( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( float ); } extern "C" int gpuPrintCudaStreamUnsignedInt( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ); } extern "C" int gpuPrintCudaStreamInt( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ); } extern "C" int gpuPrintCudaStreamInt2( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( int ); } extern "C" int gpuPrintCudaStreamInt4( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( int ); } extern "C" void gpuPrintCudaFreeEnergyGmxSimulation(freeEnergyGpuContext freeEnergyGpu, FILE* log ) { if( log == NULL )return; _gpuContext* gpu = freeEnergyGpu->gpuContext; int totalMemory = 0; (void) fprintf( log, "cudaFreeEnergyGmxSimulation:\n\n" ); (void) fprintf( log, "\n" ); (void) fprintf( log, " numberOfAtoms %u\n", gpu->natoms ); (void) fprintf( log, " paddedNumberOfAtoms %u\n", gpu->sim.paddedNumberOfAtoms ); (void) fprintf( log, "\n\n" ); (void) fprintf( log, " gpuContext %p\n", freeEnergyGpu->gpuContext ); (void) fprintf( log, " log %p %s\n", freeEnergyGpu->log, freeEnergyGpu->log == stderr ? "is stderr" : "is not stderr"); (void) fprintf( log, " sm_version %u\n", gpu->sm_version ); (void) fprintf( log, " device %u\n", gpu->device ); (void) fprintf( log, " sharedMemoryPerBlock %u\n", gpu->sharedMemoryPerBlock ); (void) fprintf( log, " bOutputBufferPerWarp %d\n", gpu->bOutputBufferPerWarp ); (void) fprintf( log, " blocks %u\n", gpu->sim.blocks ); (void) fprintf( log, " threads_per_block %u\n", gpu->sim.threads_per_block); (void) fprintf( log, " update_threads_per_block %u\n", gpu->sim.update_threads_per_block); (void) fprintf( log, " nonbondBlocks %u\n", gpu->sim.nonbond_blocks ); (void) fprintf( log, " nonbondThreadsPerBlock %u\n", gpu->sim.nonbond_threads_per_block); (void) fprintf( log, " bsf_reduce_threads_per_block %u\n", gpu->sim.bsf_reduce_threads_per_block); (void) fprintf( log, " nonbondOutputBuffers %u\n", gpu->sim.nonbondOutputBuffers ); (void) fprintf( log, " outputBuffers %u\n", gpu->sim.outputBuffers ); totalMemory += gpuPrintCudaStreamFloat( freeEnergyGpu->gpuContext->psEnergy, log ); totalMemory += gpuPrintCudaStreamFloat4( freeEnergyGpu->gpuContext->psForce4, log ); (void) fflush( log ); } extern "C" void freeEnergyGpuShutDown( freeEnergyGpuContext freeEnergyGpu ){ if( freeEnergyGpu->log ){ (void) fprintf( freeEnergyGpu->log, "freeEnergyGpuShutDown called.\n" ); (void) fflush( freeEnergyGpu->log ); } // free free energy Cuda arrays delete freeEnergyGpu->psLJ14ID; delete freeEnergyGpu->psLJ14Parameter; delete freeEnergyGpu->psSigEps4; delete freeEnergyGpu->psSwitchDerivative; delete freeEnergyGpu->psNonPolarScalingFactors; delete freeEnergyGpu; return; } extern "C" void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){ if( freeEnergyGpu->log ){ (void) fprintf( freeEnergyGpu->log, "FreeEnergyGpuSetConstants called\n" ); (void) fflush( freeEnergyGpu->log ); } SetCalculateLocalSoftcoreGpuSim( freeEnergyGpu ); SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpu ); SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpu ); SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpu ); SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpu ); SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpu ); SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpu ); } extern "C" void freeEnergyGpuSetPeriodicBoxSize( freeEnergyGpuContext freeEnergyGpu, float xsize, float ysize, float zsize) { freeEnergyGpu->freeEnergySim.periodicBoxSizeX = xsize; freeEnergyGpu->freeEnergySim.periodicBoxSizeY = ysize; freeEnergyGpu->freeEnergySim.periodicBoxSizeZ = zsize; freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeX = 1.0f/xsize; freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeY = 1.0f/ysize; freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeZ = 1.0f/zsize; freeEnergyGpu->freeEnergySim.recipBoxSizeX = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeX; freeEnergyGpu->freeEnergySim.recipBoxSizeY = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeY; freeEnergyGpu->freeEnergySim.recipBoxSizeZ = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeZ; freeEnergyGpu->freeEnergySim.cellVolume = freeEnergyGpu->freeEnergySim.periodicBoxSizeX*freeEnergyGpu->freeEnergySim.periodicBoxSizeY*freeEnergyGpu->freeEnergySim.periodicBoxSizeZ; gpuSetPeriodicBoxSize( freeEnergyGpu->gpuContext, xsize, ysize, zsize ); } extern "C" void gpuSetNonbondedSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float epsfac, const std::vector& atom, const std::vector& c6, const std::vector& c12, const std::vector& q, const std::vector& softcoreLJLambdaArray, const std::vector& symbol, const std::vector >& exclusions, CudaFreeEnergyNonbondedMethod method, float cutoffDistance, float solventDielectric ){ unsigned int numberOfParticles = c6.size(); gpuContext gpu = freeEnergyGpu->gpuContext; int paddedNumberOfAtoms = gpu->sim.paddedNumberOfAtoms; // sanity checks if( paddedNumberOfAtoms < 1 ){ std::stringstream msg; msg << "gpuSetNonbondedSoftcoreParameters: number of padded atoms=" << gpu->sim.paddedNumberOfAtoms << " is less than 1."; throw OpenMM::OpenMMException( msg.str() ); } if( freeEnergyGpu->gpuContext->sim.atoms != numberOfParticles ){ std::stringstream msg; msg << "gpuSetNonbondedSoftcoreParameters: number of atoms in gpuContext does not match input count: " << freeEnergyGpu->gpuContext->sim.atoms << " " << numberOfParticles << "."; throw OpenMM::OpenMMException( msg.str() ); } freeEnergyGpu->freeEnergySim.epsfac = epsfac; freeEnergyGpu->freeEnergySim.nonbondedMethod = method; freeEnergyGpu->freeEnergySim.nonbondedCutoff = cutoffDistance; freeEnergyGpu->freeEnergySim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance; gpu->sim.nonbondedCutoff = cutoffDistance; gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance; if( cutoffDistance > 0.0f ){ freeEnergyGpu->freeEnergySim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f); freeEnergyGpu->freeEnergySim.reactionFieldC = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f); gpu->sim.reactionFieldK = freeEnergyGpu->freeEnergySim.reactionFieldK; gpu->sim.reactionFieldC = freeEnergyGpu->freeEnergySim.reactionFieldC; } else { freeEnergyGpu->freeEnergySim.reactionFieldK = 0.0f; freeEnergyGpu->freeEnergySim.reactionFieldC = 0.0f; } setExclusions( gpu, exclusions ); // parameters freeEnergyGpu->psSigEps4 = new CUDAStream( paddedNumberOfAtoms, 1, "freeEnergyGpuSigEps4"); freeEnergyGpu->freeEnergySim.pSigEps4 = freeEnergyGpu->psSigEps4->_pDevData; for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){ float p1 = 0.5f; float p2 = 0.0f; if( (c6[ii] > 0.0f) && (c12[ii] > 0.0f) ){ p1 = 0.5f * powf(c12[ii] / c6[ii], 1.0f / 6.0f); p2 = c6[ii] * sqrtf(1.0f / c12[ii]); } /* if (symbol.size() > 0) freeEnergyGpu->pAtomSymbol[ii] = symbol[ii]; */ (*freeEnergyGpu->psSigEps4)[ii].x = p1; (*freeEnergyGpu->psSigEps4)[ii].y = p2; (*freeEnergyGpu->psSigEps4)[ii].z = softcoreLJLambdaArray[ii]; (*freeEnergyGpu->psSigEps4)[ii].w = q[ii]; } // Dummy out extra atom data for( unsigned int ii = numberOfParticles; ii < paddedNumberOfAtoms; ii++ ){ (*freeEnergyGpu->psSigEps4)[ii].x = 1.0f; (*freeEnergyGpu->psSigEps4)[ii].y = 0.0f; (*freeEnergyGpu->psSigEps4)[ii].z = 0.0f; (*freeEnergyGpu->psSigEps4)[ii].w = 0.0f; (*gpu->psPosq4)[ii].x = 100000.0f + ii * 10.0f; (*gpu->psPosq4)[ii].y = 100000.0f + ii * 10.0f; (*gpu->psPosq4)[ii].z = 100000.0f + ii * 10.0f; (*gpu->psPosq4)[ii].w = 0.0f; } if( freeEnergyGpu->log ){ (void) fprintf( freeEnergyGpu->log,"freeEnergyGpuSetNonbondedSoftcoreParameters: %5u padded=%u epsfac=%14.7e method=%d cutoffDistance=%9.2f solventDielectric=%9.2f\n", numberOfParticles, freeEnergyGpu->gpuContext->sim.paddedNumberOfAtoms, epsfac, method, cutoffDistance, solventDielectric ); #ifdef PARAMETER_PRINT int maxPrint = MAX_PARAMETER_PRINT; for (unsigned int ii = 0; ii < numberOfParticles; ii++){ (void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n", ii, (*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w ); if( ii == maxPrint && ii < freeEnergyGpu->gpuContext->sim.paddedNumberOfAtoms - maxPrint ){ ii = numberOfParticles - maxPrint; } } unsigned int offset = paddedNumberOfAtoms - maxPrint; if( offset > 0 ){ if( offset > numberOfParticles ){ (void) fprintf( freeEnergyGpu->log,"Dummy padded entries\n" ); for (unsigned int ii = offset; ii < paddedNumberOfAtoms; ii++){ (void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n", ii, (*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w ); } } } #endif (void) fflush( freeEnergyGpu->log ); } // upload data to board freeEnergyGpu->psSigEps4->Upload(); gpu->psPosq4->Upload(); return; } /**--------------------------------------------------------------------------------------- Get threads/block @param amoebaGpu amoebaGpuContext @param sharedMemoryPerThread shared memory/thread @param sharedMemoryPerBlock shared memory/block @return threadsPerBlock --------------------------------------------------------------------------------------- */ extern "C" unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock ) { unsigned int grid = freeEnergyGpu->gpuContext->grid; unsigned int threadsPerBlock = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread); threadsPerBlock = threadsPerBlock < 1 ? 1 : threadsPerBlock; threadsPerBlock *= grid; return threadsPerBlock; } static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){ *x = cellCode >> 17; *y = (cellCode >> 2 ) & 0x7FFF; *exclusion = (cellCode & 1) ? 1 : 0; return 0; } void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactingWorkUnit ){ gpuContext gpu = freeEnergyGpu->gpuContext; gpu->psWorkUnit->Download(); gpu->psInteractingWorkUnit->Download(); gpu->psInteractionFlag->Download(); unsigned int totalWarps = (gpu->sim.nonbond_blocks*gpu->sim.nonbond_threads_per_block)/GRID; //unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; //unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = gpu->psInteractionCount->_pSysData[0]; (void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u warpOutput=%d\n", totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID, numWorkUnits, gpu->bOutputBufferPerWarp ); unsigned int maxPrint = 3; std::stringstream message; char buffer[2048]; unsigned int targetAtom = 18; for( unsigned int ii = 0; ii < gpu->sim.nonbond_blocks; ii++ ) { unsigned int blockId = ii; for( unsigned int jj = 0; jj < gpu->sim.nonbond_threads_per_block; jj++ ) { unsigned int warp = (ii*gpu->sim.nonbond_threads_per_block+jj)/GRID; unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int print = 0; while( pos < end ){ unsigned int x, y, exclusion, flags; int flagInt; if( interactingWorkUnit ){ decodeCell( gpu->psInteractingWorkUnit->_pSysData[pos], &x, &y, &exclusion ); flags = gpu->psInteractionFlag->_pSysData[pos]; if( flags == 0xFFFFFFFF ){ flagInt = -2; } else { flagInt = flags; } } else { decodeCell( gpu->psWorkUnit->_pSysData[pos], &x, &y, &exclusion ); flagInt = -1; } x *= GRID; y *= GRID; unsigned int bufferX = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (x >> GRIDBITS) * gpu->sim.stride; unsigned int bufferY = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (y >> GRIDBITS) * gpu->sim.stride; bufferX /= gpu->sim.paddedNumberOfAtoms; bufferY /= gpu->sim.paddedNumberOfAtoms; if( jj == 1 ){ //if( bufferX >= 62 || bufferY >= 62 ){ (void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] bufXY[%4u %4u]", ii, jj, warp, pos, end, bufferX, bufferY ); message << buffer; (void) sprintf( buffer, " x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion ); message << buffer; if( interactingWorkUnit ){ (void) sprintf( buffer, " Flg=%d (-2=all) %u", flagInt, flags ); } message << buffer; message << std::endl; } pos++; } } } (void) fprintf( stderr, "%s\n\n", message.str().c_str() ); }