/* -------------------------------------------------------------------------- * * OpenMMFreeEnergy * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Scott Le Grand, Peter Eastman, Mark Friedrichs * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ #ifdef WIN32 #define _USE_MATH_DEFINES /* M_PI */ #endif #define PARAMETER_PRINT 1 #define MAX_PARAMETER_PRINT 10 #include "openmm/OpenMMException.h" #include "cudaKernels.h" #include "GpuFreeEnergyCudaKernels.h" #include "freeEnergyGpuTypes.h" // for some reason, these are not being included w/ cudaKernels.h on Windows //extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu); #include #include #include #include #include #include #ifdef WIN32 #include #else #include #endif using std::vector; extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu ){ // allocate and zero block freeEnergyGpuContext freeEnergyGpu = new _freeEnergyGpuContext; memset( freeEnergyGpu, 0, sizeof( struct _freeEnergyGpuContext ) ); freeEnergyGpu->gpuContext = gpu; return freeEnergyGpu; } extern "C" void gpuPrintCudaStream( std::string name, unsigned int length, unsigned int subStreams, unsigned int stride, unsigned int memoryFootprint, void* pSysStream, void* pDevStream, void* pSysData, void* pDevData, FILE* log) { (void) fprintf( log, " %-35s [%8u %5u %8u %8u] Stream[%p %p] Data[%16p %16p]\n", name.c_str(), length, subStreams, stride, memoryFootprint, pSysStream, pDevStream, pSysData, pDevData ); } extern "C" int gpuPrintCudaStreamFloat( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ); } extern "C" int gpuPrintCudaStreamFloat2( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( float ); } extern "C" int gpuPrintCudaStreamFloat4( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( float ); } extern "C" int gpuPrintCudaStreamUnsignedInt( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ); } extern "C" int gpuPrintCudaStreamInt( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ); } extern "C" int gpuPrintCudaStreamInt2( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( int ); } extern "C" int gpuPrintCudaStreamInt4( CUDAStream* cUDAStream, FILE* log ) { if( cUDAStream == NULL )return 0; gpuPrintCudaStream( cUDAStream->_name.c_str(), cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride, cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ), static_cast(cUDAStream->_pSysStream), static_cast(cUDAStream->_pDevStream), static_cast(cUDAStream->_pSysData), static_cast(cUDAStream->_pDevData), log ); return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( int ); } extern "C" void gpuPrintCudaFreeEnergyGmxSimulation(freeEnergyGpuContext freeEnergyGpu, FILE* log ) { if( log == NULL )return; _gpuContext* gpu = freeEnergyGpu->gpuContext; int totalMemory = 0; (void) fprintf( log, "cudaFreeEnergyGmxSimulation:\n\n" ); (void) fprintf( log, "\n" ); (void) fprintf( log, " numberOfAtoms %u\n", gpu->natoms ); (void) fprintf( log, " paddedNumberOfAtoms %u\n", gpu->sim.paddedNumberOfAtoms ); (void) fprintf( log, "\n\n" ); (void) fprintf( log, " gpuContext %p\n", freeEnergyGpu->gpuContext ); (void) fprintf( log, " log %p %s\n", freeEnergyGpu->log, freeEnergyGpu->log == stderr ? "is stderr" : "is not stderr"); (void) fprintf( log, " sm_version %u\n", gpu->sm_version ); (void) fprintf( log, " device %u\n", gpu->device ); (void) fprintf( log, " sharedMemoryPerBlock %u\n", gpu->sharedMemoryPerBlock ); (void) fprintf( log, " bOutputBufferPerWarp %d\n", gpu->bOutputBufferPerWarp ); (void) fprintf( log, " blocks %u\n", gpu->sim.blocks ); (void) fprintf( log, " threads_per_block %u\n", gpu->sim.threads_per_block); (void) fprintf( log, " update_threads_per_block %u\n", gpu->sim.update_threads_per_block); (void) fprintf( log, " nonbondBlocks %u\n", gpu->sim.nonbond_blocks ); (void) fprintf( log, " nonbondThreadsPerBlock %u\n", gpu->sim.nonbond_threads_per_block); (void) fprintf( log, " bsf_reduce_threads_per_block %u\n", gpu->sim.bsf_reduce_threads_per_block); (void) fprintf( log, " nonbondOutputBuffers %u\n", gpu->sim.nonbondOutputBuffers ); (void) fprintf( log, " outputBuffers %u\n", gpu->sim.outputBuffers ); totalMemory += gpuPrintCudaStreamFloat( freeEnergyGpu->gpuContext->psEnergy, log ); totalMemory += gpuPrintCudaStreamFloat4( freeEnergyGpu->gpuContext->psForce4, log ); (void) fflush( log ); } extern "C" void freeEnergyGpuShutDown( freeEnergyGpuContext freeEnergyGpu ){ if( freeEnergyGpu->log ){ (void) fprintf( freeEnergyGpu->log, "freeEnergyGpuShutDown called.\n" ); (void) fflush( freeEnergyGpu->log ); } // free free energy Cuda arrays delete freeEnergyGpu->psLJ14ID; delete freeEnergyGpu->psLJ14Parameter; delete freeEnergyGpu->psSigEps4; delete freeEnergyGpu->psSwitchDerivative; delete freeEnergyGpu->psNonPolarScalingFactors; delete freeEnergyGpu; return; } extern "C" void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){ if( freeEnergyGpu->log ){ (void) fprintf( freeEnergyGpu->log, "FreeEnergyGpuSetConstants called\n" ); (void) fflush( freeEnergyGpu->log ); } SetCalculateLocalSoftcoreGpuSim( freeEnergyGpu ); SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpu ); SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpu ); SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpu ); SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpu ); SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpu ); SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpu ); } /**--------------------------------------------------------------------------------------- Get threads/block @param amoebaGpu amoebaGpuContext @param sharedMemoryPerThread shared memory/thread @param sharedMemoryPerBlock shared memory/block @return threadsPerBlock --------------------------------------------------------------------------------------- */ extern "C" unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock ) { unsigned int grid = freeEnergyGpu->gpuContext->grid; unsigned int threadsPerBlock = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread); threadsPerBlock = threadsPerBlock < 1 ? 1 : threadsPerBlock; threadsPerBlock *= grid; return threadsPerBlock; } static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){ *x = cellCode >> 17; *y = (cellCode >> 2 ) & 0x7FFF; *exclusion = (cellCode & 1) ? 1 : 0; return 0; } void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactingWorkUnit ){ gpuContext gpu = freeEnergyGpu->gpuContext; gpu->psWorkUnit->Download(); gpu->psInteractingWorkUnit->Download(); gpu->psInteractionFlag->Download(); unsigned int totalWarps = (gpu->sim.nonbond_blocks*gpu->sim.nonbond_threads_per_block)/GRID; //unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; //unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int numWorkUnits = gpu->psInteractionCount->_pSysData[0]; (void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u warpOutput=%d\n", totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID, numWorkUnits, gpu->bOutputBufferPerWarp ); unsigned int maxPrint = 3; std::stringstream message; char buffer[2048]; unsigned int targetAtom = 18; for( unsigned int ii = 0; ii < gpu->sim.nonbond_blocks; ii++ ) { unsigned int blockId = ii; for( unsigned int jj = 0; jj < gpu->sim.nonbond_threads_per_block; jj++ ) { unsigned int warp = (ii*gpu->sim.nonbond_threads_per_block+jj)/GRID; unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int print = 0; while( pos < end ){ unsigned int x, y, exclusion, flags; int flagInt; if( interactingWorkUnit ){ decodeCell( gpu->psInteractingWorkUnit->_pSysData[pos], &x, &y, &exclusion ); flags = gpu->psInteractionFlag->_pSysData[pos]; if( flags == 0xFFFFFFFF ){ flagInt = -2; } else { flagInt = flags; } } else { decodeCell( gpu->psWorkUnit->_pSysData[pos], &x, &y, &exclusion ); flagInt = -1; } x *= GRID; y *= GRID; unsigned int bufferX = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (x >> GRIDBITS) * gpu->sim.stride; unsigned int bufferY = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (y >> GRIDBITS) * gpu->sim.stride; bufferX /= gpu->sim.paddedNumberOfAtoms; bufferY /= gpu->sim.paddedNumberOfAtoms; if( jj == 1 ){ //if( bufferX >= 62 || bufferY >= 62 ){ (void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] bufXY[%4u %4u]", ii, jj, warp, pos, end, bufferX, bufferY ); message << buffer; (void) sprintf( buffer, " x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion ); message << buffer; if( interactingWorkUnit ){ (void) sprintf( buffer, " Flg=%d (-2=all) %u", flagInt, flags ); } message << buffer; message << std::endl; } pos++; } } } (void) fprintf( stderr, "%s\n\n", message.str().c_str() ); }