/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Scott Le Grand, Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include #include #include #include #include #include #include using namespace std; #include "gputypes.h" #define FABS(a) ((a) > 0.0f ? (a) : -(a)) static __constant__ cudaGmxSimulation cSim; void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu) { cudaError_t status; status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); RTERROR(status, "cudaMemcpyToSymbol: SetForcesSim copy to cSim failed"); } void GetForcesSim(gpuContext gpu) { cudaError_t status; status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed"); } __global__ __launch_bounds__(384, 1) void kClearForces_kernel() { unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; while (pos < cSim.stride * cSim.outputBuffers) { cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); pos += gridDim.x * blockDim.x; } } void OPENMMCUDA_EXPORT kClearForces(gpuContext gpu) { // printf("kClearForces\n"); kClearForces_kernel<<sim.blocks, 384>>>(); LAUNCHERROR("kClearForces"); } __global__ __launch_bounds__(384, 1) void kClearBornSumAndForces_kernel() { unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; while (pos < cSim.stride * cSim.nonbondOutputBuffers) { cSim.pBornSum[pos] = 0.0f; cSim.pBornForce[pos] = 0.0f; cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); pos += gridDim.x * blockDim.x; } while (pos < cSim.stride * cSim.outputBuffers) { cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); pos += gridDim.x * blockDim.x; } } void kClearBornSumAndForces(gpuContext gpu) { // printf("kClearBornSumAndForces\n"); kClearBornSumAndForces_kernel<<sim.blocks, 384>>>(); LAUNCHERROR("kClearBornSumAndForces"); } __global__ __launch_bounds__(384, 1) void kClearEnergy_kernel() { unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; while (pos < cSim.energyOutputBuffers) { ((float*)cSim.pEnergy)[pos] = 0.0f; pos += gridDim.x * blockDim.x; } } void kClearEnergy(gpuContext gpu) { // printf("kClearEnergy\n"); kClearEnergy_kernel<<sim.blocks, 384>>>(); LAUNCHERROR("kClearEnergy"); } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif void kReduceBornSumAndForces_kernel() { unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); // Reduce forces while (pos < cSim.stride4) { float totalForce = 0.0f; float* pFt = (float*)cSim.pForce4 + pos; int i = cSim.outputBuffers; while (i >= 4) { float f1 = *pFt; pFt += cSim.stride4; float f2 = *pFt; pFt += cSim.stride4; float f3 = *pFt; pFt += cSim.stride4; float f4 = *pFt; pFt += cSim.stride4; totalForce += f1 + f2 + f3 + f4; i -= 4; } if (i >= 2) { float f1 = *pFt; pFt += cSim.stride4; float f2 = *pFt; pFt += cSim.stride4; totalForce += f1 + f2; i -= 2; } if (i > 0) { totalForce += *pFt; } pFt = (float*)cSim.pForce4 + pos; *pFt = totalForce; pos += gridDim.x * blockDim.x; } // Reduce Born Sum while (pos - cSim.stride4 < cSim.atoms) { float sum = 0.0f; float* pSt = cSim.pBornSum + pos - cSim.stride4; float2 atom = cSim.pObcData[pos - cSim.stride4]; // Get summed Born data int i = cSim.nonbondOutputBuffers; while (i >= 4) { float f1 = *pSt; pSt += cSim.stride; float f2 = *pSt; pSt += cSim.stride; float f3 = *pSt; pSt += cSim.stride; float f4 = *pSt; pSt += cSim.stride; sum += f1 + f2 + f3 + f4; i -= 4; } if (i >= 2) { float f1 = *pSt; pSt += cSim.stride; float f2 = *pSt; pSt += cSim.stride; sum += f1 + f2; i -= 2; } if (i > 0) { sum += *pSt; } // Now calculate Born radius and OBC term. cSim.pBornSum[pos - cSim.stride4] = sum; sum *= 0.5f * atom.x; float sum2 = sum * sum; float sum3 = sum * sum2; float tanhSum = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3); float nonOffsetRadii = atom.x + cSim.dielectricOffset; float bornRadius = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii); float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2); obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii; cSim.pBornRadii[pos - cSim.stride4] = bornRadius; cSim.pObcChain[pos - cSim.stride4] = obcChain; pos += gridDim.x * blockDim.x; } } void kReduceBornSumAndForces(gpuContext gpu) { //printf("kReduceBornSumAndForces\n"); kReduceBornSumAndForces_kernel<<sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>(); LAUNCHERROR("kReduceBornSumAndForces"); } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif void kReduceForces_kernel() { unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); // Reduce forces while (pos < cSim.stride4) { float totalForce = 0.0f; float* pFt = (float*)cSim.pForce4 + pos; int i = cSim.outputBuffers; while (i >= 4) { float f1 = *pFt; pFt += cSim.stride4; float f2 = *pFt; pFt += cSim.stride4; float f3 = *pFt; pFt += cSim.stride4; float f4 = *pFt; pFt += cSim.stride4; totalForce += f1 + f2 + f3 + f4; i -= 4; } if (i >= 2) { float f1 = *pFt; pFt += cSim.stride4; float f2 = *pFt; pFt += cSim.stride4; totalForce += f1 + f2; i -= 2; } if (i > 0) { totalForce += *pFt; } pFt = (float*)cSim.pForce4 + pos; *pFt = totalForce; pos += gridDim.x * blockDim.x; } } void OPENMMCUDA_EXPORT kReduceForces(gpuContext gpu) { // printf("kReduceForces\n"); kReduceForces_kernel<<sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>(); LAUNCHERROR("kReduceForces"); } double kReduceEnergy(gpuContext gpu) { //printf("kReduceEnergy\n"); gpu->psEnergy->Download(); double sum = 0.0; for (int i = 0; i < gpu->sim.energyOutputBuffers; i++){ sum += (*gpu->psEnergy)[i]; } return sum; } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif void kReduceObcGbsaBornForces_kernel() { unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); float energy = 0.0f; while (pos < cSim.atoms) { float bornRadius = cSim.pBornRadii[pos]; float obcChain = cSim.pObcChain[pos]; float2 obcData = cSim.pObcData[pos]; float totalForce = 0.0f; float* pFt = cSim.pBornForce + pos; int i = cSim.nonbondOutputBuffers; while (i >= 4) { float f1 = *pFt; pFt += cSim.stride; float f2 = *pFt; pFt += cSim.stride; float f3 = *pFt; pFt += cSim.stride; float f4 = *pFt; pFt += cSim.stride; totalForce += f1 + f2 + f3 + f4; i -= 4; } if (i >= 2) { float f1 = *pFt; pFt += cSim.stride; float f2 = *pFt; pFt += cSim.stride; totalForce += f1 + f2; i -= 2; } if (i > 0) { totalForce += *pFt; } float r = (obcData.x + cSim.dielectricOffset + cSim.probeRadius); float ratio6 = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f); float saTerm = cSim.surfaceAreaFactor * r * r * ratio6; totalForce += saTerm / bornRadius; totalForce *= bornRadius * bornRadius * obcChain; energy += saTerm; pFt = cSim.pBornForce + pos; *pFt = totalForce; pos += gridDim.x * blockDim.x; } // correct for surface area factor of -6 cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f; } __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_THREADS_PER_BLOCK, 1) #endif void kReduceGBVIBornForces_kernel() { unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); float energy = 0.0f; while (pos < cSim.atoms) { float bornRadius = cSim.pBornRadii[pos]; float4 gbviData = cSim.pGBVIData[pos]; float switchDeriv = cSim.pGBVISwitchDerivative[pos]; float totalForce = 0.0f; float* pFt = cSim.pBornForce + pos; int i = cSim.nonbondOutputBuffers; while (i >= 4) { float f1 = *pFt; pFt += cSim.stride; float f2 = *pFt; pFt += cSim.stride; float f3 = *pFt; pFt += cSim.stride; float f4 = *pFt; pFt += cSim.stride; totalForce += f1 + f2 + f3 + f4; i -= 4; } if (i >= 2) { float f1 = *pFt; pFt += cSim.stride; float f2 = *pFt; pFt += cSim.stride; totalForce += f1 + f2; i -= 2; } if (i > 0) { totalForce += *pFt; } float ratio = (gbviData.x/bornRadius); float ratio3 = ratio*ratio*ratio; energy -= gbviData.z*ratio3; totalForce += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term float br2 = bornRadius*bornRadius; totalForce *= (1.0f/3.0f)*br2*br2*switchDeriv; pFt = cSim.pBornForce + pos; *pFt = totalForce; pos += gridDim.x * blockDim.x; } cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy; } void kReduceObcGbsaBornForces(gpuContext gpu) { //fprintf( stderr, "kReduceObcGbsaBornForces %u %u %u %u %u\n", gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block, GF1XX_THREADS_PER_BLOCK, // GT2XX_THREADS_PER_BLOCK, G8X_THREADS_PER_BLOCK ); fflush( stderr ); if( gpu->bIncludeGBSA ){ kReduceObcGbsaBornForces_kernel<<sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>(); LAUNCHERROR("kReduceObcGbsaBornForces"); } else if( gpu->bIncludeGBVI ){ kReduceGBVIBornForces_kernel<<sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>(); LAUNCHERROR("kReduceGBVIBornForces"); } }