/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Scott Le Grand, Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include "amoebaScaleFactors.h" __global__ #if (__CUDA_ARCH__ >= 200) __launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1) #elif (__CUDA_ARCH__ >= 120) __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1) #else __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) #endif void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit ){ extern __shared__ GrycukChainRuleParticle sAChainRule[]; unsigned int totalWarps = gridDim.x*blockDim.x/GRID; unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; unsigned int numWorkUnits = cSim.pInteractionCount[0]; unsigned int pos = warp*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int lasty = 0xFFFFFFFF; while (pos < end) { unsigned int x; unsigned int y; bool bExclusionFlag; // Extract cell coordinates decodeCell( workUnit[pos], &x, &y, &bExclusionFlag ); unsigned int tgx = threadIdx.x & (GRID - 1); unsigned int tbx = threadIdx.x - tgx; unsigned int tj = tgx; GrycukChainRuleParticle* psAChainRule = &sAChainRule[tbx]; unsigned int atomI = x + tgx; GrycukChainRuleParticle localParticle; loadGrycukChainRuleParticleShared( &localParticle, atomI ); zeroGrycukChainRuleParticleSharedField( &localParticle ); if (x == y){ // load shared data and zero force loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomI ); zeroGrycukChainRuleParticleSharedField( &(sAChainRule[threadIdx.x])); for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1)) { float localForce[3]; calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce); if( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ){ localParticle.force[0] -= localForce[0]; localParticle.force[1] -= localForce[1]; localParticle.force[2] -= localForce[2]; psAChainRule[j].force[0] += localForce[0]; psAChainRule[j].force[1] += localForce[1]; psAChainRule[j].force[2] += localForce[2]; } } // Write results float4 of; #ifdef USE_OUTPUT_BUFFER_PER_WARP unsigned int offset = x + tgx + warp*cSim.stride; #else unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride; #endif of = cSim.pForce4[offset]; of.x += localParticle.force[0] + sAChainRule[threadIdx.x].force[0]; of.y += localParticle.force[1] + sAChainRule[threadIdx.x].force[1]; of.z += localParticle.force[2] + sAChainRule[threadIdx.x].force[2]; cSim.pForce4[offset] = of; } else { if (lasty != y) { unsigned int atomJ = y + tgx; loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomJ ); } // zero shared fields zeroGrycukChainRuleParticleSharedField( &(sAChainRule[threadIdx.x]) ); for (unsigned int j = 0; j < GRID; j++) { if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){ float localForce[3]; calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce ); localParticle.force[0] -= localForce[0]; localParticle.force[1] -= localForce[1]; localParticle.force[2] -= localForce[2]; psAChainRule[tj].force[0] += localForce[0]; psAChainRule[tj].force[1] += localForce[1]; psAChainRule[tj].force[2] += localForce[2]; calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce); localParticle.force[0] += localForce[0]; localParticle.force[1] += localForce[1]; localParticle.force[2] += localForce[2]; psAChainRule[tj].force[0] -= localForce[0]; psAChainRule[tj].force[1] -= localForce[1]; psAChainRule[tj].force[2] -= localForce[2]; } tj = (tj + 1) & (GRID - 1); } // Write results float4 of; #ifdef USE_OUTPUT_BUFFER_PER_WARP unsigned int offset = x + tgx + warp*cSim.stride; #else unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride; #endif of = cSim.pForce4[offset]; of.x += localParticle.force[0]; of.y += localParticle.force[1]; of.z += localParticle.force[2]; cSim.pForce4[offset] = of; #ifdef USE_OUTPUT_BUFFER_PER_WARP offset = y + tgx + warp*cSim.stride; #else offset = y + tgx + (x >> GRIDBITS) * cSim.stride; #endif of = cSim.pForce4[offset]; of.x += sAChainRule[threadIdx.x].force[0]; of.y += sAChainRule[threadIdx.x].force[1]; of.z += sAChainRule[threadIdx.x].force[2]; cSim.pForce4[offset] = of; lasty = y; } pos++; } }