/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#include "amoebaCudaKernels.h"
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaCudaUtilitiesSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaUtilitiesSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaUtilitiesSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
void GetCalculateAmoebaCudaUtilitiesSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaUtilitiesSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaUtilitiesSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
#undef METHOD_NAME
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kFindInteractingBlocks.h"
#undef METHOD_NAME
#undef USE_PERIODIC
#undef METHOD_NAME
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kFindInteractingBlocksVdw.h"
#undef METHOD_NAME
#undef USE_PERIODIC
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kClearFloat4_kernel( unsigned int bufferLength, float4* fieldToClear )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < bufferLength )
{
fieldToClear[pos].x = 0.0f;
fieldToClear[pos].y = 0.0f;
fieldToClear[pos].z = 0.0f;
fieldToClear[pos].w = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
__global__
__launch_bounds__(384, 1)
void kClearBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
cSim.pBornSum[pos] = 0.0f;
cSim.pBornForce[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearBornSum(gpuContext gpu)
{
// printf("kClearBornSum\n");
kClearBornSum_kernel<<sim.blocks, 384>>>();
LAUNCHERROR("kClearBornSum");
}
void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream* fieldToClear )
{
kClearFloat4_kernel<<gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevData );
LAUNCHERROR("kClearFloat4");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kClearFloat_kernel( unsigned int bufferLength, float* fieldToClear )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < bufferLength )
{
fieldToClear[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream* fieldToClear )
{
kClearFloat_kernel<<gpuContext->blocksPerSM, 384>>>( entries, fieldToClear->_pDevData );
LAUNCHERROR("kClearFloat");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kClearFields_kernel( unsigned int bufferLength, float* EField )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < bufferLength )
{
EField[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
// clear psWorkArray_3_1 & psWorkArray_3_2
void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear )
{
gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_kernel<<sim.blocks, gpu->sim.threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_1->_pDevData );
LAUNCHERROR("kClearFields_3_1");
if( numberToClear > 1 ){
kClearFields_kernel<<sim.nonbond_blocks, gpu->sim.threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers, amoebaGpu->psWorkArray_3_2->_pDevData );
LAUNCHERROR("kClearFields_3_2");
} else {
return;
}
if( numberToClear > 2 ){
kClearFields_kernel<<sim.nonbond_blocks, gpu->sim.threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_3->_pDevData );
LAUNCHERROR("kClearFields_3_3");
} else {
return;
}
if( numberToClear > 3 ){
kClearFields_kernel<<sim.nonbond_blocks, gpu->sim.threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*3*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_4->_pDevData );
LAUNCHERROR("kClearFields_3_4");
}
}
// clear psWorkArray_1_1 & psWorkArray_1_2
void kClearFields_1( amoebaGpuContext amoebaGpu )
{
gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_kernel<<sim.nonbond_blocks, gpu->sim.threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData );
LAUNCHERROR("kClearFields_1_1");
kClearFields_kernel<<sim.nonbond_blocks, gpu->sim.threads_per_block>>>( gpu->sim.paddedNumberOfAtoms*gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_2->_pDevData );
LAUNCHERROR("kClearFields_1_2");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut, int addTo )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
// Reduce field
while (pos < fieldComponents)
{
float totalField = addTo ? fieldOut[pos] : 0.0f;
float* pFt = fieldIn + pos;
unsigned int i = outputBuffers;
while (i >= 4)
{
totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
pFt += fieldComponents*4;
i -= 4;
}
if (i >= 2)
{
totalField += pFt[0] + pFt[fieldComponents];
pFt += fieldComponents*2;
i -= 2;
}
if (i > 0)
{
totalField += pFt[0];
}
fieldOut[pos] = totalField;
pos += gridDim.x * blockDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceAndCombineFields_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2, float* fieldOut )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
// Reduce field
while (pos < fieldComponents)
{
float totalField = 0.0f;
float* pFt1 = fieldIn1 + pos;
float* pFt2 = fieldIn2 + pos;
unsigned int i = outputBuffers;
while (i >= 4)
{
totalField += pFt1[0] + pFt1[fieldComponents] + pFt1[2*fieldComponents] + pFt1[3*fieldComponents];
totalField += pFt2[0] + pFt2[fieldComponents] + pFt2[2*fieldComponents] + pFt2[3*fieldComponents];
pFt1 += fieldComponents*4;
pFt2 += fieldComponents*4;
i -= 4;
}
if (i >= 2)
{
totalField += pFt1[0] + pFt1[fieldComponents];
totalField += pFt2[0] + pFt2[fieldComponents];
pFt1 += fieldComponents*2;
pFt2 += fieldComponents*2;
i -= 2;
}
if (i > 0)
{
totalField += pFt1[0];
totalField += pFt2[0];
}
fieldOut[pos] = totalField;
pos += gridDim.x * blockDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceFieldsToFloat4_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float4* field4Out )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
// Reduce field
float* fieldOut = (float*) field4Out;
while (pos < fieldComponents)
{
float totalField = 0.0f;
float* pFt = fieldIn + pos;
unsigned int i = outputBuffers;
while (i >= 4)
{
totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
pFt += fieldComponents*4;
i -= 4;
}
if (i >= 2)
{
totalField += pFt[0] + pFt[fieldComponents];
pFt += fieldComponents*2;
i -= 2;
}
if (i > 0)
{
totalField += pFt[0];
}
unsigned int j = pos/3;
unsigned int k = pos - 3*j;
fieldOut[4*j+k] += totalField;
pos += gridDim.x * blockDim.x;
}
}