Commit 1010df33 authored by Peter Eastman's avatar Peter Eastman
Browse files

Checking in Cuda implementation of explicit solvent

parent df4b64cb
#--------------------------------------------------- #---------------------------------------------------
# OpenMM CUDA Platform # OpenMM CUDA Platform
# #
# Creates OpenMM library, base name=OpenMMCuda. # Creates OpenMM library, base name=OpenMMCuda.
# Default libraries are shared & optimized. Variants # Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d). # are created for static (_static) and debug (_d).
# #
# Windows: # Windows:
# OpenMMCuda[_d].dll # OpenMMCuda[_d].dll
# OpenMMCuda[_d].lib # OpenMMCuda[_d].lib
# OpenMMCuda_static[_d].lib # OpenMMCuda_static[_d].lib
# Unix: # Unix:
# libOpenMMCuda[_d].so # libOpenMMCuda[_d].so
# libOpenMMCuda_static[_d].a # libOpenMMCuda_static[_d].a
#---------------------------------------------------- #----------------------------------------------------
SUBDIRS (tests) SUBDIRS (tests)
# The source is organized into subdirectories, but we handle them all from # The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS. # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET(OPENMM_SOURCE_SUBDIRS .) SET(OPENMM_SOURCE_SUBDIRS .)
# Collect up information about the version of the OpenMM library we're building # Collect up information about the version of the OpenMM library we're building
# and make it available to the code so it can be built into the binaries. # and make it available to the code so it can be built into the binaries.
SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda) SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME}) SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static) SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
# Ensure that debug libraries have "_d" appended to their names. # Ensure that debug libraries have "_d" appended to their names.
# CMake gets this right on Windows automatically with this definition. # CMake gets this right on Windows automatically with this definition.
IF (${CMAKE_GENERATOR} MATCHES "Visual Studio") IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE) SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio") ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
# But on Unix or Cygwin we have to add the suffix manually # But on Unix or Cygwin we have to add the suffix manually
IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(SHARED_TARGET ${SHARED_TARGET}_d) SET(SHARED_TARGET ${SHARED_TARGET}_d)
SET(STATIC_TARGET ${STATIC_TARGET}_d) SET(STATIC_TARGET ${STATIC_TARGET}_d)
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
# These are all the places to search for header files which are # These are all the places to search for header files which are
# to be part of the API. # to be part of the API.
SET(API_INCLUDE_DIRS) # start empty SET(API_INCLUDE_DIRS) # start empty
FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
# append # append
SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS} SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal) ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
ENDFOREACH(subdir) ENDFOREACH(subdir)
# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS, # We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
# and absolute pathnames. # and absolute pathnames.
SET(API_REL_INCLUDE_FILES) # start these out empty SET(API_REL_INCLUDE_FILES) # start these out empty
SET(API_ABS_INCLUDE_FILES) SET(API_ABS_INCLUDE_FILES)
FOREACH(dir ${API_INCLUDE_DIRS}) FOREACH(dir ${API_INCLUDE_DIRS})
FILE(GLOB fullpaths ${dir}/*.h) # returns full pathnames FILE(GLOB fullpaths ${dir}/*.h) # returns full pathnames
SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths}) SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
FOREACH(pathname ${fullpaths}) FOREACH(pathname ${fullpaths})
GET_FILENAME_COMPONENT(filename ${pathname} NAME) GET_FILENAME_COMPONENT(filename ${pathname} NAME)
SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename}) SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
ENDFOREACH(pathname) ENDFOREACH(pathname)
ENDFOREACH(dir) ENDFOREACH(dir)
# collect up source files # collect up source files
SET(SOURCE_FILES) # empty SET(SOURCE_FILES) # empty
SET(SOURCE_INCLUDE_FILES) SET(SOURCE_INCLUDE_FILES)
FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
FILE(GLOB src_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*/*.cpp) FILE(GLOB src_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*/*.cpp)
FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h) FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
SET(SOURCE_FILES ${SOURCE_FILES} ${src_files}) #append SET(SOURCE_FILES ${SOURCE_FILES} ${src_files}) #append
SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files}) SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
ENDFOREACH(subdir) ENDFOREACH(subdir)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake) SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)
SUBDIRS (sharedTarget staticTarget) IF (APPLE)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/mac)
ELSE (APPLE)
IF (WIN32)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win)
INSTALL_FILES(/lib FILES ${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win/cudpp32.dll)
ELSE (WIN32)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/linux)
ENDIF (WIN32)
ENDIF(APPLE)
SUBDIRS (sharedTarget staticTarget)
...@@ -39,42 +39,40 @@ ...@@ -39,42 +39,40 @@
using namespace OpenMM; using namespace OpenMM;
StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const { StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
if (name == "particlePositions") { if (name == "particlePositions") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f}; float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu); return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu);
} }
if (name == "particleVelocities") { if (name == "particleVelocities") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {0.0f, 0.0f, 0.0f, 0.0f}; float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu); return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu);
} }
if (name == "particleForces") { if (name == "particleForces") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {0.0f, 0.0f, 0.0f, 0.0f}; float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu); return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu);
} }
switch (type) { switch (type) {
case Stream::Float: case Stream::Float:
case Stream::Double: case Stream::Double:
return new CudaStreamImpl<float1>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float1>(name, size, type, platform, 1, data.gpu);
case Stream::Float2: case Stream::Float2:
case Stream::Double2: case Stream::Double2:
return new CudaStreamImpl<float2>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float2>(name, size, type, platform, 1, data.gpu);
case Stream::Float3: case Stream::Float3:
case Stream::Double3: case Stream::Double3:
return new CudaStreamImpl<float3>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float3>(name, size, type, platform, 1, data.gpu);
case Stream::Float4: case Stream::Float4:
case Stream::Double4: case Stream::Double4:
return new CudaStreamImpl<float4>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float4>(name, size, type, platform, 1, data.gpu);
case Stream::Integer: case Stream::Integer:
return new CudaStreamImpl<int1>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int1>(name, size, type, platform, 1, data.gpu);
case Stream::Integer2: case Stream::Integer2:
return new CudaStreamImpl<int2>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int2>(name, size, type, platform, 1, data.gpu);
case Stream::Integer3: case Stream::Integer3:
return new CudaStreamImpl<int3>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int3>(name, size, type, platform, 1, data.gpu);
case Stream::Integer4: case Stream::Integer4:
return new CudaStreamImpl<int4>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int4>(name, size, type, platform, 1, data.gpu);
} }
throw OpenMMException("Tried to create a Stream with an illegal DataType."); throw OpenMMException("Tried to create a Stream with an illegal DataType.");
} }
...@@ -55,7 +55,7 @@ public: ...@@ -55,7 +55,7 @@ public:
private: private:
void initType(); void initType();
CUDAStream<T>* stream; CUDAStream<T>* stream;
_gpuContext* gpu; _gpuContext* gpu;
bool ownStream; bool ownStream;
int width, rowOffset; int width, rowOffset;
float paddingValues[4]; float paddingValues[4];
...@@ -132,58 +132,60 @@ CudaStreamImpl<T>::~CudaStreamImpl() { ...@@ -132,58 +132,60 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
template <class T> template <class T>
void CudaStreamImpl<T>::loadFromArray(const void* array) { void CudaStreamImpl<T>::loadFromArray(const void* array) {
float* data = reinterpret_cast<float*>(stream->_pSysData); float* data = reinterpret_cast<float*>(stream->_pSysData);
int* order = gpu->psAtomIndex->_pSysData;
if (baseType == Stream::Float) { if (baseType == Stream::Float) {
float* arrayData = (float*) array; float* arrayData = (float*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = arrayData[i*width+j]; data[i*rowOffset+j] = arrayData[order[i]*width+j];
} }
else if (baseType == Stream::Double) { else if (baseType == Stream::Double) {
double* arrayData = (double*) array; double* arrayData = (double*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j]; data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
} }
else { else {
int* arrayData = (int*) array; int* arrayData = (int*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j]; data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
} }
for (int i = getSize(); i < (int) stream->_length; ++i) for (int i = getSize(); i < (int) stream->_length; ++i)
for (int j = 0; j < rowOffset; ++j) for (int j = 0; j < rowOffset; ++j)
data[i*rowOffset+j] = paddingValues[j]; data[i*rowOffset+j] = paddingValues[j];
stream->Upload(); stream->Upload();
// VisualStudio compiler did not like stream == gpu->psPosq4 // VisualStudio compiler did not like stream == gpu->psPosq4
//if( gpu && stream == gpu->psPosq4 ){ //if( gpu && stream == gpu->psPosq4 ){
if( gpu && getName() == "particlePositions" ){ if( gpu && getName() == "particlePositions" ){
gpu->bRecalculateBornRadii = true; gpu->bRecalculateBornRadii = true;
} }
} }
template <class T> template <class T>
void CudaStreamImpl<T>::saveToArray(void* array) { void CudaStreamImpl<T>::saveToArray(void* array) {
stream->Download(); stream->Download();
float* data = reinterpret_cast<float*>(stream->_pSysData); float* data = reinterpret_cast<float*>(stream->_pSysData);
int* order = gpu->psAtomIndex->_pSysData;
if (baseType == Stream::Float) { if (baseType == Stream::Float) {
float* arrayData = (float*) array; float* arrayData = (float*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
arrayData[i*width+j] = data[i*rowOffset+j]; arrayData[order[i]*width+j] = data[i*rowOffset+j];
} }
else if (baseType == Stream::Double) { else if (baseType == Stream::Double) {
double* arrayData = (double*) array; double* arrayData = (double*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
arrayData[i*width+j] = data[i*rowOffset+j]; arrayData[order[i]*width+j] = data[i*rowOffset+j];
} }
else { else {
int* arrayData = (int*) array; int* arrayData = (int*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
arrayData[i*width+j] = (int) data[i*rowOffset+j]; arrayData[order[i]*width+j] = (int) data[i*rowOffset+j];
} }
} }
......
...@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu); ...@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu); extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu); extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu); extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCDLJForces_12(gpuContext gpu);
extern void kCalculateObcGbsaForces1(gpuContext gpu); extern void kCalculateObcGbsaForces1(gpuContext gpu);
extern void kCalculateObcGbsaForces1_12(gpuContext gpu); extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
extern void kReduceObcGbsaBornForces(gpuContext gpu); extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu); extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu); extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu); extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu); extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kUpdatePart1(gpuContext gpu); extern void kUpdatePart1(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu); extern void kApplyFirstShake(gpuContext gpu);
extern void kApplyFirstSettle(gpuContext gpu);
extern void kUpdatePart2(gpuContext gpu); extern void kUpdatePart2(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu); extern void kApplySecondShake(gpuContext gpu);
extern void kApplySecondSettle(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu); extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu); extern void kVerletUpdatePart2(gpuContext gpu);
extern void kBrownianUpdatePart1(gpuContext gpu); extern void kBrownianUpdatePart1(gpuContext gpu);
...@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu); ...@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
// Initializers // Initializers
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu); extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu); extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu); extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu); extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu); extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu); extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu); extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
...@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu); ...@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu); extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu); extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu); extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu); extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu); extern void GetForcesSim(gpuContext gpu);
extern void SetUpdateShakeHSim(gpuContext gpu); extern void SetUpdateShakeHSim(gpuContext gpu);
extern void GetUpdateShakeHSim(gpuContext gpu); extern void GetUpdateShakeHSim(gpuContext gpu);
extern void SetSettleSim(gpuContext gpu);
extern void GetSettleSim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu); extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu); extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu); extern void SetBrownianUpdateSim(gpuContext gpu);
......
...@@ -36,11 +36,12 @@ ...@@ -36,11 +36,12 @@
#include <limits> #include <limits>
#include <iostream> #include <iostream>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <builtin_types.h> #include <builtin_types.h>
#include <vector_functions.h> #include <vector_functions.h>
using namespace std;
#define RTERROR(status, s) \ #define RTERROR(status, s) \
if (status != cudaSuccess) { \ if (status != cudaSuccess) { \
...@@ -122,7 +123,7 @@ CUDAStream<T>::~CUDAStream() ...@@ -122,7 +123,7 @@ CUDAStream<T>::~CUDAStream()
template <typename T> template <typename T>
void CUDAStream<T>::Allocate() void CUDAStream<T>::Allocate()
{ {
cudaError_t status; cudaError_t status;
_pSysStream = new T*[_subStreams]; _pSysStream = new T*[_subStreams];
_pDevStream = new T*[_subStreams]; _pDevStream = new T*[_subStreams];
_pSysData = new T[_subStreams * _stride]; _pSysData = new T[_subStreams * _stride];
...@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384; ...@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220; static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256; static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
enum CudaNonbondedMethod
{
NO_CUTOFF,
CUTOFF,
PERIODIC
};
struct cudaGmxSimulation { struct cudaGmxSimulation {
// Constants // Constants
...@@ -236,6 +243,7 @@ struct cudaGmxSimulation { ...@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
unsigned int blocks; // Number of blocks to launch across linear kernels unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1 unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2 unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles
unsigned int threads_per_block; // Threads per block to launch unsigned int threads_per_block; // Threads per block to launch
unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls
unsigned int bornForce2_threads_per_block; // Threads per block in nonbond kernel calls unsigned int bornForce2_threads_per_block; // Threads per block in nonbond kernel calls
...@@ -245,12 +253,17 @@ struct cudaGmxSimulation { ...@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls
unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls
unsigned int shake_threads_per_block; // Threads per block in shake kernel calls unsigned int shake_threads_per_block; // Threads per block in shake kernel calls
unsigned int settle_threads_per_block; // Threads per block in SETTLE kernel calls
unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call
unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int random_threads_per_block; // Threads per block in RNG kernel calls unsigned int random_threads_per_block; // Threads per block in RNG kernel calls
unsigned int interaction_threads_per_block; // Threads per block when identifying interacting tiles
unsigned int workUnits; // Number of work units unsigned int workUnits; // Number of work units
unsigned int* pWorkUnit; // Pointer to work units unsigned int* pWorkUnit; // Pointer to work units
unsigned int* pInteractingWorkUnit; // Pointer to work units that have interactions
unsigned int* pInteractionFlag; // Flags for which work units have interactions
size_t* pInteractionCount; // A count of the number of work units which have interactions
unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1 unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation
unsigned int workUnitsPerSM; // Number of workblocks per SM unsigned int workUnitsPerSM; // Number of workblocks per SM
...@@ -270,6 +283,12 @@ struct cudaGmxSimulation { ...@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
unsigned int outputBuffers; // Number of output buffers unsigned int outputBuffers; // Number of output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations float epsfac; // Epsilon factor for CDLJ calculations
CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
float nonbondedCutoffSqr; // Cutoff distance for CDLJ calculations
float periodicBoxSizeX; // The X dimension of the periodic box
float periodicBoxSizeY; // The Y dimension of the periodic box
float periodicBoxSizeZ; // The Z dimension of the periodic box
float reactionFieldK; // Constant for reaction field correction
float probeRadius; // SASA probe radius float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor float surfaceAreaFactor; // ACE approximation surface area factor
float electricConstant; // ACE approximation electric constant float electricConstant; // ACE approximation electric constant
...@@ -326,6 +345,7 @@ struct cudaGmxSimulation { ...@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
float4* pLJ14Parameter; // Lennard Jones 1-4 parameters float4* pLJ14Parameter; // Lennard Jones 1-4 parameters
float inverseTotalMass; // Used in linear momentum removal float inverseTotalMass; // Used in linear momentum removal
unsigned int ShakeConstraints; // Total number of Shake constraints unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int settleConstraints; // Total number of Settle constraints
unsigned int NonShakeConstraints; // Total number of NonShake atoms unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system unsigned int degreesOfFreedom; // Number of degrees of freedom in system
...@@ -334,12 +354,17 @@ struct cudaGmxSimulation { ...@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
int* pNonShakeID; // Not Shaking atoms int* pNonShakeID; // Not Shaking atoms
int4* pShakeID; // Shake atoms and phase int4* pShakeID; // Shake atoms and phase
float4* pShakeParameter; // Shake parameters float4* pShakeParameter; // Shake parameters
int4* pSettleID; // Settle atoms
float2* pSettleParameter; // Settle parameters
unsigned int* pExclusion; // Nonbond exclusion data unsigned int* pExclusion; // Nonbond exclusion data
unsigned int bond_offset; // Offset to end of bonds unsigned int bond_offset; // Offset to end of bonds
unsigned int bond_angle_offset; // Offset to end of bond angles unsigned int bond_angle_offset; // Offset to end of bond angles
unsigned int dihedral_offset; // Offset to end of dihedrals unsigned int dihedral_offset; // Offset to end of dihedrals
unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals
unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters
int* pAtomIndex; // The original index of each atom
float4* pGridBoundingBox; // The size of each grid cell
float4* pGridCenter; // The center of each grid cell
// Mutable stuff // Mutable stuff
float4* pPosq; // Pointer to atom positions and charges float4* pPosq; // Pointer to atom positions and charges
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <cuda.h> #include <cuda.h>
#include <vector_functions.h> #include <vector_functions.h>
#include <cstdlib> #include <cstdlib>
...@@ -40,6 +41,7 @@ ...@@ -40,6 +41,7 @@
#include <ctime> #include <ctime>
#include <cmath> #include <cmath>
#include <map> #include <map>
#include <algorithm>
#ifdef WIN32 #ifdef WIN32
#include <windows.h> #include <windows.h>
#else #else
...@@ -148,6 +150,23 @@ struct ShakeCluster { ...@@ -148,6 +150,23 @@ struct ShakeCluster {
} }
}; };
struct Constraint
{
Constraint(int atom1, int atom2, float distance2) : atom1(atom1), atom2(atom2), distance2(distance2) {
}
int atom1, atom2;
float distance2;
};
struct Molecule {
vector<int> atoms;
vector<int> bonds;
vector<int> angles;
vector<int> periodicTorsions;
vector<int> rbTorsions;
vector<int> constraints;
};
static const float dielectricOffset = 0.009f; static const float dielectricOffset = 0.009f;
static const float PI = 3.1415926535f; static const float PI = 3.1415926535f;
static const float probeRadius = 0.14f; static const float probeRadius = 0.14f;
...@@ -793,7 +812,7 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname) ...@@ -793,7 +812,7 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
} }
} }
cout << total_exclusions << " total exclusions.\n"; cout << total_exclusions << " total exclusions.\n";
gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions); gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions, NO_CUTOFF);
gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale); gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
return coulombs; return coulombs;
} }
...@@ -807,11 +826,12 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname) ...@@ -807,11 +826,12 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
extern "C" extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q, void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
const vector<char>& symbol, const vector<vector<int> >& exclusions) const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
{ {
unsigned int coulombs = atom.size(); unsigned int coulombs = atom.size();
gpu->sim.epsfac = epsfac; gpu->sim.epsfac = epsfac;
unsigned int total_exclusions = 0; gpu->sim.nonbondedMethod = method;
gpu->exclusions = exclusions;
for (unsigned int i = 0; i < coulombs; i++) for (unsigned int i = 0; i < coulombs; i++)
{ {
...@@ -827,34 +847,6 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at ...@@ -827,34 +847,6 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu->psPosq4->_pSysStream[0][i].w = p0; gpu->psPosq4->_pSysStream[0][i].w = p0;
gpu->psSigEps2->_pSysStream[0][i].x = p1; gpu->psSigEps2->_pSysStream[0][i].x = p1;
gpu->psSigEps2->_pSysStream[0][i].y = p2; gpu->psSigEps2->_pSysStream[0][i].y = p2;
#if (DUMP_PARAMETERS == 1)
cout <<
i << " " <<
gpu->psPosq4->_pSysStream[0][i].w << " " <<
gpu->psSigEps2->_pSysStream[0][i].x << " " <<
gpu->psSigEps2->_pSysStream[0][i].y << " " <<
p0 << " " <<
p1 << " " <<
p2 << " " <<
exclusions;
#endif
for (int j = 0; j < (int) exclusions[i].size(); j++)
{
#if (DUMP_PARAMETERS == 1)
cout << " " << exclusions[i][j];
#endif
gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
if (i >= (int) exclusions[i][j])
{
total_exclusions++;
}
}
#if (DUMP_PARAMETERS == 1)
cout << endl;
#endif
} }
// Dummy out extra atom data // Dummy out extra atom data
...@@ -868,36 +860,31 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at ...@@ -868,36 +860,31 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu->psSigEps2->_pSysStream[0][i].y = 0.0f; gpu->psSigEps2->_pSysStream[0][i].y = 0.0f;
} }
// Add in remaining exclusions
for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
{
for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
{
gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
}
}
gpu->psPosq4->Upload(); gpu->psPosq4->Upload();
gpu->psSigEps2->Upload(); gpu->psSigEps2->Upload();
}
// Check for exclusion consistency extern "C"
for (unsigned int i = 0; i < coulombs; i++) void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric)
{ {
for (unsigned int j = i; j < coulombs; j++) gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
{ gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
}
if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i]) extern "C"
cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl; void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
} {
} gpu->sim.periodicBoxSizeX = xsize;
gpu->sim.periodicBoxSizeY = ysize;
gpu->sim.periodicBoxSizeZ = zsize;
} }
extern "C" extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, const vector<float>& scale) void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, const vector<float>& scale)
{ {
unsigned int atoms = atom.size(); unsigned int atoms = atom.size();
gpu->bIncludeGBSA = true;
for (unsigned int i = 0; i < atoms; i++) for (unsigned int i = 0; i < atoms; i++)
{ {
gpu->psObcData->_pSysStream[0][i].x = radius[i] - dielectricOffset; gpu->psObcData->_pSysStream[0][i].x = radius[i] - dielectricOffset;
...@@ -973,11 +960,91 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto ...@@ -973,11 +960,91 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
constraintCount[atom1[i]]++; constraintCount[atom1[i]]++;
constraintCount[atom2[i]]++; constraintCount[atom2[i]]++;
} }
// Identify clusters of three atoms that can be treated with SETTLE. First, for every
// atom that might be part of such a cluster, make a list of the two other atoms it is
// connected to.
vector<map<int, float> > settleConstraints(gpu->natoms);
for (int i = 0; i < atom1.size(); i++) {
if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
settleConstraints[atom1[i]][atom2[i]] = distance[i];
settleConstraints[atom2[i]][atom1[i]] = distance[i];
}
}
// Now remove the ones that don't actually form closed loops of three atoms.
vector<int> settleClusters;
for (int i = 0; i < settleConstraints.size(); i++) {
if (settleConstraints[i].size() == 2) {
int partner1 = settleConstraints[i].begin()->first;
int partner2 = (++settleConstraints[i].begin())->first;
if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
settleConstraints[i].clear();
else if (i < partner1 && i < partner2)
settleClusters.push_back(i);
}
else
settleConstraints[i].clear();
}
// Record the actual SETTLE clusters.
CUDAStream<int4>* psSettleID = new CUDAStream<int4>((int) settleClusters.size(), 1);
gpu->psSettleID = psSettleID;
gpu->sim.pSettleID = psSettleID->_pDevStream[0];
CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1);
gpu->psSettleParameter = psSettleParameter;
gpu->sim.pSettleParameter = psSettleParameter->_pDevStream[0];
gpu->sim.settleConstraints = settleClusters.size();
for (int i = 0; i < settleClusters.size(); i++) {
int atom1 = settleClusters[i];
int atom2 = settleConstraints[atom1].begin()->first;
int atom3 = (++settleConstraints[atom1].begin())->first;
float dist12 = settleConstraints[atom1].find(atom2)->second;
float dist13 = settleConstraints[atom1].find(atom3)->second;
float dist23 = settleConstraints[atom2].find(atom3)->second;
if (dist12 == dist13) { // atom1 is the central atom
psSettleID->_pSysData[i].x = atom1;
psSettleID->_pSysData[i].y = atom2;
psSettleID->_pSysData[i].z = atom3;
psSettleParameter->_pSysData[i].x = dist12;
psSettleParameter->_pSysData[i].y = dist23;
}
else if (dist12 == dist23) { // atom2 is the central atom
psSettleID->_pSysData[i].x = atom2;
psSettleID->_pSysData[i].y = atom1;
psSettleID->_pSysData[i].z = atom3;
psSettleParameter->_pSysData[i].x = dist12;
psSettleParameter->_pSysData[i].y = dist13;
}
else if (dist13 == dist23) { // atom3 is the central atom
psSettleID->_pSysData[i].x = atom3;
psSettleID->_pSysData[i].y = atom1;
psSettleID->_pSysData[i].z = atom2;
psSettleParameter->_pSysData[i].x = dist13;
psSettleParameter->_pSysData[i].y = dist12;
}
else
throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
}
psSettleID->Upload();
psSettleParameter->Upload();
gpu->sim.settle_threads_per_block = (gpu->sim.settleConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.settle_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.settle_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.settle_threads_per_block < 1)
gpu->sim.settle_threads_per_block = 1;
// Find clusters consisting of a central atom with up to three peripheral atoms. // Find clusters consisting of a central atom with up to three peripheral atoms.
map<int, ShakeCluster> clusters; map<int, ShakeCluster> clusters;
for (int i = 0; i < atom1.size(); i++) { for (int i = 0; i < atom1.size(); i++) {
if (settleConstraints[atom1[i]].size() == 2)
continue; // This is being taken care of with SETTLE.
// Determine which is the central atom. // Determine which is the central atom.
bool firstIsCentral; bool firstIsCentral;
...@@ -1096,9 +1163,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu) ...@@ -1096,9 +1163,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6; gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6;
gpu->gpAtomTable = NULL; gpu->gpAtomTable = NULL;
gpu->gAtomTypes = 0; gpu->gAtomTypes = 0;
gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
gpu->sim.totalNonbondOutputBuffers = 2 * gpu->sim.nonbondOutputBuffers;
gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1); gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.stride = gpu->psPosq4->_stride; gpu->sim.stride = gpu->psPosq4->_stride;
gpu->sim.stride2 = gpu->sim.stride * 2; gpu->sim.stride2 = gpu->sim.stride * 2;
...@@ -1129,10 +1193,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu) ...@@ -1129,10 +1193,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1); gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pObcData = gpu->psObcData->_pDevStream[0]; gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];
gpu->pAtomSymbol = new unsigned char[gpu->natoms]; gpu->pAtomSymbol = new unsigned char[gpu->natoms];
gpu->psAtomIndex = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pAtomIndex = gpu->psAtomIndex->_pDevStream[0];
for (int i = 0; i < (int) gpu->sim.paddedNumberOfAtoms; i++)
gpu->psAtomIndex->_pSysStream[0][i] = i;
gpu->psAtomIndex->Upload();
// Determine randoms // Determine randoms
gpu->seed = (unsigned long)time(NULL) & 0x000fffff; gpu->seed = (unsigned long)time(NULL) & 0x000fffff;
gpu->sim.randomFrames = 995; gpu->sim.randomFrames = 95;
gpu->sim.randomIterations = gpu->sim.randomFrames; gpu->sim.randomIterations = gpu->sim.randomFrames;
gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID; gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms; gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
...@@ -1351,8 +1419,8 @@ void* gpuInit(int numAtoms) ...@@ -1351,8 +1419,8 @@ void* gpuInit(int numAtoms)
{ {
sscanf(pAdapter, "%d", &device); sscanf(pAdapter, "%d", &device);
} }
cudaError_t status = cudaSetDevice(device); // cudaError_t status = cudaSetDevice(device);
RTERROR(status, "Error setting CUDA device") // RTERROR(status, "Error setting CUDA device")
// Determine which core to run on // Determine which core to run on
#if 0 #if 0
...@@ -1445,6 +1513,8 @@ void* gpuInit(int numAtoms) ...@@ -1445,6 +1513,8 @@ void* gpuInit(int numAtoms)
gpu->sim.probeRadius = probeRadius; gpu->sim.probeRadius = probeRadius;
gpu->sim.surfaceAreaFactor = surfaceAreaFactor; gpu->sim.surfaceAreaFactor = surfaceAreaFactor;
gpu->sim.electricConstant = electricConstant; gpu->sim.electricConstant = electricConstant;
gpu->sim.nonbondedMethod = NO_CUTOFF;
gpu->sim.nonbondedCutoffSqr = 0.0f;
gpu->sim.bigFloat = 99999999.0f; gpu->sim.bigFloat = 99999999.0f;
gpu->sim.forceConversionFactor = forceConversionFactor; gpu->sim.forceConversionFactor = forceConversionFactor;
...@@ -1461,6 +1531,7 @@ void* gpuInit(int numAtoms) ...@@ -1461,6 +1531,7 @@ void* gpuInit(int numAtoms)
gpu->bCalculateCM = false; gpu->bCalculateCM = false;
gpu->bRemoveCM = false; gpu->bRemoveCM = false;
gpu->bRecalculateBornRadii = true; gpu->bRecalculateBornRadii = true;
gpu->bIncludeGBSA = false;
gpuInitializeRandoms(gpu); gpuInitializeRandoms(gpu);
// To be determined later // To be determined later
...@@ -1489,18 +1560,20 @@ void* gpuInit(int numAtoms) ...@@ -1489,18 +1560,20 @@ void* gpuInit(int numAtoms)
gpu->psLJ14Parameter = NULL; gpu->psLJ14Parameter = NULL;
gpu->psShakeID = NULL; gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL; gpu->psShakeParameter = NULL;
gpu->psSettleID = NULL;
gpu->psSettleParameter = NULL;
gpu->psExclusion = NULL; gpu->psExclusion = NULL;
gpu->psWorkUnit = NULL; gpu->psWorkUnit = NULL;
gpu->psInteractingWorkUnit = NULL;
gpu->psInteractionFlag = NULL;
gpu->psInteractionCount = NULL;
gpu->psGridBoundingBox = NULL;
gpu->psGridCenter = NULL;
// Initialize output buffer before reading parameters // Initialize output buffer before reading parameters
gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms]; gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms];
memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int)); memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
// Initialize exclusion array
gpu->pExclusion = new unsigned int[gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms];
for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms; i++)
gpu->pExclusion[i] = 1;
return (void*)gpu; return (void*)gpu;
} }
...@@ -1586,7 +1659,6 @@ void gpuShutDown(gpuContext gpu) ...@@ -1586,7 +1659,6 @@ void gpuShutDown(gpuContext gpu)
{ {
// Delete sysmem pointers // Delete sysmem pointers
delete[] gpu->pOutputBufferCounter; delete[] gpu->pOutputBufferCounter;
delete[] gpu->pExclusion;
delete[] gpu->gpAtomTable; delete[] gpu->gpAtomTable;
delete[] gpu->pAtomSymbol; delete[] gpu->pAtomSymbol;
...@@ -1620,13 +1692,23 @@ void gpuShutDown(gpuContext gpu) ...@@ -1620,13 +1692,23 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psLJ14Parameter; delete gpu->psLJ14Parameter;
delete gpu->psShakeID; delete gpu->psShakeID;
delete gpu->psShakeParameter; delete gpu->psShakeParameter;
delete gpu->psSettleID;
delete gpu->psSettleParameter;
delete gpu->psExclusion; delete gpu->psExclusion;
delete gpu->psWorkUnit; delete gpu->psWorkUnit;
delete gpu->psInteractingWorkUnit;
delete gpu->psInteractionFlag;
delete gpu->psInteractionCount;
delete gpu->psRandom4; delete gpu->psRandom4;
delete gpu->psRandom2; delete gpu->psRandom2;
delete gpu->psRandomPosition; delete gpu->psRandomPosition;
delete gpu->psRandomSeed; delete gpu->psRandomSeed;
delete gpu->psLinearMomentum; delete gpu->psLinearMomentum;
delete gpu->psAtomIndex;
delete gpu->psGridBoundingBox;
delete gpu->psGridCenter;
if (gpu->cudpp != 0)
cudppDestroyPlan(gpu->cudpp);
// Wrap up // Wrap up
delete gpu; delete gpu;
...@@ -1636,6 +1718,19 @@ void gpuShutDown(gpuContext gpu) ...@@ -1636,6 +1718,19 @@ void gpuShutDown(gpuContext gpu)
extern "C" extern "C"
int gpuBuildOutputBuffers(gpuContext gpu) int gpuBuildOutputBuffers(gpuContext gpu)
{ {
// Select the number of output buffer to use.
gpu->bOutputBufferPerWarp = true;
gpu->sim.nonbondOutputBuffers = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block / GRID;
if (gpu->sim.nonbondOutputBuffers >= gpu->sim.paddedNumberOfAtoms/GRID)
{
// For small systems, it is more efficient to have one output buffer per block of 32 atoms instead of one per warp.
gpu->bOutputBufferPerWarp = false;
gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
}
gpu->sim.totalNonbondOutputBuffers = (gpu->bIncludeGBSA ? 2 * gpu->sim.nonbondOutputBuffers : gpu->sim.nonbondOutputBuffers);
gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
unsigned int outputBuffers = gpu->sim.totalNonbondOutputBuffers; unsigned int outputBuffers = gpu->sim.totalNonbondOutputBuffers;
for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++) for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++)
{ {
...@@ -1715,30 +1810,59 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu) ...@@ -1715,30 +1810,59 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
const unsigned int grid = gpu->grid; const unsigned int grid = gpu->grid;
const unsigned int dim = (atoms + (grid - 1)) / grid; const unsigned int dim = (atoms + (grid - 1)) / grid;
const unsigned int cells = dim * (dim + 1) / 2; const unsigned int cells = dim * (dim + 1) / 2;
const unsigned int* pExclusion = gpu->pExclusion;
CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u); CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
unsigned int* pWorkList = psWorkUnit->_pSysStream[0]; unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
gpu->psWorkUnit = psWorkUnit; gpu->psWorkUnit = psWorkUnit;
gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0]; gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
CUDAStream<unsigned int>* psInteractingWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
gpu->psInteractingWorkUnit = psInteractingWorkUnit;
gpu->sim.pInteractingWorkUnit = psInteractingWorkUnit->_pDevStream[0];
CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u);
gpu->psInteractionFlag = psInteractionFlag;
gpu->sim.pInteractionFlag = psInteractionFlag->_pDevStream[0];
CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u);
gpu->psInteractionCount = psInteractionCount;
gpu->sim.pInteractionCount = psInteractionCount->_pDevStream[0];
CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u);
gpu->psGridBoundingBox = psGridBoundingBox;
gpu->sim.pGridBoundingBox = psGridBoundingBox->_pDevStream[0];
CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u);
gpu->psGridCenter = psGridCenter;
gpu->sim.pGridCenter = psGridCenter->_pDevStream[0];
gpu->sim.nonbond_workBlock = gpu->sim.nonbond_threads_per_block / GRID; gpu->sim.nonbond_workBlock = gpu->sim.nonbond_threads_per_block / GRID;
gpu->sim.bornForce2_workBlock = gpu->sim.bornForce2_threads_per_block / GRID; gpu->sim.bornForce2_workBlock = gpu->sim.bornForce2_threads_per_block / GRID;
gpu->sim.workUnits = cells; gpu->sim.workUnits = cells;
// Increase block count if necessary for extra large molecules that would // Initialize the CUDPP workspace.
// otherwise overflow the SM workunit buffers gpu->cudpp = 0;
int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM; CUDPPConfiguration config;
if ((int) gpu->sim.nonbond_blocks < minimumBlocks) config.datatype = CUDPP_UINT;
config.algorithm = CUDPP_COMPACT;
config.options = CUDPP_OPTION_FORWARD;
CUDPPResult result = cudppPlan(&gpu->cudpp, config, cells, 1, 0);
if (CUDPP_SUCCESS != result)
{ {
gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks); printf("Error initializing CUDPP: %d\n", result);
} exit(-1);
if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
{
gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
} }
// Increase block count if necessary for extra large molecules that would
// otherwise overflow the SM workunit buffers
// int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
// if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
// {
// gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
// }
// if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
// {
// gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
// }
gpu->sim.nbWorkUnitsPerBlock = cells / gpu->sim.nonbond_blocks; gpu->sim.nbWorkUnitsPerBlock = cells / gpu->sim.nonbond_blocks;
gpu->sim.nbWorkUnitsPerBlockRemainder = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock; gpu->sim.nbWorkUnitsPerBlockRemainder = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
gpu->sim.bf2WorkUnitsPerBlock = cells / gpu->sim.bornForce2_blocks; gpu->sim.bf2WorkUnitsPerBlock = cells / gpu->sim.bornForce2_blocks;
gpu->sim.bf2WorkUnitsPerBlockRemainder = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock; gpu->sim.bf2WorkUnitsPerBlockRemainder = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;
gpu->sim.interaction_threads_per_block = 64;
gpu->sim.interaction_blocks = (gpu->sim.workUnits + gpu->sim.interaction_threads_per_block - 1) / gpu->sim.interaction_threads_per_block;
// Decrease thread count for extra small molecules to spread computation // Decrease thread count for extra small molecules to spread computation
// across entire chip // across entire chip
...@@ -1763,23 +1887,6 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu) ...@@ -1763,23 +1887,6 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
for (unsigned int x = y; x < dim; x++) for (unsigned int x = y; x < dim; x++)
{ {
pWorkList[count] = (x << 17) | (y << 2); pWorkList[count] = (x << 17) | (y << 2);
// Check for exclusions
int exclusions = 0;
for (unsigned int i = y * grid; i < y * grid + grid; i++)
{
for (unsigned int j = x * grid; j < x * grid + grid; j++)
{
if (!pExclusion[i * atoms + j])
{
exclusions++;
}
}
}
// Signal exclusions if they exist
if (exclusions > 0)
pWorkList[count] |= 0x1;
count++; count++;
} }
} }
...@@ -1790,42 +1897,58 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu) ...@@ -1790,42 +1897,58 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
} }
extern "C" extern "C"
int gpuBuildExclusionList(gpuContext gpu) void gpuBuildExclusionList(gpuContext gpu)
{ {
unsigned int atoms = gpu->sim.paddedNumberOfAtoms; const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u); const unsigned int grid = gpu->grid;
const unsigned int dim = (atoms+(grid-1))/grid;
CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>((atoms*atoms+grid-1) / grid, 1u);
gpu->psExclusion = psExclusion; gpu->psExclusion = psExclusion;
gpu->sim.pExclusion = psExclusion->_pDevStream[0]; gpu->sim.pExclusion = psExclusion->_pDevStream[0];
unsigned int* pExList = psExclusion->_pSysStream[0]; unsigned int* pExList = psExclusion->_pSysStream[0];
int exclusions = 0; unsigned int* pWorkList = gpu->psWorkUnit->_pSysStream[0];
unsigned int pos = 0; for (int i = 0; i < psExclusion->_length; ++i)
pExList[i] = 0xFFFFFFFF;
// Fill in the exclusions.
for (unsigned int x = 0; x < atoms; x += gpu->grid) for (int atom1 = 0; atom1 < gpu->exclusions.size(); ++atom1)
{ {
for (unsigned int y = 0; y < atoms; y += gpu->grid) int x = atom1/grid;
{ int offset = atom1-x*grid;
for (unsigned x1 = x; x1 < x + gpu->grid; x1++) for (int j = 0; j < gpu->exclusions[atom1].size(); ++j)
{ {
unsigned int mask = 0; int atom2 = gpu->exclusions[atom1][j];
for (unsigned int y1 = y ; y1 < y + gpu->grid; y1++) int y = atom2/grid;
{ int index = x*atoms+y*grid+offset;
mask >>= 1; pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
if (gpu->pExclusion[x1 * atoms + y1] == 0) int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
{ pWorkList[cell] |= 1;
if (x1 >= y1) }
exclusions++; }
}
else // Mark all interactions that involve a padding atom as being excluded.
mask |= 0x80000000;
} for (int atom1 = gpu->natoms; atom1 < atoms; ++atom1)
pExList[pos++] = mask; {
} int x = atom1/grid;
int offset1 = atom1-x*grid;
for (int atom2 = 0; atom2 < atoms; ++atom2)
{
int y = atom2/grid;
int index = x*atoms+y*grid+offset1;
pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
int offset2 = atom2-y*grid;
index = y*atoms+x*grid+offset2;
pExList[index] &= 0xFFFFFFFF-(1<<(atom1-x*grid));
int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
pWorkList[cell] |= 1;
} }
} }
psExclusion->Upload(); psExclusion->Upload();
gpu->psWorkUnit->Upload();
gpuSetConstants(gpu); gpuSetConstants(gpu);
return exclusions;
} }
extern "C" extern "C"
...@@ -1842,14 +1965,12 @@ int gpuSetConstants(gpuContext gpu) ...@@ -1842,14 +1965,12 @@ int gpuSetConstants(gpuContext gpu)
SetUpdateShakeHSim(gpu); SetUpdateShakeHSim(gpu);
SetVerletUpdateSim(gpu); SetVerletUpdateSim(gpu);
SetBrownianUpdateSim(gpu); SetBrownianUpdateSim(gpu);
SetSettleSim(gpu);
SetRandomSim(gpu); SetRandomSim(gpu);
if (gpu->sm_version >= SM_12) if (gpu->sm_version >= SM_12)
{ {
SetCalculateCDLJForces_12Sim(gpu);
SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
SetCalculateObcGbsaForces1_12Sim(gpu); SetCalculateObcGbsaForces1_12Sim(gpu);
SetCalculateObcGbsaForces2_12Sim(gpu);
} }
return 1; return 1;
...@@ -2705,3 +2826,329 @@ void gpuDumpObcLoop1(gpuContext gpu) ...@@ -2705,3 +2826,329 @@ void gpuDumpObcLoop1(gpuContext gpu)
); );
} }
} }
static void tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds)
{
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule[atom] = molecule;
for (int i = 0; i < atomBonds[atom].size(); i++)
if (atomMolecule[atomBonds[atom][i]] == -1)
tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
}
static void findMoleculeGroups(gpuContext gpu)
{
// First make a list of constraints for future use.
vector<Constraint> constraints;
for (int i = 0; i < gpu->sim.ShakeConstraints; i++)
{
int atom1 = gpu->psShakeID->_pSysData[i].x;
int atom2 = gpu->psShakeID->_pSysData[i].y;
int atom3 = gpu->psShakeID->_pSysData[i].z;
int atom4 = gpu->psShakeID->_pSysData[i].w;
float distance2 = gpu->psShakeParameter->_pSysData[i].z;
constraints.push_back(Constraint(atom1, atom2, distance2));
if (atom3 != -1)
constraints.push_back(Constraint(atom1, atom3, distance2));
if (atom4 != -1)
constraints.push_back(Constraint(atom1, atom3, distance2));
}
for (int i = 0; i < gpu->sim.settleConstraints; i++)
{
int atom1 = gpu->psSettleID->_pSysData[i].x;
int atom2 = gpu->psSettleID->_pSysData[i].y;
int atom3 = gpu->psSettleID->_pSysData[i].z;
float distance12 = gpu->psSettleParameter->_pSysData[i].x;
float distance23 = gpu->psSettleParameter->_pSysData[i].y;
constraints.push_back(Constraint(atom1, atom2, distance12*distance12));
constraints.push_back(Constraint(atom1, atom3, distance12*distance12));
constraints.push_back(Constraint(atom2, atom3, distance23*distance23));
}
// First make a list of every other atom to which each atom is connect by a bond or constraint.
int numAtoms = gpu->natoms;
vector<vector<int> > atomBonds(numAtoms);
for (int i = 0; i < gpu->sim.bonds; i++)
{
int atom1 = gpu->psBondID->_pSysData[i].x;
int atom2 = gpu->psBondID->_pSysData[i].y;
atomBonds[atom1].push_back(atom2);
atomBonds[atom2].push_back(atom1);
}
for (int i = 0; i < constraints.size(); i++)
{
int atom1 = constraints[i].atom1;
int atom2 = constraints[i].atom2;
atomBonds[atom1].push_back(atom2);
atomBonds[atom2].push_back(atom1);
}
// Now tag atoms by which molecule they belong to.
vector<int> atomMolecule(numAtoms, -1);
int numMolecules = 0;
for (int i = 0; i < numAtoms; i++)
if (atomMolecule[i] == -1)
tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
vector<vector<int> > atomIndices(numMolecules);
for (int i = 0; i < numAtoms; i++)
atomIndices[atomMolecule[i]].push_back(i);
// Construct a description of each molecule.
vector<Molecule> molecules(numMolecules);
for (int i = 0; i < numMolecules; i++)
molecules[i].atoms = atomIndices[i];
for (int i = 0; i < gpu->sim.bonds; i++)
{
int atom1 = gpu->psBondID->_pSysData[i].x;
molecules[atomMolecule[atom1]].bonds.push_back(i);
}
for (int i = 0; i < gpu->sim.bond_angles; i++)
{
int atom1 = gpu->psBondAngleID1->_pSysData[i].x;
molecules[atomMolecule[atom1]].angles.push_back(i);
}
for (int i = 0; i < gpu->sim.dihedrals; i++)
{
int atom1 = gpu->psDihedralID1->_pSysData[i].x;
molecules[atomMolecule[atom1]].periodicTorsions.push_back(i);
}
for (int i = 0; i < gpu->sim.rb_dihedrals; i++)
{
int atom1 = gpu->psRbDihedralID1->_pSysData[i].x;
molecules[atomMolecule[atom1]].rbTorsions.push_back(i);
}
for (int i = 0; i < constraints.size(); i++)
{
molecules[atomMolecule[constraints[i].atom1]].constraints.push_back(i);
}
// Sort them into groups of identical molecules.
vector<Molecule> uniqueMolecules;
vector<vector<int> > moleculeInstances;
for (int molIndex = 0; molIndex < molecules.size(); molIndex++)
{
Molecule& mol = molecules[molIndex];
// See if it is identical to another molecule.
bool isNew = true;
for (int j = 0; j < uniqueMolecules.size() && isNew; j++)
{
Molecule& mol2 = uniqueMolecules[j];
bool identical = true;
if (mol.atoms.size() != mol2.atoms.size() || mol.bonds.size() != mol2.bonds.size()
|| mol.angles.size() != mol2.angles.size() || mol.periodicTorsions.size() != mol2.periodicTorsions.size()
|| mol.rbTorsions.size() != mol2.rbTorsions.size() || mol.constraints.size() != mol2.constraints.size())
identical = false;
int atomOffset = mol2.atoms[0]-mol.atoms[0];
float4* posq = gpu->psPosq4->_pSysData;
float4* velm = gpu->psVelm4->_pSysData;
float2* sigeps = gpu->psSigEps2->_pSysData;
for (int i = 0; i < mol.atoms.size() && identical; i++)
if (mol.atoms[i] != mol2.atoms[i]-atomOffset || posq[mol.atoms[i]].w != posq[mol2.atoms[i]].w ||
velm[mol.atoms[i]].w != velm[mol2.atoms[i]].w || sigeps[mol.atoms[i]].x != sigeps[mol2.atoms[i]].x ||
sigeps[mol.atoms[i]].y != sigeps[mol2.atoms[i]].y)
identical = false;
int4* bondID = gpu->psBondID->_pSysData;
float2* bondParam = gpu->psBondParameter->_pSysData;
for (int i = 0; i < mol.bonds.size() && identical; i++)
if (bondID[mol.bonds[i]].x != bondID[mol2.bonds[i]].x-atomOffset || bondID[mol.bonds[i]].y != bondID[mol2.bonds[i]].y-atomOffset ||
bondParam[mol.bonds[i]].x != bondParam[mol2.bonds[i]].x || bondParam[mol.bonds[i]].y != bondParam[mol2.bonds[i]].y)
identical = false;
int4* angleID = gpu->psBondAngleID1->_pSysData;
float2* angleParam = gpu->psBondAngleParameter->_pSysData;
for (int i = 0; i < mol.angles.size() && identical; i++)
if (angleID[mol.angles[i]].x != angleID[mol2.angles[i]].x-atomOffset ||
angleID[mol.angles[i]].y != angleID[mol2.angles[i]].y-atomOffset ||
angleID[mol.angles[i]].z != angleID[mol2.angles[i]].z-atomOffset ||
angleParam[mol.angles[i]].x != angleParam[mol2.angles[i]].x ||
angleParam[mol.angles[i]].y != angleParam[mol2.angles[i]].y)
identical = false;
int4* periodicID = gpu->psDihedralID1->_pSysData;
float4* periodicParam = gpu->psDihedralParameter->_pSysData;
for (int i = 0; i < mol.periodicTorsions.size() && identical; i++)
if (periodicID[mol.periodicTorsions[i]].x != periodicID[mol2.periodicTorsions[i]].x-atomOffset ||
periodicID[mol.periodicTorsions[i]].y != periodicID[mol2.periodicTorsions[i]].y-atomOffset ||
periodicID[mol.periodicTorsions[i]].z != periodicID[mol2.periodicTorsions[i]].z-atomOffset ||
periodicID[mol.periodicTorsions[i]].w != periodicID[mol2.periodicTorsions[i]].w-atomOffset ||
periodicParam[mol.periodicTorsions[i]].x != periodicParam[mol2.periodicTorsions[i]].x ||
periodicParam[mol.periodicTorsions[i]].y != periodicParam[mol2.periodicTorsions[i]].y ||
periodicParam[mol.periodicTorsions[i]].z != periodicParam[mol2.periodicTorsions[i]].z)
identical = false;
int4* rbID = gpu->psRbDihedralID1->_pSysData;
float4* rbParam1 = gpu->psRbDihedralParameter1->_pSysData;
float2* rbParam2 = gpu->psRbDihedralParameter2->_pSysData;
for (int i = 0; i < mol.rbTorsions.size() && identical; i++)
if (rbID[mol.rbTorsions[i]].x != rbID[mol2.rbTorsions[i]].x-atomOffset ||
rbID[mol.rbTorsions[i]].y != rbID[mol2.rbTorsions[i]].y-atomOffset ||
rbID[mol.rbTorsions[i]].z != rbID[mol2.rbTorsions[i]].z-atomOffset ||
rbID[mol.rbTorsions[i]].w != rbID[mol2.rbTorsions[i]].w-atomOffset ||
rbParam1[mol.rbTorsions[i]].x != rbParam1[mol2.rbTorsions[i]].x ||
rbParam1[mol.rbTorsions[i]].y != rbParam1[mol2.rbTorsions[i]].y ||
rbParam1[mol.rbTorsions[i]].z != rbParam1[mol2.rbTorsions[i]].z ||
rbParam1[mol.rbTorsions[i]].w != rbParam1[mol2.rbTorsions[i]].w ||
rbParam2[mol.rbTorsions[i]].x != rbParam2[mol2.rbTorsions[i]].x ||
rbParam2[mol.rbTorsions[i]].y != rbParam2[mol2.rbTorsions[i]].y)
identical = false;
for (int i = 0; i < mol.constraints.size() && identical; i++)
if (constraints[mol.constraints[i]].atom1 != constraints[mol2.constraints[i]].atom1-atomOffset ||
constraints[mol.constraints[i]].atom2 != constraints[mol2.constraints[i]].atom2-atomOffset ||
constraints[mol.constraints[i]].distance2 != constraints[mol2.constraints[i]].distance2)
identical = false;
if (identical)
{
moleculeInstances[j].push_back(mol.atoms[0]);
isNew = false;
}
}
if (isNew)
{
uniqueMolecules.push_back(mol);
moleculeInstances.push_back(vector<int>());
moleculeInstances[moleculeInstances.size()-1].push_back(mol.atoms[0]);
}
}
gpu->moleculeGroups.resize(moleculeInstances.size());
for (int i = 0; i < moleculeInstances.size(); i++)
{
gpu->moleculeGroups[i].instances = moleculeInstances[i];
vector<int>& atoms = uniqueMolecules[i].atoms;
gpu->moleculeGroups[i].atoms.resize(atoms.size());
for (int j = 0; j < atoms.size(); j++)
gpu->moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
}
}
extern "C"
void gpuReorderAtoms(gpuContext gpu)
{
if (gpu->natoms == 0 || gpu->sim.nonbondedCutoffSqr == 0.0)
return;
if (gpu->moleculeGroups.size() == 0)
findMoleculeGroups(gpu);
// Find the range of positions and the number of bins along each axis.
int numAtoms = gpu->natoms;
gpu->psPosq4->Download();
gpu->psVelm4->Download();
float4* posq = gpu->psPosq4->_pSysData;
float4* velm = gpu->psVelm4->_pSysData;
float minx = posq[0].x, maxx = posq[0].x;
float miny = posq[0].y, maxy = posq[0].y;
float minz = posq[0].z, maxz = posq[0].z;
if (gpu->sim.nonbondedMethod == PERIODIC)
{
minx = miny = minz = 0.0;
maxx = gpu->sim.periodicBoxSizeX;
maxy = gpu->sim.periodicBoxSizeY;
maxz = gpu->sim.periodicBoxSizeZ;
}
else
{
for (int i = 1; i < numAtoms; i++)
{
minx = min(minx, posq[i].x);
maxx = max(maxx, posq[i].x);
miny = min(miny, posq[i].y);
maxy = max(maxy, posq[i].y);
minz = min(minz, posq[i].z);
maxz = max(maxz, posq[i].z);
}
}
float binWidth = 0.2*sqrt(gpu->sim.nonbondedCutoffSqr);
int xbins = 1 + (int) ((maxx-minx)/binWidth);
int ybins = 1 + (int) ((maxy-miny)/binWidth);
int zbins = 1 + (int) ((maxz-minz)/binWidth);
// Loop over each group of identical molecules and reorder them.
vector<int> originalIndex(numAtoms);
vector<float4> newPosq(numAtoms);
vector<float4> newVelm(numAtoms);
for (int group = 0; group < gpu->moleculeGroups.size(); group++)
{
// Find the center of each molecule.
gpuMoleculeGroup& mol = gpu->moleculeGroups[group];
int numMolecules = mol.instances.size();
vector<int>& atoms = mol.atoms;
vector<float3> molPos(numMolecules);
for (int i = 0; i < numMolecules; i++)
{
molPos[i].x = 0.0f;
molPos[i].y = 0.0f;
molPos[i].z = 0.0f;
for (int j = 0; j < atoms.size(); j++)
{
int atom = atoms[j]+mol.instances[i];
molPos[i].x += posq[atom].x;
molPos[i].y += posq[atom].y;
molPos[i].z += posq[atom].z;
}
molPos[i].x /= atoms.size();
molPos[i].y /= atoms.size();
molPos[i].z /= atoms.size();
}
if (gpu->sim.nonbondedMethod == PERIODIC)
{
// Move each molecule position into the same box.
for (int i = 0; i < numMolecules; i++)
{
molPos[i].x -= floor(molPos[i].x/gpu->sim.periodicBoxSizeX)*gpu->sim.periodicBoxSizeX;
molPos[i].y -= floor(molPos[i].y/gpu->sim.periodicBoxSizeY)*gpu->sim.periodicBoxSizeY;
molPos[i].z -= floor(molPos[i].z/gpu->sim.periodicBoxSizeZ)*gpu->sim.periodicBoxSizeZ;
}
}
// Select a bin for each molecule, then sort them by bin.
vector<pair<int, int> > molBins(numMolecules);
for (int i = 0; i < numMolecules; i++)
{
int x = (int) ((molPos[i].x-minx)/binWidth);
int y = (int) ((molPos[i].y-miny)/binWidth);
int z = (int) ((molPos[i].z-minz)/binWidth);
int yodd = y&1;
int zodd = z&1;
int bin = z*xbins*ybins;
bin += (zodd ? ybins-y : y)*xbins;
bin += (yodd ? xbins-x : x);
molBins[i] = pair<int, int>(bin, i);
}
sort(molBins.begin(), molBins.end());
// Reorder the atoms.
for (int i = 0; i < numMolecules; i++)
{
for (int j = 0; j < atoms.size(); j++)
{
int oldIndex = mol.instances[molBins[i].second]+atoms[j];
int newIndex = mol.instances[i]+atoms[j];
originalIndex[newIndex] = gpu->psAtomIndex->_pSysStream[0][oldIndex];
newPosq[newIndex] = posq[oldIndex];
newVelm[newIndex] = velm[oldIndex];
}
}
}
// Update the streams.
for (int i = 0; i < numAtoms; i++)
posq[i] = newPosq[i];
gpu->psPosq4->Upload();
for (int i = 0; i < numAtoms; i++)
velm[i] = newVelm[i];
gpu->psVelm4->Upload();
for (int i = 0; i < numAtoms; i++)
gpu->psAtomIndex->_pSysData[i] = originalIndex[i];
gpu->psAtomIndex->Upload();
}
...@@ -33,14 +33,20 @@ ...@@ -33,14 +33,20 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "cudatypes.h" #include "cudatypes.h"
#include "cudpp.h"
#include <vector> #include <vector>
struct gpuAtomType { struct gpuAtomType {
string name; std::string name;
char symbol; char symbol;
float r; float r;
}; };
struct gpuMoleculeGroup {
std::vector<int> atoms;
std::vector<int> instances;
};
enum SM_VERSION enum SM_VERSION
{ {
SM_10, SM_10,
...@@ -61,8 +67,9 @@ struct _gpuContext { ...@@ -61,8 +67,9 @@ struct _gpuContext {
int gAtomTypes; int gAtomTypes;
cudaGmxSimulation sim; cudaGmxSimulation sim;
unsigned int* pOutputBufferCounter; unsigned int* pOutputBufferCounter;
unsigned int* pExclusion; std::vector<std::vector<int> > exclusions;
unsigned char* pAtomSymbol; unsigned char* pAtomSymbol;
std::vector<gpuMoleculeGroup> moleculeGroups;
float iterations; float iterations;
float epsfac; float epsfac;
float solventDielectric; float solventDielectric;
...@@ -70,9 +77,12 @@ struct _gpuContext { ...@@ -70,9 +77,12 @@ struct _gpuContext {
int grid; int grid;
bool bCalculateCM; bool bCalculateCM;
bool bRemoveCM; bool bRemoveCM;
bool bRecalculateBornRadii; bool bRecalculateBornRadii;
bool bOutputBufferPerWarp;
bool bIncludeGBSA;
unsigned long seed; unsigned long seed;
SM_VERSION sm_version; SM_VERSION sm_version;
CUDPPHandle cudpp;
CUDAStream<float4>* psPosq4; CUDAStream<float4>* psPosq4;
CUDAStream<float4>* psPosqP4; CUDAStream<float4>* psPosqP4;
CUDAStream<float4>* psOldPosq4; CUDAStream<float4>* psOldPosq4;
...@@ -103,15 +113,21 @@ struct _gpuContext { ...@@ -103,15 +113,21 @@ struct _gpuContext {
CUDAStream<int>* psNonShakeID; CUDAStream<int>* psNonShakeID;
CUDAStream<int4>* psShakeID; CUDAStream<int4>* psShakeID;
CUDAStream<float4>* psShakeParameter; CUDAStream<float4>* psShakeParameter;
CUDAStream<int4>* psSettleID;
CUDAStream<float2>* psSettleParameter;
CUDAStream<unsigned int>* psExclusion; CUDAStream<unsigned int>* psExclusion;
CUDAStream<unsigned int>* psWorkUnit; CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<unsigned int>* psInteractingWorkUnit;
CUDAStream<unsigned int>* psInteractionFlag;
CUDAStream<size_t>* psInteractionCount;
CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration
CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration
CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed
CUDAStream<int>* psRandomPosition; // Pointer to random number positions CUDAStream<int>* psRandomPosition; // Pointer to random number positions
CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA
CUDAStream<int>* psAtomIndex; // The original index of each atom
CUDAStream<float4>* psGridBoundingBox; // The size of each grid cell
CUDAStream<float4>* psGridCenter; // The center and radius for each grid cell
}; };
typedef struct _gpuContext *gpuContext; typedef struct _gpuContext *gpuContext;
...@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std:: ...@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2); const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
extern "C" extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s); float gpuGetAtomicRadius(gpuContext gpu, std::string s);
extern "C" extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s); unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
extern "C" extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname); int gpuReadAtomicParameters(gpuContext gpu, char* fname);
...@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname); ...@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern "C" extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q, void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
const std::vector<char>& symbol, const std::vector<vector<int> >& exclusions); const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
extern "C"
void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
extern "C" extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale); void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
...@@ -227,7 +249,7 @@ extern "C" ...@@ -227,7 +249,7 @@ extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu); int gpuBuildThreadBlockWorkList(gpuContext gpu);
extern "C" extern "C"
int gpuBuildExclusionList(gpuContext gpu); void gpuBuildExclusionList(gpuContext gpu);
extern "C" extern "C"
int gpuSetConstants(gpuContext gpu); int gpuSetConstants(gpuContext gpu);
...@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu); ...@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
extern "C" extern "C"
void gpuDumpObcLoop1(gpuContext gpu); void gpuDumpObcLoop1(gpuContext gpu);
extern "C"
void gpuReorderAtoms(gpuContext gpu);
#endif //__GPUTYPES_H__ #endif //__GPUTYPES_H__
...@@ -54,335 +54,120 @@ struct Atom { ...@@ -54,335 +54,120 @@ struct Atom {
float fx; float fx;
float fy; float fy;
float fz; float fz;
float eps2;
float sig2;
}; };
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJForcesSim(gpuContext gpu) void SetCalculateCDLJForcesSim(gpuContext gpu)
{ {
cudaError_t status; cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed"); RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
} }
void GetCalculateCDLJForcesSim(gpuContext gpu) void GetCalculateCDLJForcesSim(gpuContext gpu)
{ {
cudaError_t status; cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kCalculateCDLJForces_kernel() // Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJForces.h"
__global__ extern void kCalculateCDLJCutoffForces_12_kernel();
void kCalculateCDLJForces(gpuContext gpu)
{ {
// Read queue of work blocks once so the remainder of // printf("kCalculateCDLJCutoffForces\n");
// kernel can run asynchronously CUDPPResult result;
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder); size_t numWithInteractions;
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder); switch (gpu->sim.nonbondedMethod)
if (threadIdx.x < end - pos)
{ {
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x]; case NO_CUTOFF:
} if (gpu->bOutputBufferPerWarp)
if (threadIdx.x < GRID) kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
{ sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1); else
} kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
__syncthreads(); sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
LAUNCHERROR("kCalculateCDLJN2Forces");
// Now change pos and end to reflect work queue just read break;
// into shared memory case CUTOFF:
end = end - pos; kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
pos = end - (threadIdx.x >> GRIDBITS) - 1; LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
while (pos >= 0) LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
{ result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
// Extract cell coordinates from appropriate work unit if (result != CUDPP_SUCCESS)
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{ {
// Read fixed atom data into registers and GRF printf("Error in cudppCompact: %d\n", result);
int j = y + tgx; exit(-1);
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
} }
} gpu->psInteractionCount->Download();
else // bExclusion numWithInteractions = gpu->psInteractionCount->_pSysData[0];
{ if (gpu->bOutputBufferPerWarp)
// Read exclusion data kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
if (x == y) // Handle diagonals uniquely at 50% efficiency else
{ kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
// Read fixed atom data into registers and GRF sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx]; LAUNCHERROR("kCalculateCDLJCutoffForces");
unsigned int i = x + tgx; break;
apos = cSim.pPosq[i]; case PERIODIC:
float2 a = cSim.pAttr[i]; kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
sA[threadIdx.x].x = apos.x; LAUNCHERROR("kFindBlockBoundsPeriodic");
sA[threadIdx.x].y = apos.y; kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
sA[threadIdx.x].z = apos.z; LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
sA[threadIdx.x].q = apos.w; result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
sA[threadIdx.x].sig = a.x; gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
sA[threadIdx.x].eps = a.y; if (result != CUDPP_SUCCESS)
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{ {
// Read fixed atom data into registers and GRF printf("Error in cudppCompact: %d\n", result);
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx]; exit(-1);
excl = (excl >> tgx) | (excl << (GRID - tgx));
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
} }
} gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
pos -= cSim.nonbond_workBlock; if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
} }
}
__global__ extern void kCalculateCDLJForces_12_kernel();
void kCalculateCDLJForces(gpuContext gpu)
{
// printf("kCalculateCDLJForces\n");
if (gpu->sm_version < SM_12)
kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
else
kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJForces");
} }
\ No newline at end of file
...@@ -33,9 +33,6 @@ ...@@ -33,9 +33,6 @@
#include <cuda.h> #include <cuda.h>
#include <vector_functions.h> #include <vector_functions.h>
#include <cstdlib> #include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std; using namespace std;
#include "gputypes.h" #include "gputypes.h"
...@@ -54,15 +51,8 @@ struct Atom { ...@@ -54,15 +51,8 @@ struct Atom {
float fy; float fy;
float fz; float fz;
float fb; float fb;
float q2;
float junk;
}; };
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu) void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
...@@ -79,376 +69,122 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu) ...@@ -79,376 +69,122 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kCalculateCDLJObcGbsaForces1_kernel() // Include versions of the kernel for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{ {
// Read queue of work blocks once so the remainder of // printf("kCalculateCDLJObcGbsaForces1\n");
// kernel can run asynchronously
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read // check if Born radii need to be calculated
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = cSim.epsfac * apos.w;
sA[threadIdx.x].q2 = cSim.preFactor * apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
//float dEdR = 0.0f;
// ObcGbsaForce1 part kClearBornForces(gpu);
float alpha2_ij = br * psA[j].br; CUDPPResult result;
float D_ij = r2 / (4.0f * alpha2_ij); size_t numWithInteractions;
float expTerm = exp(-D_ij); switch (gpu->sim.nonbondedMethod)
float denominator2 = r2 + alpha2_ij * expTerm; {
float denominator = sqrt(denominator2); case NO_CUTOFF:
float Gpol = (apos.w * psA[j].q2) / (denominator * denominator2); if (gpu->bRecalculateBornRadii)
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{ {
// Read fixed atom data into registers and GRF kCalculateObcGbsaBornSum(gpu);
int j = y + tgx; kReduceObcGbsaBornSum(gpu);
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = cSim.epsfac * temp.w;
sA[threadIdx.x].q2 = cSim.preFactor * temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
} }
} if (gpu->bOutputBufferPerWarp)
else // bExclusion kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
{ sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
// Read exclusion data else
kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
if (x == y) // Handle diagonals uniquely at 50% efficiency sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
{ LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
// Read fixed atom data into registers and GRF break;
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx]; case CUTOFF:
float4 af; kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
af.x = 0.0f; LAUNCHERROR("kFindBlockBoundsCutoff");
af.y = 0.0f; kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
af.z = 0.0f; LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
af.w = 0.0f; result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
sA[threadIdx.x].x = apos.x; gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
sA[threadIdx.x].y = apos.y; if (result != CUDPP_SUCCESS)
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = cSim.epsfac * apos.w;
sA[threadIdx.x].q2 = cSim.preFactor * apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[j].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{ {
// Read fixed atom data into registers and GRF printf("Error in cudppCompact: %d\n", result);
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx]; exit(-1);
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
excl = (excl >> tgx) | (excl << (GRID - tgx));
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = cSim.epsfac * temp.w;
sA[threadIdx.x].q2 = cSim.preFactor * temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
} }
} gpu->psInteractionCount->Download();
if (gpu->bRecalculateBornRadii)
pos -= cSim.nonbond_workBlock; {
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJCutoffForces");
break;
case PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
{
printf("Error in cudppCompact: %d\n", result);
exit(-1);
}
gpu->psInteractionCount->Download();
if (gpu->bRecalculateBornRadii)
{
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
break;
} }
} }
__global__ extern void kCalculateCDLJObcGbsaForces1_12_kernel();
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
//printf("In kCalculateCDLJObcGbsaForces1 QQQ\n");
// check if Born radii need to be calculated
if( gpu->bRecalculateBornRadii ){
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
}
if (gpu->sm_version < SM_12)
kCalculateCDLJObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
else
kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
if( 0 ){
static int step = 0;
// int numPrint = -1;
step++;
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
//gpuDumpCoordinates( gpu );
kReduceBornSumAndForces( gpu );
gpuDumpObcLoop1( gpu );
}
LAUNCHERROR("kCalculateCDLJObcGbsaForces1");
}
...@@ -440,48 +440,154 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -440,48 +440,154 @@ __global__ void kCalculateLocalForces_kernel()
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
while (pos < cSim.LJ14_offset) if (cSim.nonbondedMethod == NO_CUTOFF)
{ {
unsigned int pos1 = pos - cSim.rb_dihedral_offset; while (pos < cSim.LJ14_offset)
if (pos1 < cSim.LJ14s)
{ {
int4 atom = cSim.pLJ14ID[pos1]; unsigned int pos1 = pos - cSim.rb_dihedral_offset;
float4 LJ14 = cSim.pLJ14Parameter[pos1]; if (pos1 < cSim.LJ14s)
float4 a1 = cSim.pPosq[atom.x]; {
float4 a2 = cSim.pPosq[atom.y]; int4 atom = cSim.pLJ14ID[pos1];
float3 d; float4 LJ14 = cSim.pLJ14Parameter[pos1];
d.x = a1.x - a2.x; float4 a1 = cSim.pPosq[atom.x];
d.y = a1.y - a2.y; float4 a2 = cSim.pPosq[atom.y];
d.z = a1.z - a2.z; float3 d;
float r2 = DOT3(d, d); d.x = a1.x - a2.x;
float inverseR = 1.0f / sqrt(r2); d.y = a1.y - a2.y;
float sig2 = inverseR * LJ14.y; d.z = a1.z - a2.z;
sig2 *= sig2; float r2 = DOT3(d, d);
float sig6 = sig2 * sig2 * sig2; float inverseR = 1.0f / sqrt(r2);
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6; float sig2 = inverseR * LJ14.y;
dEdR += LJ14.z * inverseR; sig2 *= sig2;
dEdR *= inverseR * inverseR; float sig6 = sig2 * sig2 * sig2;
unsigned int offsetA = atom.x + atom.z * cSim.stride; float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
unsigned int offsetB = atom.y + atom.w * cSim.stride; dEdR += LJ14.z * inverseR;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f}; dEdR *= inverseR * inverseR;
if (atom.z < cSim.totalNonbondOutputBuffers) unsigned int offsetA = atom.x + atom.z * cSim.stride;
forceA = cSim.pForce4[offsetA]; unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f}; float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers) if (atom.z < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB]; forceA = cSim.pForce4[offsetA];
d.x *= dEdR; float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
d.y *= dEdR; if (atom.w < cSim.totalNonbondOutputBuffers)
d.z *= dEdR; forceB = cSim.pForce4[offsetB];
forceA.x += d.x; d.x *= dEdR;
forceA.y += d.y; d.y *= dEdR;
forceA.z += d.z; d.z *= dEdR;
forceB.x -= d.x; forceA.x += d.x;
forceB.y -= d.y; forceA.y += d.y;
forceB.z -= d.z; forceA.z += d.z;
cSim.pForce4[offsetA] = forceA; forceB.x -= d.x;
cSim.pForce4[offsetB] = forceB; forceB.y -= d.y;
} forceB.z -= d.z;
pos += blockDim.x * gridDim.x; cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
}
else if (cSim.nonbondedMethod == CUTOFF)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
}
else if (cSim.nonbondedMethod == PERIODIC)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
d.x -= floor(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
d.y -= floor(d.x/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
d.z -= floor(d.x/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
} }
} }
......
...@@ -53,10 +53,6 @@ struct Atom { ...@@ -53,10 +53,6 @@ struct Atom {
float junk; float junk;
}; };
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaBornSumSim(gpuContext gpu) void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
...@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu) ...@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
__global__ void kClearObcGbsaBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornSum)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
__global__ void kReduceObcGbsaBornSum_kernel() __global__ void kReduceObcGbsaBornSum_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -127,175 +167,40 @@ if( 0 ){ ...@@ -127,175 +167,40 @@ if( 0 ){
LAUNCHERROR("kReduceObcGbsaBornSum"); LAUNCHERROR("kReduceObcGbsaBornSum");
} }
__global__ void kCalculateObcGbsaBornSum_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float dx;
float dy;
float dz;
float r2;
float r;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[j].sr;
if ((j != tgx) && (ar.x < rScaledRadiusJ))
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
apos.w += l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[j].r - r))
{
apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
}
}
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sum = apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[tj].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[tj].sr - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[tj].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
if (psA[tj].r < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
}
psA[tj].sum += term;
}
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
pos -= cSim.nonbond_workBlock;
}
}
void kCalculateObcGbsaBornSum(gpuContext gpu) void kCalculateObcGbsaBornSum(gpuContext gpu)
{ {
// printf("kCalculateObcgbsaBornSum\n"); // printf("kCalculateObcgbsaBornSum\n");
kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>(); kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearBornSum");
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
else
kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
break;
case CUTOFF:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateBornSum"); LAUNCHERROR("kCalculateBornSum");
} }
...@@ -52,18 +52,9 @@ struct Atom { ...@@ -52,18 +52,9 @@ struct Atom {
float fy; float fy;
float fz; float fz;
float fb; float fb;
// float sum;
// float oneOverR;
int pos;
int wx;
int wy;
}; };
__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaForces2Sim(gpuContext gpu) void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
...@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu) ...@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kCalculateObcGbsaForces2_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read // Include versions of the kernels for N^2 calculations.
// into shared memory
end = end - pos;
sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (sA[threadIdx.x].pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
sA[threadIdx.x].wx = x;
sA[threadIdx.x].wy = y;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// float sum = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j]) #define METHOD_NAME(a, b) a##N2##b
{ #include "kCalculateObcGbsaForces2.h"
float dx = psA[j].x - apos.x; #define USE_OUTPUT_BUFFER_PER_WARP
float dy = psA[j].y - apos.y; #undef METHOD_NAME
float dz = psA[j].z - apos.z; #define METHOD_NAME(a, b) a##N2ByWarp##b
float r2 = dx * dx + dy * dy + dz * dz; #include "kCalculateObcGbsaForces2.h"
float r = sqrt(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
// sum += term;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float sr2 = a.y * a.y;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
for (j = 0; j < GRID; j++) // Include versions of the kernels with cutoffs.
{
float dx = psA[tj].x - apos.x; #undef METHOD_NAME
float dy = psA[tj].y - apos.y; #undef USE_OUTPUT_BUFFER_PER_WARP
float dz = psA[tj].z - apos.z; #define USE_CUTOFF
float r2 = dx * dx + dy * dy + dz * dz; #define METHOD_NAME(a, b) a##Cutoff##b
float r = sqrt(r2); #include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
// Atom I Born Forces and sum #undef METHOD_NAME
float r2Inverse = 1.0f / r2; #define METHOD_NAME(a, b) a##CutoffByWarp##b
float rScaledRadiusJ = r + psA[tj].sr; #include "kCalculateObcGbsaForces2.h"
float rInverse = 1.0f / r;
float l_ij = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// sum += term;
// Atom J Born Forces and sum
float rScaledRadiusI = r + a.y;
l_ij = 1.0f / max(psA[tj].r, fabs(r - a.y));
u_ij = 1.0f / rScaledRadiusI;
l_ij2 = l_ij * l_ij;
u_ij2 = u_ij * u_ij;
t1 = log (u_ij / l_ij);
t2 = (l_ij2 - u_ij2);
t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
dE = psA[tj].fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
if (psA[tj].r >= rScaledRadiusI)
{
dE = /*term =*/ 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// psA[tj].sum += term;
tj = sNext[tj];
}
// Write results
int offset = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
offset = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
sA[threadIdx.x].pos -= cSim.bornForce2_workBlock;
}
}
__global__ extern void kCalculateObcGbsaForces2_12_kernel(); // Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaForces2.h"
void kCalculateObcGbsaForces2(gpuContext gpu) void kCalculateObcGbsaForces2(gpuContext gpu)
{ {
//printf("kCalculateObcGbsaForces2\n"); //printf("kCalculateObcGbsaForces2\n");
if (gpu->sm_version < SM_12) size_t numWithInteractions;
kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>(); switch (gpu->sim.nonbondedMethod)
else {
kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>(); case NO_CUTOFF:
if( 0 ){ if (gpu->bOutputBufferPerWarp)
static int step = 0; kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
//int numPrint = -1; sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
step++; else
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint ); kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
//gpuDumpCoordinates( gpu ); sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
kReduceBornSumAndForces( gpu ); break;
gpuDumpObcLoop1( gpu ); case CUTOFF:
} numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateObcGbsaForces2"); LAUNCHERROR("kCalculateObcGbsaForces2");
} }
...@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu) ...@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
__global__ void kClearForces_kernel() __global__ void kClearForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride4 * cSim.outputBuffers) while (pos < cSim.stride * cSim.outputBuffers)
{ {
((float*)cSim.pForce4)[pos] = 0.0f; cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x; pos += gridDim.x * blockDim.x;
} }
} }
......
...@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu) ...@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
__global__ void kVerletUpdatePart1_kernel() __global__ void kVerletUpdatePart1_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
...@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu) ...@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
__global__ void kVerletUpdatePart2_kernel() __global__ void kVerletUpdatePart2_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
...@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel() ...@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
extern __shared__ float3 sCM[]; extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
float3 CM = {0.0f, 0.0f, 0.0f}; float3 CM = {0.0f, 0.0f, 0.0f};
__syncthreads();
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008 Stanford University and the Authors. * * Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS ...@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
obcParameters->setScaledRadiusFactors(scaleFactors); obcParameters->setScaledRadiusFactors(scaleFactors);
obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) ); obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) ); obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) );
// If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
for (int i = 0; i < system.getNumForces(); i++) {
const NonbondedForce* nonbonded = dynamic_cast<const NonbondedForce*>(&system.getForce(i));
if (nonbonded != NULL) {
if (nonbonded->getNonbondedMethod() != NonbondedForce::NoCutoff)
obcParameters->setUseCutoff(nonbonded->getCutoffDistance());
if (nonbonded->getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
Vec3 boxVectors[3];
nonbonded->getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
RealOpenMM periodicBoxSize[3];
periodicBoxSize[0] = (RealOpenMM) boxVectors[0][0];
periodicBoxSize[1] = (RealOpenMM) boxVectors[1][1];
periodicBoxSize[2] = (RealOpenMM) boxVectors[2][2];
obcParameters->setPeriodic(periodicBoxSize);
}
break;
}
}
obc = new CpuObc(obcParameters); obc = new CpuObc(obcParameters);
obc->setIncludeAceApproximation(true); obc->setIncludeAceApproximation(true);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment