Commit 1010df33 authored by Peter Eastman's avatar Peter Eastman
Browse files

Checking in Cuda implementation of explicit solvent

parent df4b64cb
#--------------------------------------------------- #---------------------------------------------------
# OpenMM CUDA Platform # OpenMM CUDA Platform
# #
# Creates OpenMM library, base name=OpenMMCuda. # Creates OpenMM library, base name=OpenMMCuda.
# Default libraries are shared & optimized. Variants # Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d). # are created for static (_static) and debug (_d).
# #
# Windows: # Windows:
# OpenMMCuda[_d].dll # OpenMMCuda[_d].dll
# OpenMMCuda[_d].lib # OpenMMCuda[_d].lib
# OpenMMCuda_static[_d].lib # OpenMMCuda_static[_d].lib
# Unix: # Unix:
# libOpenMMCuda[_d].so # libOpenMMCuda[_d].so
# libOpenMMCuda_static[_d].a # libOpenMMCuda_static[_d].a
#---------------------------------------------------- #----------------------------------------------------
SUBDIRS (tests) SUBDIRS (tests)
# The source is organized into subdirectories, but we handle them all from # The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS. # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET(OPENMM_SOURCE_SUBDIRS .) SET(OPENMM_SOURCE_SUBDIRS .)
# Collect up information about the version of the OpenMM library we're building # Collect up information about the version of the OpenMM library we're building
# and make it available to the code so it can be built into the binaries. # and make it available to the code so it can be built into the binaries.
SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda) SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME}) SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static) SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
# Ensure that debug libraries have "_d" appended to their names. # Ensure that debug libraries have "_d" appended to their names.
# CMake gets this right on Windows automatically with this definition. # CMake gets this right on Windows automatically with this definition.
IF (${CMAKE_GENERATOR} MATCHES "Visual Studio") IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE) SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio") ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
# But on Unix or Cygwin we have to add the suffix manually # But on Unix or Cygwin we have to add the suffix manually
IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(SHARED_TARGET ${SHARED_TARGET}_d) SET(SHARED_TARGET ${SHARED_TARGET}_d)
SET(STATIC_TARGET ${STATIC_TARGET}_d) SET(STATIC_TARGET ${STATIC_TARGET}_d)
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
# These are all the places to search for header files which are # These are all the places to search for header files which are
# to be part of the API. # to be part of the API.
SET(API_INCLUDE_DIRS) # start empty SET(API_INCLUDE_DIRS) # start empty
FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
# append # append
SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS} SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal) ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
ENDFOREACH(subdir) ENDFOREACH(subdir)
# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS, # We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
# and absolute pathnames. # and absolute pathnames.
SET(API_REL_INCLUDE_FILES) # start these out empty SET(API_REL_INCLUDE_FILES) # start these out empty
SET(API_ABS_INCLUDE_FILES) SET(API_ABS_INCLUDE_FILES)
FOREACH(dir ${API_INCLUDE_DIRS}) FOREACH(dir ${API_INCLUDE_DIRS})
FILE(GLOB fullpaths ${dir}/*.h) # returns full pathnames FILE(GLOB fullpaths ${dir}/*.h) # returns full pathnames
SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths}) SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
FOREACH(pathname ${fullpaths}) FOREACH(pathname ${fullpaths})
GET_FILENAME_COMPONENT(filename ${pathname} NAME) GET_FILENAME_COMPONENT(filename ${pathname} NAME)
SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename}) SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
ENDFOREACH(pathname) ENDFOREACH(pathname)
ENDFOREACH(dir) ENDFOREACH(dir)
# collect up source files # collect up source files
SET(SOURCE_FILES) # empty SET(SOURCE_FILES) # empty
SET(SOURCE_INCLUDE_FILES) SET(SOURCE_INCLUDE_FILES)
FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
FILE(GLOB src_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*/*.cpp) FILE(GLOB src_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*/*.cpp)
FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h) FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
SET(SOURCE_FILES ${SOURCE_FILES} ${src_files}) #append SET(SOURCE_FILES ${SOURCE_FILES} ${src_files}) #append
SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files}) SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
ENDFOREACH(subdir) ENDFOREACH(subdir)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake) SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)
SUBDIRS (sharedTarget staticTarget) IF (APPLE)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/mac)
ELSE (APPLE)
IF (WIN32)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win)
INSTALL_FILES(/lib FILES ${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win/cudpp32.dll)
ELSE (WIN32)
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/linux)
ENDIF (WIN32)
ENDIF(APPLE)
SUBDIRS (sharedTarget staticTarget)
...@@ -39,42 +39,40 @@ ...@@ -39,42 +39,40 @@
using namespace OpenMM; using namespace OpenMM;
StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const { StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
if (name == "particlePositions") { if (name == "particlePositions") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f}; float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu); return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu);
} }
if (name == "particleVelocities") { if (name == "particleVelocities") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {0.0f, 0.0f, 0.0f, 0.0f}; float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu); return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu);
} }
if (name == "particleForces") { if (name == "particleForces") {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
float padding[] = {0.0f, 0.0f, 0.0f, 0.0f}; float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu); return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu);
} }
switch (type) { switch (type) {
case Stream::Float: case Stream::Float:
case Stream::Double: case Stream::Double:
return new CudaStreamImpl<float1>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float1>(name, size, type, platform, 1, data.gpu);
case Stream::Float2: case Stream::Float2:
case Stream::Double2: case Stream::Double2:
return new CudaStreamImpl<float2>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float2>(name, size, type, platform, 1, data.gpu);
case Stream::Float3: case Stream::Float3:
case Stream::Double3: case Stream::Double3:
return new CudaStreamImpl<float3>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float3>(name, size, type, platform, 1, data.gpu);
case Stream::Float4: case Stream::Float4:
case Stream::Double4: case Stream::Double4:
return new CudaStreamImpl<float4>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<float4>(name, size, type, platform, 1, data.gpu);
case Stream::Integer: case Stream::Integer:
return new CudaStreamImpl<int1>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int1>(name, size, type, platform, 1, data.gpu);
case Stream::Integer2: case Stream::Integer2:
return new CudaStreamImpl<int2>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int2>(name, size, type, platform, 1, data.gpu);
case Stream::Integer3: case Stream::Integer3:
return new CudaStreamImpl<int3>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int3>(name, size, type, platform, 1, data.gpu);
case Stream::Integer4: case Stream::Integer4:
return new CudaStreamImpl<int4>(name, size, type, platform, 1, NULL); return new CudaStreamImpl<int4>(name, size, type, platform, 1, data.gpu);
} }
throw OpenMMException("Tried to create a Stream with an illegal DataType."); throw OpenMMException("Tried to create a Stream with an illegal DataType.");
} }
...@@ -55,7 +55,7 @@ public: ...@@ -55,7 +55,7 @@ public:
private: private:
void initType(); void initType();
CUDAStream<T>* stream; CUDAStream<T>* stream;
_gpuContext* gpu; _gpuContext* gpu;
bool ownStream; bool ownStream;
int width, rowOffset; int width, rowOffset;
float paddingValues[4]; float paddingValues[4];
...@@ -132,58 +132,60 @@ CudaStreamImpl<T>::~CudaStreamImpl() { ...@@ -132,58 +132,60 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
template <class T> template <class T>
void CudaStreamImpl<T>::loadFromArray(const void* array) { void CudaStreamImpl<T>::loadFromArray(const void* array) {
float* data = reinterpret_cast<float*>(stream->_pSysData); float* data = reinterpret_cast<float*>(stream->_pSysData);
int* order = gpu->psAtomIndex->_pSysData;
if (baseType == Stream::Float) { if (baseType == Stream::Float) {
float* arrayData = (float*) array; float* arrayData = (float*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = arrayData[i*width+j]; data[i*rowOffset+j] = arrayData[order[i]*width+j];
} }
else if (baseType == Stream::Double) { else if (baseType == Stream::Double) {
double* arrayData = (double*) array; double* arrayData = (double*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j]; data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
} }
else { else {
int* arrayData = (int*) array; int* arrayData = (int*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
data[i*rowOffset+j] = (float) arrayData[i*width+j]; data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
} }
for (int i = getSize(); i < (int) stream->_length; ++i) for (int i = getSize(); i < (int) stream->_length; ++i)
for (int j = 0; j < rowOffset; ++j) for (int j = 0; j < rowOffset; ++j)
data[i*rowOffset+j] = paddingValues[j]; data[i*rowOffset+j] = paddingValues[j];
stream->Upload(); stream->Upload();
// VisualStudio compiler did not like stream == gpu->psPosq4 // VisualStudio compiler did not like stream == gpu->psPosq4
//if( gpu && stream == gpu->psPosq4 ){ //if( gpu && stream == gpu->psPosq4 ){
if( gpu && getName() == "particlePositions" ){ if( gpu && getName() == "particlePositions" ){
gpu->bRecalculateBornRadii = true; gpu->bRecalculateBornRadii = true;
} }
} }
template <class T> template <class T>
void CudaStreamImpl<T>::saveToArray(void* array) { void CudaStreamImpl<T>::saveToArray(void* array) {
stream->Download(); stream->Download();
float* data = reinterpret_cast<float*>(stream->_pSysData); float* data = reinterpret_cast<float*>(stream->_pSysData);
int* order = gpu->psAtomIndex->_pSysData;
if (baseType == Stream::Float) { if (baseType == Stream::Float) {
float* arrayData = (float*) array; float* arrayData = (float*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
arrayData[i*width+j] = data[i*rowOffset+j]; arrayData[order[i]*width+j] = data[i*rowOffset+j];
} }
else if (baseType == Stream::Double) { else if (baseType == Stream::Double) {
double* arrayData = (double*) array; double* arrayData = (double*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
arrayData[i*width+j] = data[i*rowOffset+j]; arrayData[order[i]*width+j] = data[i*rowOffset+j];
} }
else { else {
int* arrayData = (int*) array; int* arrayData = (int*) array;
for (int i = 0; i < getSize(); ++i) for (int i = 0; i < getSize(); ++i)
for (int j = 0; j < width; ++j) for (int j = 0; j < width; ++j)
arrayData[i*width+j] = (int) data[i*rowOffset+j]; arrayData[order[i]*width+j] = (int) data[i*rowOffset+j];
} }
} }
......
...@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu); ...@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu); extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu); extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu); extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCDLJForces_12(gpuContext gpu);
extern void kCalculateObcGbsaForces1(gpuContext gpu); extern void kCalculateObcGbsaForces1(gpuContext gpu);
extern void kCalculateObcGbsaForces1_12(gpuContext gpu); extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
extern void kReduceObcGbsaBornForces(gpuContext gpu); extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu); extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu); extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu); extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu); extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kUpdatePart1(gpuContext gpu); extern void kUpdatePart1(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu); extern void kApplyFirstShake(gpuContext gpu);
extern void kApplyFirstSettle(gpuContext gpu);
extern void kUpdatePart2(gpuContext gpu); extern void kUpdatePart2(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu); extern void kApplySecondShake(gpuContext gpu);
extern void kApplySecondSettle(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu); extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu); extern void kVerletUpdatePart2(gpuContext gpu);
extern void kBrownianUpdatePart1(gpuContext gpu); extern void kBrownianUpdatePart1(gpuContext gpu);
...@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu); ...@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
// Initializers // Initializers
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu); extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu); extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu); extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu); extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu); extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu); extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu); extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
...@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu); ...@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu); extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu); extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu); extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu); extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu); extern void GetForcesSim(gpuContext gpu);
extern void SetUpdateShakeHSim(gpuContext gpu); extern void SetUpdateShakeHSim(gpuContext gpu);
extern void GetUpdateShakeHSim(gpuContext gpu); extern void GetUpdateShakeHSim(gpuContext gpu);
extern void SetSettleSim(gpuContext gpu);
extern void GetSettleSim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu); extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu); extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu); extern void SetBrownianUpdateSim(gpuContext gpu);
......
...@@ -36,11 +36,12 @@ ...@@ -36,11 +36,12 @@
#include <limits> #include <limits>
#include <iostream> #include <iostream>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <builtin_types.h> #include <builtin_types.h>
#include <vector_functions.h> #include <vector_functions.h>
using namespace std;
#define RTERROR(status, s) \ #define RTERROR(status, s) \
if (status != cudaSuccess) { \ if (status != cudaSuccess) { \
...@@ -122,7 +123,7 @@ CUDAStream<T>::~CUDAStream() ...@@ -122,7 +123,7 @@ CUDAStream<T>::~CUDAStream()
template <typename T> template <typename T>
void CUDAStream<T>::Allocate() void CUDAStream<T>::Allocate()
{ {
cudaError_t status; cudaError_t status;
_pSysStream = new T*[_subStreams]; _pSysStream = new T*[_subStreams];
_pDevStream = new T*[_subStreams]; _pDevStream = new T*[_subStreams];
_pSysData = new T[_subStreams * _stride]; _pSysData = new T[_subStreams * _stride];
...@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384; ...@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220; static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256; static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
enum CudaNonbondedMethod
{
NO_CUTOFF,
CUTOFF,
PERIODIC
};
struct cudaGmxSimulation { struct cudaGmxSimulation {
// Constants // Constants
...@@ -236,6 +243,7 @@ struct cudaGmxSimulation { ...@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
unsigned int blocks; // Number of blocks to launch across linear kernels unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1 unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2 unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles
unsigned int threads_per_block; // Threads per block to launch unsigned int threads_per_block; // Threads per block to launch
unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls
unsigned int bornForce2_threads_per_block; // Threads per block in nonbond kernel calls unsigned int bornForce2_threads_per_block; // Threads per block in nonbond kernel calls
...@@ -245,12 +253,17 @@ struct cudaGmxSimulation { ...@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls
unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls
unsigned int shake_threads_per_block; // Threads per block in shake kernel calls unsigned int shake_threads_per_block; // Threads per block in shake kernel calls
unsigned int settle_threads_per_block; // Threads per block in SETTLE kernel calls
unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call
unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int random_threads_per_block; // Threads per block in RNG kernel calls unsigned int random_threads_per_block; // Threads per block in RNG kernel calls
unsigned int interaction_threads_per_block; // Threads per block when identifying interacting tiles
unsigned int workUnits; // Number of work units unsigned int workUnits; // Number of work units
unsigned int* pWorkUnit; // Pointer to work units unsigned int* pWorkUnit; // Pointer to work units
unsigned int* pInteractingWorkUnit; // Pointer to work units that have interactions
unsigned int* pInteractionFlag; // Flags for which work units have interactions
size_t* pInteractionCount; // A count of the number of work units which have interactions
unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1 unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation
unsigned int workUnitsPerSM; // Number of workblocks per SM unsigned int workUnitsPerSM; // Number of workblocks per SM
...@@ -270,6 +283,12 @@ struct cudaGmxSimulation { ...@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
unsigned int outputBuffers; // Number of output buffers unsigned int outputBuffers; // Number of output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations float epsfac; // Epsilon factor for CDLJ calculations
CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
float nonbondedCutoffSqr; // Cutoff distance for CDLJ calculations
float periodicBoxSizeX; // The X dimension of the periodic box
float periodicBoxSizeY; // The Y dimension of the periodic box
float periodicBoxSizeZ; // The Z dimension of the periodic box
float reactionFieldK; // Constant for reaction field correction
float probeRadius; // SASA probe radius float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor float surfaceAreaFactor; // ACE approximation surface area factor
float electricConstant; // ACE approximation electric constant float electricConstant; // ACE approximation electric constant
...@@ -326,6 +345,7 @@ struct cudaGmxSimulation { ...@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
float4* pLJ14Parameter; // Lennard Jones 1-4 parameters float4* pLJ14Parameter; // Lennard Jones 1-4 parameters
float inverseTotalMass; // Used in linear momentum removal float inverseTotalMass; // Used in linear momentum removal
unsigned int ShakeConstraints; // Total number of Shake constraints unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int settleConstraints; // Total number of Settle constraints
unsigned int NonShakeConstraints; // Total number of NonShake atoms unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system unsigned int degreesOfFreedom; // Number of degrees of freedom in system
...@@ -334,12 +354,17 @@ struct cudaGmxSimulation { ...@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
int* pNonShakeID; // Not Shaking atoms int* pNonShakeID; // Not Shaking atoms
int4* pShakeID; // Shake atoms and phase int4* pShakeID; // Shake atoms and phase
float4* pShakeParameter; // Shake parameters float4* pShakeParameter; // Shake parameters
int4* pSettleID; // Settle atoms
float2* pSettleParameter; // Settle parameters
unsigned int* pExclusion; // Nonbond exclusion data unsigned int* pExclusion; // Nonbond exclusion data
unsigned int bond_offset; // Offset to end of bonds unsigned int bond_offset; // Offset to end of bonds
unsigned int bond_angle_offset; // Offset to end of bond angles unsigned int bond_angle_offset; // Offset to end of bond angles
unsigned int dihedral_offset; // Offset to end of dihedrals unsigned int dihedral_offset; // Offset to end of dihedrals
unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals
unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters
int* pAtomIndex; // The original index of each atom
float4* pGridBoundingBox; // The size of each grid cell
float4* pGridCenter; // The center of each grid cell
// Mutable stuff // Mutable stuff
float4* pPosq; // Pointer to atom positions and charges float4* pPosq; // Pointer to atom positions and charges
......
This diff is collapsed.
...@@ -33,14 +33,20 @@ ...@@ -33,14 +33,20 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "cudatypes.h" #include "cudatypes.h"
#include "cudpp.h"
#include <vector> #include <vector>
struct gpuAtomType { struct gpuAtomType {
string name; std::string name;
char symbol; char symbol;
float r; float r;
}; };
struct gpuMoleculeGroup {
std::vector<int> atoms;
std::vector<int> instances;
};
enum SM_VERSION enum SM_VERSION
{ {
SM_10, SM_10,
...@@ -61,8 +67,9 @@ struct _gpuContext { ...@@ -61,8 +67,9 @@ struct _gpuContext {
int gAtomTypes; int gAtomTypes;
cudaGmxSimulation sim; cudaGmxSimulation sim;
unsigned int* pOutputBufferCounter; unsigned int* pOutputBufferCounter;
unsigned int* pExclusion; std::vector<std::vector<int> > exclusions;
unsigned char* pAtomSymbol; unsigned char* pAtomSymbol;
std::vector<gpuMoleculeGroup> moleculeGroups;
float iterations; float iterations;
float epsfac; float epsfac;
float solventDielectric; float solventDielectric;
...@@ -70,9 +77,12 @@ struct _gpuContext { ...@@ -70,9 +77,12 @@ struct _gpuContext {
int grid; int grid;
bool bCalculateCM; bool bCalculateCM;
bool bRemoveCM; bool bRemoveCM;
bool bRecalculateBornRadii; bool bRecalculateBornRadii;
bool bOutputBufferPerWarp;
bool bIncludeGBSA;
unsigned long seed; unsigned long seed;
SM_VERSION sm_version; SM_VERSION sm_version;
CUDPPHandle cudpp;
CUDAStream<float4>* psPosq4; CUDAStream<float4>* psPosq4;
CUDAStream<float4>* psPosqP4; CUDAStream<float4>* psPosqP4;
CUDAStream<float4>* psOldPosq4; CUDAStream<float4>* psOldPosq4;
...@@ -103,15 +113,21 @@ struct _gpuContext { ...@@ -103,15 +113,21 @@ struct _gpuContext {
CUDAStream<int>* psNonShakeID; CUDAStream<int>* psNonShakeID;
CUDAStream<int4>* psShakeID; CUDAStream<int4>* psShakeID;
CUDAStream<float4>* psShakeParameter; CUDAStream<float4>* psShakeParameter;
CUDAStream<int4>* psSettleID;
CUDAStream<float2>* psSettleParameter;
CUDAStream<unsigned int>* psExclusion; CUDAStream<unsigned int>* psExclusion;
CUDAStream<unsigned int>* psWorkUnit; CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<unsigned int>* psInteractingWorkUnit;
CUDAStream<unsigned int>* psInteractionFlag;
CUDAStream<size_t>* psInteractionCount;
CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration
CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration
CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed
CUDAStream<int>* psRandomPosition; // Pointer to random number positions CUDAStream<int>* psRandomPosition; // Pointer to random number positions
CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA
CUDAStream<int>* psAtomIndex; // The original index of each atom
CUDAStream<float4>* psGridBoundingBox; // The size of each grid cell
CUDAStream<float4>* psGridCenter; // The center and radius for each grid cell
}; };
typedef struct _gpuContext *gpuContext; typedef struct _gpuContext *gpuContext;
...@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std:: ...@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2); const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
extern "C" extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s); float gpuGetAtomicRadius(gpuContext gpu, std::string s);
extern "C" extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s); unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
extern "C" extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname); int gpuReadAtomicParameters(gpuContext gpu, char* fname);
...@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname); ...@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern "C" extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q, void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
const std::vector<char>& symbol, const std::vector<vector<int> >& exclusions); const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
extern "C"
void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
extern "C" extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale); void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
...@@ -227,7 +249,7 @@ extern "C" ...@@ -227,7 +249,7 @@ extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu); int gpuBuildThreadBlockWorkList(gpuContext gpu);
extern "C" extern "C"
int gpuBuildExclusionList(gpuContext gpu); void gpuBuildExclusionList(gpuContext gpu);
extern "C" extern "C"
int gpuSetConstants(gpuContext gpu); int gpuSetConstants(gpuContext gpu);
...@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu); ...@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
extern "C" extern "C"
void gpuDumpObcLoop1(gpuContext gpu); void gpuDumpObcLoop1(gpuContext gpu);
extern "C"
void gpuReorderAtoms(gpuContext gpu);
#endif //__GPUTYPES_H__ #endif //__GPUTYPES_H__
...@@ -440,48 +440,154 @@ __global__ void kCalculateLocalForces_kernel() ...@@ -440,48 +440,154 @@ __global__ void kCalculateLocalForces_kernel()
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
while (pos < cSim.LJ14_offset) if (cSim.nonbondedMethod == NO_CUTOFF)
{ {
unsigned int pos1 = pos - cSim.rb_dihedral_offset; while (pos < cSim.LJ14_offset)
if (pos1 < cSim.LJ14s)
{ {
int4 atom = cSim.pLJ14ID[pos1]; unsigned int pos1 = pos - cSim.rb_dihedral_offset;
float4 LJ14 = cSim.pLJ14Parameter[pos1]; if (pos1 < cSim.LJ14s)
float4 a1 = cSim.pPosq[atom.x]; {
float4 a2 = cSim.pPosq[atom.y]; int4 atom = cSim.pLJ14ID[pos1];
float3 d; float4 LJ14 = cSim.pLJ14Parameter[pos1];
d.x = a1.x - a2.x; float4 a1 = cSim.pPosq[atom.x];
d.y = a1.y - a2.y; float4 a2 = cSim.pPosq[atom.y];
d.z = a1.z - a2.z; float3 d;
float r2 = DOT3(d, d); d.x = a1.x - a2.x;
float inverseR = 1.0f / sqrt(r2); d.y = a1.y - a2.y;
float sig2 = inverseR * LJ14.y; d.z = a1.z - a2.z;
sig2 *= sig2; float r2 = DOT3(d, d);
float sig6 = sig2 * sig2 * sig2; float inverseR = 1.0f / sqrt(r2);
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6; float sig2 = inverseR * LJ14.y;
dEdR += LJ14.z * inverseR; sig2 *= sig2;
dEdR *= inverseR * inverseR; float sig6 = sig2 * sig2 * sig2;
unsigned int offsetA = atom.x + atom.z * cSim.stride; float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
unsigned int offsetB = atom.y + atom.w * cSim.stride; dEdR += LJ14.z * inverseR;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f}; dEdR *= inverseR * inverseR;
if (atom.z < cSim.totalNonbondOutputBuffers) unsigned int offsetA = atom.x + atom.z * cSim.stride;
forceA = cSim.pForce4[offsetA]; unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f}; float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers) if (atom.z < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB]; forceA = cSim.pForce4[offsetA];
d.x *= dEdR; float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
d.y *= dEdR; if (atom.w < cSim.totalNonbondOutputBuffers)
d.z *= dEdR; forceB = cSim.pForce4[offsetB];
forceA.x += d.x; d.x *= dEdR;
forceA.y += d.y; d.y *= dEdR;
forceA.z += d.z; d.z *= dEdR;
forceB.x -= d.x; forceA.x += d.x;
forceB.y -= d.y; forceA.y += d.y;
forceB.z -= d.z; forceA.z += d.z;
cSim.pForce4[offsetA] = forceA; forceB.x -= d.x;
cSim.pForce4[offsetB] = forceB; forceB.y -= d.y;
} forceB.z -= d.z;
pos += blockDim.x * gridDim.x; cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
}
else if (cSim.nonbondedMethod == CUTOFF)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
}
else if (cSim.nonbondedMethod == PERIODIC)
{
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
int4 atom = cSim.pLJ14ID[pos1];
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
d.x -= floor(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
d.y -= floor(d.x/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
d.z -= floor(d.x/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
} }
} }
......
...@@ -53,10 +53,6 @@ struct Atom { ...@@ -53,10 +53,6 @@ struct Atom {
float junk; float junk;
}; };
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaBornSumSim(gpuContext gpu) void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
...@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu) ...@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
__global__ void kClearObcGbsaBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornSum)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
__global__ void kReduceObcGbsaBornSum_kernel() __global__ void kReduceObcGbsaBornSum_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -127,175 +167,40 @@ if( 0 ){ ...@@ -127,175 +167,40 @@ if( 0 ){
LAUNCHERROR("kReduceObcGbsaBornSum"); LAUNCHERROR("kReduceObcGbsaBornSum");
} }
__global__ void kCalculateObcGbsaBornSum_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float dx;
float dy;
float dz;
float r2;
float r;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[j].sr;
if ((j != tgx) && (ar.x < rScaledRadiusJ))
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
apos.w += l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[j].r - r))
{
apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
}
}
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sum = apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
r = sqrt(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[tj].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
(l_ij2 - u_ij2);
if (ar.x < (psA[tj].sr - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += term;
}
float rScaledRadiusI = r + ar.y;
if (psA[tj].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
if (psA[tj].r < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
}
psA[tj].sum += term;
}
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
pos -= cSim.nonbond_workBlock;
}
}
void kCalculateObcGbsaBornSum(gpuContext gpu) void kCalculateObcGbsaBornSum(gpuContext gpu)
{ {
// printf("kCalculateObcgbsaBornSum\n"); // printf("kCalculateObcgbsaBornSum\n");
kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>(); kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
LAUNCHERROR("kClearBornSum");
size_t numWithInteractions;
switch (gpu->sim.nonbondedMethod)
{
case NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
else
kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
break;
case CUTOFF:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateBornSum"); LAUNCHERROR("kCalculateBornSum");
} }
...@@ -52,18 +52,9 @@ struct Atom { ...@@ -52,18 +52,9 @@ struct Atom {
float fy; float fy;
float fz; float fz;
float fb; float fb;
// float sum;
// float oneOverR;
int pos;
int wx;
int wy;
}; };
__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetCalculateObcGbsaForces2Sim(gpuContext gpu) void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
...@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu) ...@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kCalculateObcGbsaForces2_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read // Include versions of the kernels for N^2 calculations.
// into shared memory
end = end - pos;
sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (sA[threadIdx.x].pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
sA[threadIdx.x].wx = x;
sA[threadIdx.x].wy = y;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// float sum = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j]) #define METHOD_NAME(a, b) a##N2##b
{ #include "kCalculateObcGbsaForces2.h"
float dx = psA[j].x - apos.x; #define USE_OUTPUT_BUFFER_PER_WARP
float dy = psA[j].y - apos.y; #undef METHOD_NAME
float dz = psA[j].z - apos.z; #define METHOD_NAME(a, b) a##N2ByWarp##b
float r2 = dx * dx + dy * dy + dz * dz; #include "kCalculateObcGbsaForces2.h"
float r = sqrt(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
// sum += term;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float sr2 = a.y * a.y;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
for (j = 0; j < GRID; j++) // Include versions of the kernels with cutoffs.
{
float dx = psA[tj].x - apos.x; #undef METHOD_NAME
float dy = psA[tj].y - apos.y; #undef USE_OUTPUT_BUFFER_PER_WARP
float dz = psA[tj].z - apos.z; #define USE_CUTOFF
float r2 = dx * dx + dy * dy + dz * dz; #define METHOD_NAME(a, b) a##Cutoff##b
float r = sqrt(r2); #include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
// Atom I Born Forces and sum #undef METHOD_NAME
float r2Inverse = 1.0f / r2; #define METHOD_NAME(a, b) a##CutoffByWarp##b
float rScaledRadiusJ = r + psA[tj].sr; #include "kCalculateObcGbsaForces2.h"
float rInverse = 1.0f / r;
float l_ij = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// sum += term;
// Atom J Born Forces and sum
float rScaledRadiusI = r + a.y;
l_ij = 1.0f / max(psA[tj].r, fabs(r - a.y));
u_ij = 1.0f / rScaledRadiusI;
l_ij2 = l_ij * l_ij;
u_ij2 = u_ij * u_ij;
t1 = log (u_ij / l_ij);
t2 = (l_ij2 - u_ij2);
t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
dE = psA[tj].fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
if (psA[tj].r >= rScaledRadiusI)
{
dE = /*term =*/ 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// psA[tj].sum += term;
tj = sNext[tj];
}
// Write results
int offset = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
offset = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
sA[threadIdx.x].pos -= cSim.bornForce2_workBlock;
}
}
__global__ extern void kCalculateObcGbsaForces2_12_kernel(); // Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaForces2.h"
void kCalculateObcGbsaForces2(gpuContext gpu) void kCalculateObcGbsaForces2(gpuContext gpu)
{ {
//printf("kCalculateObcGbsaForces2\n"); //printf("kCalculateObcGbsaForces2\n");
if (gpu->sm_version < SM_12) size_t numWithInteractions;
kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>(); switch (gpu->sim.nonbondedMethod)
else {
kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>(); case NO_CUTOFF:
if( 0 ){ if (gpu->bOutputBufferPerWarp)
static int step = 0; kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
//int numPrint = -1; sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
step++; else
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint ); kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
//gpuDumpCoordinates( gpu ); sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
kReduceBornSumAndForces( gpu ); break;
gpuDumpObcLoop1( gpu ); case CUTOFF:
} numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
case PERIODIC:
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
break;
}
LAUNCHERROR("kCalculateObcGbsaForces2"); LAUNCHERROR("kCalculateObcGbsaForces2");
} }
...@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu) ...@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
__global__ void kClearForces_kernel() __global__ void kClearForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride4 * cSim.outputBuffers) while (pos < cSim.stride * cSim.outputBuffers)
{ {
((float*)cSim.pForce4)[pos] = 0.0f; cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
pos += gridDim.x * blockDim.x; pos += gridDim.x * blockDim.x;
} }
} }
......
...@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu) ...@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
__global__ void kVerletUpdatePart1_kernel() __global__ void kVerletUpdatePart1_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
...@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu) ...@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
__global__ void kVerletUpdatePart2_kernel() __global__ void kVerletUpdatePart2_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
...@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel() ...@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
extern __shared__ float3 sCM[]; extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
float3 CM = {0.0f, 0.0f, 0.0f}; float3 CM = {0.0f, 0.0f, 0.0f};
__syncthreads();
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008 Stanford University and the Authors. * * Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS ...@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
obcParameters->setScaledRadiusFactors(scaleFactors); obcParameters->setScaledRadiusFactors(scaleFactors);
obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) ); obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) ); obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) );
// If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
for (int i = 0; i < system.getNumForces(); i++) {
const NonbondedForce* nonbonded = dynamic_cast<const NonbondedForce*>(&system.getForce(i));
if (nonbonded != NULL) {
if (nonbonded->getNonbondedMethod() != NonbondedForce::NoCutoff)
obcParameters->setUseCutoff(nonbonded->getCutoffDistance());
if (nonbonded->getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
Vec3 boxVectors[3];
nonbonded->getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
RealOpenMM periodicBoxSize[3];
periodicBoxSize[0] = (RealOpenMM) boxVectors[0][0];
periodicBoxSize[1] = (RealOpenMM) boxVectors[1][1];
periodicBoxSize[2] = (RealOpenMM) boxVectors[2][2];
obcParameters->setPeriodic(periodicBoxSize);
}
break;
}
}
obc = new CpuObc(obcParameters); obc = new CpuObc(obcParameters);
obc->setIncludeAceApproximation(true); obc->setIncludeAceApproximation(true);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment