Commit 2e451b9d authored by Peter Eastman
Browse files

Deleted the old CUDA platform

parent 352e2fc7
#---------------------------------------------------
# OpenMM CUDA Platform
#
# Creates OpenMM library, base name=OpenMMCuda.
# Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d).
#
# Windows:
# OpenMMCuda[_d].dll
# OpenMMCuda[_d].lib
# OpenMMCuda_static[_d].lib
# Unix:
# libOpenMMCuda[_d].so
# libOpenMMCuda_static[_d].a
#----------------------------------------------------
# Cache toggle controlling whether the CUDA test cases are configured.
set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
if(OPENMM_BUILD_CUDA_TESTS)
    # NOTE(review): subdirs() is kept instead of add_subdirectory(); the two
    # commands presumably differ in when the subdirectory is processed, and
    # tests/ may depend on variables that are only set further down this file.
    # Confirm before modernizing.
    subdirs(tests)
endif()
# All source subdirectories are driven from this one CMakeLists file instead of
# being visited individually; "." is currently the only entry.
set(OPENMM_SOURCE_SUBDIRS .)

# Base library name and the target names derived from it.
set(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
set(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
set(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)

# Debug libraries carry a "_d" suffix. With a Visual Studio generator the
# suffix is applied through CMAKE_DEBUG_POSTFIX.
if(${CMAKE_GENERATOR} MATCHES "Visual Studio")
    set(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
endif()

# On Unix (including Cygwin) the suffix is appended to the target names by hand.
if(UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    set(SHARED_TARGET ${SHARED_TARGET}_d)
    set(STATIC_TARGET ${STATIC_TARGET}_d)
endif()
# Directories that hold the header files making up the public API:
# <subdir>/include and <subdir>/include/internal for every source subdirectory.
set(API_INCLUDE_DIRS) # start empty
foreach(subdir ${OPENMM_SOURCE_SUBDIRS})
    list(APPEND API_INCLUDE_DIRS
        ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
        ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
endforeach()

# Two header lists are produced from those directories:
#   API_ABS_INCLUDE_FILES - the paths exactly as returned by file(GLOB)
#   API_REL_INCLUDE_FILES - each file name re-joined to its API include dir
set(API_REL_INCLUDE_FILES) # start empty
set(API_ABS_INCLUDE_FILES)
foreach(dir ${API_INCLUDE_DIRS})
    file(GLOB fullpaths ${dir}/*.h) # returns full pathnames
    list(APPEND API_ABS_INCLUDE_FILES ${fullpaths})
    foreach(pathname ${fullpaths})
        get_filename_component(filename ${pathname} NAME)
        list(APPEND API_REL_INCLUDE_FILES ${dir}/${filename})
    endforeach()
endforeach()
# Gather the C/C++ sources (searched recursively) and the private headers
# (top level of src/ only) of every source subdirectory, and put each
# subdirectory's include/ folder at the front of the include path.
set(SOURCE_FILES) # start empty
set(SOURCE_INCLUDE_FILES)
foreach(subdir ${OPENMM_SOURCE_SUBDIRS})
    file(GLOB_RECURSE src_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
    file(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
    list(APPEND SOURCE_FILES ${src_files})
    list(APPEND SOURCE_INCLUDE_FILES ${incl_files})
    include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
endforeach()
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)

# SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)

# The shared library target itself is declared in sharedTarget/.
subdirs(sharedTarget)
#ifndef OPENMM_CUDAKERNELFACTORY_H_
#define OPENMM_CUDAKERNELFACTORY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/KernelFactory.h"
#include "windowsExportCuda.h"
namespace OpenMM {
/**
* This KernelFactory creates all kernels for CudaPlatform.
*/
class CudaKernelFactory : public KernelFactory {
public:
/**
* Create the CUDA implementation of the kernel with the given name.
*
* @param name      the name of the kernel to create
* @param platform  the Platform the new kernel is created for
* @param context   the context the kernel will belong to
* @return a newly allocated KernelImpl; the implementation throws an
*         OpenMMException when the name is not one it recognizes
*/
OPENMMCUDA_EXPORT KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELFACTORY_H_*/
#ifndef OPENMM_CUDAPLATFORM_H_
#define OPENMM_CUDAPLATFORM_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/Platform.h"
#include "windowsExportCuda.h"
// Forward declaration only: this header handles the GPU context purely through pointers.
struct _gpuContext;
namespace OpenMM {
/**
* This Platform subclass uses CUDA implementations of the OpenMM kernels to run on NVidia GPUs.
*/
class OPENMMCUDA_EXPORT CudaPlatform : public Platform {
public:
class PlatformData;
CudaPlatform();
/**
* Get the name of this platform, which is always "Cuda".
*/
const std::string& getName() const {
static const std::string name = "Cuda";
return name;
}
/**
* Get the speed value reported for this platform (the constant 50).
* NOTE(review): presumably a relative estimate used to rank platforms - confirm against Platform::getSpeed().
*/
double getSpeed() const {
return 50;
}
/**
* Get whether this platform supports double precision arithmetic.
*/
bool supportsDoublePrecision() const;
/**
* Get the value of a platform-specific property for a Context.
*
* @param context   the Context to query
* @param property  the name of the property
*/
const std::string& getPropertyValue(const Context& context, const std::string& property) const;
/**
* Set the value of a platform-specific property for a Context.
*
* @param context   the Context to modify
* @param property  the name of the property
* @param value     the value to set
*/
void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
/**
* Called when a context is created on this platform.
*
* @param context     the newly created context
* @param properties  platform-specific property values supplied for the context
*/
void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
/**
* Called when a context that was created on this platform is destroyed.
*/
void contextDestroyed(ContextImpl& context) const;
/**
* This is the name of the parameter for selecting which CUDA device to use.
*/
static const std::string& CudaDevice() {
static const std::string key = "CudaDevice";
return key;
}
/**
* This is the name of the parameter for selecting whether CUDA should sync or spin loop while waiting for results.
*/
static const std::string& CudaUseBlockingSync() {
static const std::string key = "CudaUseBlockingSync";
return key;
}
};
/**
* Per-context data of the CUDA platform: the GPU context pointer plus bookkeeping
* fields, all exposed as public members.
*/
class CudaPlatform::PlatformData {
public:
/**
* Create the platform data for a GPU context.
*
* @param gpu  the GPU context this data belongs to
*/
OPENMMCUDA_EXPORT PlatformData(_gpuContext* gpu);
// The GPU context pointer handed to the constructor.
_gpuContext* gpu;
// NOTE(review): the flags and counters below look like state shared between the
// CUDA kernels (which forces exist, step/time bookkeeping); their exact meaning
// is not visible here - confirm against CudaKernels.
bool removeCM;
bool hasBonds, hasAngles, hasPeriodicTorsions, hasRB, hasNonbonded, hasCustomNonbonded;
int nonbondedMethod, customNonbondedMethod;
int cmMotionFrequency;
int stepCount, computeForceCount;
double time, ewaldSelfEnergy, dispersionCoefficient;
// Property values recorded for the context, keyed by property name.
std::map<std::string, std::string> propertyValues;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAPLATFORM_H_*/
#ifndef OPENMM_WINDOWSEXPORTCUDA_H_
#define OPENMM_WINDOWSEXPORTCUDA_H_
/*
* Shared libraries are messy in Visual Studio. We have to distinguish three
* cases:
* (1) this header is being used to build the OpenMM shared library
* (dllexport)
* (2) this header is being used by a *client* of the OpenMM shared
* library (dllimport)
* (3) we are building the OpenMM static library, or the client is
* being compiled with the expectation of linking with the
* OpenMM static library (nothing special needed)
* In the CMake script for building this library, we define one of the symbols
* OPENMMCUDA_BUILDING_{SHARED|STATIC}_LIBRARY
* Client code normally has no special symbol defined, in which case we'll
* assume it wants to use the shared library. However, if the client defines
* the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
* that the client code can be linked with static libraries. Note that
* the client symbol is not library dependent, while the library symbols
* affect only the OpenMM library, meaning that other libraries can
* be clients of this one. However, we are assuming all-static or all-shared.
*/
/*
 * Definition of OPENMMCUDA_EXPORT.
 *
 * With Visual C++ it expands to dllexport while the CUDA shared library is
 * being built, to nothing for static builds and static clients, and to
 * dllimport for every other client. On all other compilers it is empty.
 *
 * A client may request static linkage either with the library-specific symbol
 * OPENMMCUDA_USE_STATIC_LIBRARIES or with the library-independent symbol
 * OPENMM_USE_STATIC_LIBRARIES; both are honoured.
 */
#ifdef _MSC_VER
    // We don't want to hear about how sprintf is "unsafe".
    #pragma warning(disable:4996)
    // Keep MS VC++ quiet about lack of dll export of private members.
    #pragma warning(disable:4251)
    #if defined(OPENMMCUDA_BUILDING_SHARED_LIBRARY)
        #define OPENMMCUDA_EXPORT __declspec(dllexport)
    #elif defined(OPENMMCUDA_BUILDING_STATIC_LIBRARY) || defined(OPENMMCUDA_USE_STATIC_LIBRARIES) || defined(OPENMM_USE_STATIC_LIBRARIES)
        #define OPENMMCUDA_EXPORT
    #else
        #define OPENMMCUDA_EXPORT __declspec(dllimport) // i.e., a client of a shared library
    #endif
#else
    #define OPENMMCUDA_EXPORT // Linux, Mac
#endif
#endif // OPENMM_WINDOWSEXPORTCUDA_H_
#
# CUDA-specific include paths, link paths and source files.
#
# INCLUDE(${FINDCUDA_DIR}/FindCuda.cmake)
include_directories(${CUDA_INCLUDE})
link_directories(${CUDA_TARGET_LINK})

# Add the .cu files found in src/ and one directory level below it, and make
# each subdirectory's include/ and src/ folders visible to the CUDA compiler.
foreach(subdir ${OPENMM_SOURCE_SUBDIRS})
    file(GLOB src_files ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src/*.cu ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src/*/*.cu)
    list(APPEND SOURCE_FILES ${src_files})
    cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include)
    cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src)
endforeach()
cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/openmmapi/include)

# Name of the main OpenMM library to link against; Unix debug builds use the
# "_d" variant.
set(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
if(UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    set(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
endif()
# Declare the shared CUDA plugin library.
#
# NVCC doesn't know how to build universal binaries, so when both i386 and
# x86_64 are requested on Apple platforms the library is built once per
# architecture and the two results are merged with lipo afterwards.
if(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES .*i386.* AND CMAKE_OSX_ARCHITECTURES MATCHES .*x86_64.*)
    set(BASE_FLAGS ${CUDA_NVCC_FLAGS})

    # 32-bit build.
    set(CMAKE_OSX_ARCHITECTURES i386)
    set(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m32)
    cuda_add_library("${SHARED_TARGET}32" SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
    target_link_libraries(${SHARED_TARGET}32 ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
    set_target_properties(${SHARED_TARGET}32 PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")

    # 64-bit build; it depends on the 32-bit one so both exist before merging.
    set(CMAKE_OSX_ARCHITECTURES x86_64)
    set(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m64)
    cuda_add_library(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
    target_link_libraries(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
    set_target_properties(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
    add_dependencies(${SHARED_TARGET} "${SHARED_TARGET}32")

    # Join them into a single universal binary.
    add_custom_command(
        TARGET ${SHARED_TARGET}
        POST_BUILD
        COMMAND /usr/bin/lipo lib${SHARED_TARGET}.dylib lib${SHARED_TARGET}32.dylib -create -output lib${SHARED_TARGET}.dylib
        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
        COMMENT "Creating universal binary")
else()
    # Single-architecture build.
    cuda_add_library(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
    target_link_libraries(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
    set_target_properties(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
endif()

# The plugin library is installed under /lib/plugins.
install_targets(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaForceInfo.h"
using namespace OpenMM;
using namespace std;
/**
 * Default implementation: every pair of particles is reported as identical.
 */
bool CudaForceInfo::areParticlesIdentical(int /*particle1*/, int /*particle2*/) {
    return true;
}
/**
 * Default implementation: the force defines no particle groups.
 */
int CudaForceInfo::getNumParticleGroups() {
    return 0;
}
/**
 * Default implementation: does nothing and leaves the output vector untouched.
 */
void CudaForceInfo::getParticlesInGroup(int /*index*/, vector<int>& /*particles*/) {
}
/**
 * Default implementation: every pair of groups is reported as identical.
 */
bool CudaForceInfo::areGroupsIdentical(int /*group1*/, int /*group2*/) {
    return true;
}
#ifndef OPENMM_CUDAFORCEINFO_H_
#define OPENMM_CUDAFORCEINFO_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include "windowsExportCuda.h"
#include <vector>
namespace OpenMM {
/**
* This class is used by the Cuda implementation of a Force class to convey information
* about the behavior and requirements of that force.
*/
class CudaForceInfo {
public:
CudaForceInfo() {
}
virtual ~CudaForceInfo() {
}
/**
* Get whether or not two particles have identical force field parameters.
*/
virtual OPENMM_EXPORT bool areParticlesIdentical(int particle1, int particle2);
/**
* Get the number of particle groups defined by this force.
*/
virtual OPENMM_EXPORT int getNumParticleGroups();
/**
* Get the list of particles in a particular group.
*/
virtual OPENMM_EXPORT void getParticlesInGroup(int index, std::vector<int>& particles);
/**
* Get whether two particle groups are identical.
*/
virtual OPENMM_EXPORT bool areGroupsIdentical(int group1, int group2);
};
} // namespace OpenMM
#endif /*OPENMM_CUDAFORCEINFO_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
/**
 * Create the CUDA kernel registered under the given name.
 *
 * The platform data stored on the context is handed to every kernel that needs
 * it; kernels for individual forces additionally receive the context's System.
 *
 * @param name      the name of the kernel to create
 * @param platform  the Platform the kernel is created for
 * @param context   the context the kernel will belong to
 * @return a newly allocated kernel implementation
 * @throws OpenMMException if the name matches none of the kernels listed here
 */
OPENMMCUDA_EXPORT KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    CudaPlatform::PlatformData& platformData = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());

    // General bookkeeping kernels.
    if (name == CalcForcesAndEnergyKernel::Name()) {
        return new CudaCalcForcesAndEnergyKernel(name, platform, platformData);
    }
    if (name == UpdateStateDataKernel::Name()) {
        return new CudaUpdateStateDataKernel(name, platform, platformData);
    }
    if (name == ApplyConstraintsKernel::Name()) {
        return new CudaApplyConstraintsKernel(name, platform, platformData);
    }
    if (name == VirtualSitesKernel::Name()) {
        return new CudaVirtualSitesKernel(name, platform);
    }

    // Bonded force kernels.
    if (name == CalcHarmonicBondForceKernel::Name()) {
        return new CudaCalcHarmonicBondForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcCustomBondForceKernel::Name()) {
        return new CudaCalcCustomBondForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcHarmonicAngleForceKernel::Name()) {
        return new CudaCalcHarmonicAngleForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcCustomAngleForceKernel::Name()) {
        return new CudaCalcCustomAngleForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcPeriodicTorsionForceKernel::Name()) {
        return new CudaCalcPeriodicTorsionForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcRBTorsionForceKernel::Name()) {
        return new CudaCalcRBTorsionForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcCMAPTorsionForceKernel::Name()) {
        return new CudaCalcCMAPTorsionForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcCustomTorsionForceKernel::Name()) {
        return new CudaCalcCustomTorsionForceKernel(name, platform, platformData, context.getSystem());
    }

    // Nonbonded, implicit solvent and external force kernels.
    if (name == CalcNonbondedForceKernel::Name()) {
        return new CudaCalcNonbondedForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcCustomNonbondedForceKernel::Name()) {
        return new CudaCalcCustomNonbondedForceKernel(name, platform, platformData, context.getSystem());
    }
    if (name == CalcGBSAOBCForceKernel::Name()) {
        return new CudaCalcGBSAOBCForceKernel(name, platform, platformData);
    }
    if (name == CalcGBVIForceKernel::Name()) {
        return new CudaCalcGBVIForceKernel(name, platform, platformData);
    }
    if (name == CalcCustomExternalForceKernel::Name()) {
        return new CudaCalcCustomExternalForceKernel(name, platform, platformData, context.getSystem());
    }

    // Integrator kernels.
    if (name == IntegrateVerletStepKernel::Name()) {
        return new CudaIntegrateVerletStepKernel(name, platform, platformData);
    }
    if (name == IntegrateLangevinStepKernel::Name()) {
        return new CudaIntegrateLangevinStepKernel(name, platform, platformData);
    }
    if (name == IntegrateBrownianStepKernel::Name()) {
        return new CudaIntegrateBrownianStepKernel(name, platform, platformData);
    }
    if (name == IntegrateVariableVerletStepKernel::Name()) {
        return new CudaIntegrateVariableVerletStepKernel(name, platform, platformData);
    }
    if (name == IntegrateVariableLangevinStepKernel::Name()) {
        return new CudaIntegrateVariableLangevinStepKernel(name, platform, platformData);
    }

    // Thermostat, barostat and miscellaneous kernels.
    if (name == ApplyAndersenThermostatKernel::Name()) {
        return new CudaApplyAndersenThermostatKernel(name, platform, platformData);
    }
    if (name == ApplyMonteCarloBarostatKernel::Name()) {
        return new CudaApplyMonteCarloBarostatKernel(name, platform, platformData);
    }
    if (name == CalcKineticEnergyKernel::Name()) {
        return new CudaCalcKineticEnergyKernel(name, platform, platformData);
    }
    if (name == RemoveCMMotionKernel::Name()) {
        return new CudaRemoveCMMotionKernel(name, platform, platformData);
    }

    // No kernel matched the requested name.
    const std::string message = std::string("Tried to create kernel with illegal kernel name '") + name + "'";
    throw OpenMMException(message.c_str());
}
This diff is collapsed.
This diff is collapsed.
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaPlatform.h"
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "kernels/gputypes.h"
#include "openmm/Context.h"
#include "openmm/OpenMMException.h"
#include "openmm/System.h"
#include <sstream>
using namespace OpenMM;
using std::map;
using std::string;
using std::stringstream;
/**
 * Plugin entry point: registers a new CudaPlatform with the Platform registry,
 * but only when gpuIsAvailable() reports that a GPU can be used.
 */
extern "C" OPENMMCUDA_EXPORT void registerPlatforms() {
    if (!gpuIsAvailable())
        return;
    Platform::registerPlatform(new CudaPlatform());
}
/**
 * Construct the platform: a single CudaKernelFactory is registered for every
 * kernel name the platform handles, then the two platform properties are
 * declared and given their default values.
 */
CudaPlatform::CudaPlatform() {
    // Kernel names, in registration order.
    const string kernelNames[] = {
        CalcForcesAndEnergyKernel::Name(),
        UpdateStateDataKernel::Name(),
        ApplyConstraintsKernel::Name(),
        VirtualSitesKernel::Name(),
        CalcHarmonicBondForceKernel::Name(),
        CalcCustomBondForceKernel::Name(),
        CalcHarmonicAngleForceKernel::Name(),
        CalcCustomAngleForceKernel::Name(),
        CalcPeriodicTorsionForceKernel::Name(),
        CalcRBTorsionForceKernel::Name(),
        CalcCMAPTorsionForceKernel::Name(),
        CalcCustomTorsionForceKernel::Name(),
        CalcNonbondedForceKernel::Name(),
        CalcCustomNonbondedForceKernel::Name(),
        CalcGBSAOBCForceKernel::Name(),
        CalcGBVIForceKernel::Name(),
        CalcCustomExternalForceKernel::Name(),
        IntegrateVerletStepKernel::Name(),
        IntegrateLangevinStepKernel::Name(),
        IntegrateBrownianStepKernel::Name(),
        IntegrateVariableVerletStepKernel::Name(),
        IntegrateVariableLangevinStepKernel::Name(),
        ApplyAndersenThermostatKernel::Name(),
        ApplyMonteCarloBarostatKernel::Name(),
        CalcKineticEnergyKernel::Name(),
        RemoveCMMotionKernel::Name()
    };
    const int numKernels = sizeof(kernelNames) / sizeof(kernelNames[0]);

    // One factory instance serves all of the kernels.
    CudaKernelFactory* factory = new CudaKernelFactory();
    for (int k = 0; k < numKernels; k++)
        registerKernelFactory(kernelNames[k], factory);

    // Properties understood by this platform, followed by their defaults.
    platformProperties.push_back(CudaDevice());
    platformProperties.push_back(CudaUseBlockingSync());
    setPropertyDefaultValue(CudaDevice(), "0");
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
}
/**
* Report whether this platform supports double precision; this implementation always returns false.
*/
bool CudaPlatform::supportsDoublePrecision() const {
return false;
}
/**
 * Look up the value of a platform property for a context.
 *
 * Values recorded in the context's PlatformData take precedence; any property
 * not found there is resolved by the base class implementation.
 *
 * @param context   the Context to query
 * @param property  the name of the property
 * @return the value of the property
 */
const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
    const ContextImpl& contextImpl = getContextImpl(context);
    const PlatformData* platformData = reinterpret_cast<const PlatformData*>(contextImpl.getPlatformData());
    map<string, string>::const_iterator entry = platformData->propertyValues.find(property);
    if (entry == platformData->propertyValues.end())
        return Platform::getPropertyValue(context, property);
    return entry->second;
}
/**
* Set the value of a platform property for a context. The body is empty, so calling
* this has no effect.
* NOTE(review): presumably the CUDA properties are fixed when the context is created - confirm.
*/
void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
}
/**
 * Set up the GPU for a newly created context.
 *
 * Systems containing virtual sites, or forces assigned to a force group other
 * than 0, are rejected. The device index and blocking-sync flag are taken from
 * the supplied properties, falling back to the platform defaults, and the
 * resulting GPU context is stored on the context wrapped in a PlatformData.
 *
 * @param context     the context being created
 * @param properties  property values supplied for this context
 * @throws OpenMMException if the System uses virtual sites or force groups
 */
void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
    System& system = context.getSystem();
    for (int particle = 0; particle < system.getNumParticles(); particle++) {
        if (system.isVirtualSite(particle))
            throw OpenMMException("CudaPlatform does not support virtual sites");
    }
    for (int force = 0; force < system.getNumForces(); force++) {
        if (system.getForce(force).getForceGroup() != 0)
            throw OpenMMException("CudaPlatform does not support force groups");
    }

    // Device index: an explicit property wins over the platform default.
    map<string, string>::const_iterator deviceEntry = properties.find(CudaDevice());
    const string& devicePropValue = (deviceEntry == properties.end() ?
            getPropertyDefaultValue(CudaDevice()) : deviceEntry->second);
    unsigned int device = 0;
    if (!devicePropValue.empty()) {
        stringstream deviceParser(devicePropValue);
        deviceParser >> device;
    }

    // Blocking sync is enabled only when the value is exactly "true".
    map<string, string>::const_iterator syncEntry = properties.find(CudaUseBlockingSync());
    const string& blockingSync = (syncEntry == properties.end() ?
            getPropertyDefaultValue(CudaUseBlockingSync()) : syncEntry->second);
    const bool useBlockingSync = (blockingSync == "true");

    const int numParticles = system.getNumParticles();
    _gpuContext* gpu = (_gpuContext*) gpuInit(numParticles, device, useBlockingSync);
    context.setPlatformData(new PlatformData(gpu));
}
/**
 * Release the resources of a context: the GPU context is shut down first,
 * then the PlatformData object that referenced it is deleted.
 *
 * @param context  the context being destroyed
 */
void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    PlatformData* platformData = reinterpret_cast<PlatformData*>(context.getPlatformData());
    gpuShutDown(platformData->gpu);
    delete platformData;
}
/**
 * Create the platform data for a GPU context.
 *
 * Every scalar member is given a defined starting value, including
 * cmMotionFrequency, and the initializers are listed in the order in which the
 * members are declared. The device index and blocking-sync setting read from
 * the GPU context are recorded as the context's property values.
 *
 * @param gpu  the GPU context this data belongs to; it is dereferenced here
 */
CudaPlatform::PlatformData::PlatformData(_gpuContext* gpu) :
        gpu(gpu),
        removeCM(false),
        hasBonds(false), hasAngles(false), hasPeriodicTorsions(false), hasRB(false),
        hasNonbonded(false), hasCustomNonbonded(false),
        nonbondedMethod(0), customNonbondedMethod(0),
        cmMotionFrequency(0),
        stepCount(0), computeForceCount(0),
        time(0.0), ewaldSelfEnergy(0.0), dispersionCoefficient(0.0) {
    stringstream device;
    device << gpu->device;
    propertyValues[CudaPlatform::CudaDevice()] = device.str();
    propertyValues[CudaPlatform::CudaUseBlockingSync()] = (gpu->useBlockingSync ? "true" : "false");
}
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include "vector_types.h"
#include "bbsort.h"
#include "bbsort_kernel.cu"
// Extract the value used for an int2 element: its y component.
int getValue(int2 v){
return v.y;
}
// Generic overload: any other element type is returned unchanged.
template <typename T>
T getValue(T v){
return v;
}
// CUDA_SAFE_CALL_NO_SYNC(call): evaluates `call`, and if the result is not
// cudaSuccess prints the file, line and CUDA error string to stderr and
// terminates the process with exit(EXIT_FAILURE).
// NOTE(review): the macro expands to a bare { ... } block and CUDA_SAFE_CALL adds
// a trailing ';', so neither is safe as the unbraced body of an if/else.
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
// CUDA_SAFE_CALL(call): forwards to CUDA_SAFE_CALL_NO_SYNC.
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
/**
 * Walk the slices in order and pack consecutive slices into buckets, starting
 * a new bucket whenever the running element total exceeds BLOCK_SIZE.
 *
 * @param sliceCount           number of elements in each slice (input)
 * @param sliceSize            number of slices
 * @param bucketOffset         receives the starting offset of each bucket
 * @param bucketOfSlice        receives the bucket index assigned to each slice
 * @param bucketSizes          receives the element count of each bucket
 * @param sliceOffsetInBucket  receives each slice's offset inside its bucket
 * @param bucketsCount         in/out bucket counter; used as the index of the
 *                             bucket being filled and incremented once more
 *                             before returning
 * @param step                 not used by this function
 * @return true if any single slice holds more than BLOCK_SIZE elements
 */
bool assignSliceToBuckets(unsigned int* sliceCount,int sliceSize,unsigned int* bucketOffset,unsigned int* bucketOfSlice,unsigned int* bucketSizes,unsigned int* sliceOffsetInBucket,int& bucketsCount,float step)
{
    bool sawOversizedSlice = false;
    int runningTotal = 0;
    bucketOffset[0] = 0;

    for (int slice = 0; slice < sliceSize; ++slice) {
        if (sliceCount[slice] > BLOCK_SIZE) {
            sawOversizedSlice = true;
        }

        // Tentatively add this slice to the bucket currently being filled.
        runningTotal += sliceCount[slice];
        bucketOfSlice[slice] = bucketsCount;
        bucketSizes[bucketsCount] = runningTotal;
        sliceOffsetInBucket[slice] = runningTotal - sliceCount[slice];

        if (runningTotal <= BLOCK_SIZE) {
            continue;
        }

        if (slice == 0) {
            // The very first slice already exceeds the limit: it stays in the
            // current bucket and the next slice starts a fresh one.
            bucketOffset[bucketsCount+1] = bucketOffset[bucketsCount] + runningTotal;
            sliceOffsetInBucket[slice] = 0;
            runningTotal = 0;
            bucketsCount++;
        } else {
            // Move this slice out of the current bucket and make it the first
            // slice of the next one.
            bucketOfSlice[slice] = bucketsCount+1;
            bucketSizes[bucketsCount] -= sliceCount[slice];
            sliceOffsetInBucket[slice] = 0;
            bucketOffset[bucketsCount+1] = bucketOffset[bucketsCount] + runningTotal - sliceCount[slice];
            bucketsCount++;
            runningTotal = sliceCount[slice];
            bucketSizes[bucketsCount] = runningTotal;
        }
    }

    bucketsCount++;
    return sawOversizedSlice;
}
/**
 * Compute the maximum or minimum value of a device array by repeatedly
 * launching the reduceMaxD / reduceMinD kernel with a shrinking step, then
 * copying the first element back to the host.
 *
 * The kernels receive dData as a writable buffer, so callers should pass a
 * scratch copy of their data.
 *
 * @param dData   device array holding the elements
 * @param size    number of elements in dData
 * @param result  receives the value extracted (via getValue) from the reduced
 *                element, converted to float; non-integer values are no longer
 *                truncated to an integer first
 * @param isMax   true to compute the maximum, false to compute the minimum
 */
template <typename T>
void reduceMinMax(T* dData,int size,float& result,bool isMax)
{
    // The first pass covers half of the array, rounded up.
    int step = (size%2==0) ? (size/2) : (size/2 + 1);
    int blockSize = BLOCK_SIZE;
    int blockCount;
    int length = size;
    T originalResult;
    while (step > 0)
    {
        // Number of blocks needed to cover `step` threads.
        if (step%BLOCK_SIZE == 0)
            blockCount = step/BLOCK_SIZE;
        else
            blockCount = step/BLOCK_SIZE + 1;
        if (isMax)
            reduceMaxD<<<blockCount,blockSize>>>(dData,step,length);
        else
            reduceMinD<<<blockCount,blockSize>>>(dData,step,length);
        length = step;
        // Halve the step (rounding up) until a step of 1 has been processed.
        step = (step%2==0 || step==1) ? (step/2) : (step/2 + 1);
    }
    CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
    // Convert straight to float: an intermediate int cast would drop the
    // fractional part of floating point minima and maxima.
    result = (float) getValue(originalResult);
}
// Classify how disordered a device array is.
//
// calDifferenceD fills a temporary float buffer from dData, the buffer is summed
// with repeated reduceSumD launches, and the total is compared against
// (maxValue - minValue) * size / 10: a smaller sum yields NEARLY_SORTED,
// otherwise DISORDERLY.
//
//   dData      device array to examine
//   size       number of elements in dData
//   maxValue   largest value in the data, as supplied by the caller
//   minValue   smallest value in the data, as supplied by the caller
//   listOrder  receives NEARLY_SORTED or DISORDERLY
template <typename T>
void evaluateDisorder(T* dData,int size,float maxValue, float minValue, int& listOrder)
{
int blockCount;
// NOTE(review): the block count is derived from size-1 here, presumably because
// calDifferenceD works on adjacent pairs - confirm against the kernel.
if((size-1) % BLOCK_SIZE ==0)blockCount=size/BLOCK_SIZE;
else blockCount=size/BLOCK_SIZE+1;
// Temporary device buffer; freed at the end of this function.
float* dDiffData;
CUDA_SAFE_CALL(cudaMalloc((void**)&dDiffData, sizeof(float) * size));
calDifferenceD<<<blockCount,BLOCK_SIZE,(BLOCK_SIZE)*sizeof(T)>>>(dData,dDiffData,size);
float sum=0;
int step;
// The first reduction pass covers half of the buffer, rounded up.
step=(size%2==0)?
(size/2):(size/2 +1);
int blockSize=BLOCK_SIZE;
int length=size;
while(step > 0)
{
if(step%BLOCK_SIZE==0)
blockCount=step/BLOCK_SIZE;
else
blockCount=step/BLOCK_SIZE+1;
reduceSumD<<<blockCount,blockSize>>>(dDiffData,step,length);
length=step;
// Halve the step (rounding up) until a step of 1 has been processed.
step=(step%2==0 || step==1)?(step/2):(step/2 +1);
}
// The reduced total is read from the first element of the buffer.
CUDA_SAFE_CALL(cudaMemcpy(&sum, dDiffData, sizeof(float), cudaMemcpyDeviceToHost));
if( sum < (maxValue - minValue) * size / 10)
listOrder=NEARLY_SORTED;
else
listOrder=DISORDERLY;
CUDA_SAFE_CALL(cudaFree(dDiffData));
}
/**
 * Sort the device array dData in place by key.
 *
 * Steps:
 *   1. find the minimum and maximum key (reduceMinMax on a scratch copy);
 *   2. optionally classify the input (evaluateDisorder) when listOrder is
 *      AUTO_EVALUATE;
 *   3. count the elements per slice on the device, group slices into buckets
 *      on the host (assignSliceToBuckets) and upload the bucket tables;
 *   4. scatter every element into its bucket (assignElementToBucketD) and
 *      sort each bucket with bitonicSortD, one thread block per bucket;
 *   5. if assignSliceToBuckets reported overflow, recursively sort every
 *      bucket that holds more than BLOCK_SIZE elements.
 *
 * dData      device array to sort
 * size       number of elements in dData
 * listOrder  DISORDERLY, NEARLY_SORTED or AUTO_EVALUATE
 */
template <typename T>
void bbSortBody(T* dData,int size,int listOrder/*,float sliceStep,int sliceSize, T* dTmpData, float minValue,float maxValue*/)
{
    // Zero or one element is already sorted. Returning here also avoids
    // reading dData[0] and dividing by size when size is 0.
    if(size < 2)
        return ;

    float minValue,maxValue;
    T* dTmpData;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dTmpData, sizeof(T) * size));
    // reduceMinMax overwrites its input, so each reduction runs on a fresh copy.
    CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
    reduceMinMax(dTmpData,size,maxValue,true);
    CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
    reduceMinMax(dTmpData,size,minValue,false);
    // All keys are equal: nothing to do.
    if(minValue == maxValue)
    {
        CUDA_SAFE_CALL(cudaFree(dTmpData));
        return ;
    }
    if(listOrder == AUTO_EVALUATE )
    {
        evaluateDisorder(dData,size,maxValue,minValue,listOrder);
    }

    // Key width of one slice and the number of slices (with some slack).
    float sliceStep = (float) (50.0*((double)(maxValue-minValue)/(double)size));
    int sliceSize = (int) ((maxValue-minValue)/sliceStep + 10);
    int blockCount;
    if(size%BLOCK_SIZE==0)blockCount=size/BLOCK_SIZE;
    else blockCount=size/BLOCK_SIZE+1;

    unsigned int* dSliceCounts;
    unsigned int* dOffsetInSlice;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dOffsetInSlice, sizeof(unsigned int) * size));
    CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceCounts, sizeof(unsigned int) * sliceSize));
    CUDA_SAFE_CALL(cudaMemset(dSliceCounts,0, sizeof(int) * sliceSize));
    if(listOrder == NEARLY_SORTED)
    {
        assignElementToSlicesNearlySortedD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize,blockCount);
    }
    else
        assignElementToSlicesD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize);

    unsigned int* hSliceCounts=new unsigned int[sliceSize];
    CUDA_SAFE_CALL(cudaMemcpy(hSliceCounts, dSliceCounts, sizeof(unsigned int) * sliceSize, cudaMemcpyDeviceToHost));

    // Capacity of the host bucket tables. size/100 alone is 0 for inputs of
    // fewer than 100 elements, yet assignSliceToBuckets writes
    // bucketOffset[bucketsCount+1] and bucketSizes[bucketsCount].
    // NOTE(review): 2*sliceSize+2 assumes at most two buckets are opened per
    // slice - confirm against the full body of assignSliceToBuckets.
    int looseBucketSize=size/100;
    if(looseBucketSize < 2*sliceSize+2)
        looseBucketSize=2*sliceSize+2;
    unsigned int* hBucketOffsets=new unsigned int[looseBucketSize];
    unsigned int* hBucketSizes=new unsigned int[looseBucketSize];
    unsigned int* hBucketOfSlices=new unsigned int[sliceSize];
    unsigned int* hSliceOffsetInBucket=new unsigned int[sliceSize];
    int bucketsCount=0;
    memset(hBucketSizes,0,sizeof(int) * looseBucketSize);
    memset(hSliceOffsetInBucket,0,sizeof(unsigned int) * sliceSize);
    bool overflow;
    overflow = assignSliceToBuckets(hSliceCounts,sliceSize,hBucketOffsets,hBucketOfSlices,hBucketSizes,hSliceOffsetInBucket,bucketsCount,sliceStep);

    // Upload the bucket tables and expose them to the kernels as textures.
    unsigned int* dBucketOffsets;
    unsigned int* dBucketSizes;
    unsigned int* dBucketOfSlices;
    unsigned int* dSliceOffsetInBucket;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOfSlices, sizeof(unsigned int) * sliceSize));
    CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceOffsetInBucket, sizeof(unsigned int) * sliceSize));
    CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOffsets, sizeof(unsigned int) * bucketsCount));
    CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketSizes, sizeof(unsigned int) * bucketsCount));
    CUDA_SAFE_CALL(cudaMemcpy(dBucketOfSlices, hBucketOfSlices, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(dSliceOffsetInBucket, hSliceOffsetInBucket, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(dBucketOffsets, hBucketOffsets, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(dBucketSizes, hBucketSizes, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
    cudaBindTexture(0,tBucketOffsets,dBucketOffsets);
    cudaBindTexture(0,tBucketSizes,dBucketSizes);
    cudaBindTexture(0,tBucketOfSlices,dBucketOfSlices);
    cudaBindTexture(0,tSliceOffsetInBucket,dSliceOffsetInBucket);

    // Scatter into buckets (dTmpData), sort each bucket, copy the result back.
    assignElementToBucketD<<<blockCount, BLOCK_SIZE>>>(dData,dTmpData,size,dOffsetInSlice,minValue,sliceStep);
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    bitonicSortD<<<bucketsCount, BLOCK_SIZE, sizeof(T) * BLOCK_SIZE>>>(dTmpData);
    CUDA_SAFE_CALL(cudaMemcpy(dData, dTmpData, sizeof(T) * size, cudaMemcpyDeviceToDevice));

    // A bucket larger than one thread block cannot be sorted by a single
    // bitonicSortD block; sort those ranges recursively.
    if(overflow){
        for(int i=0;i<bucketsCount;i++)
        {
            if(hBucketSizes[i] > BLOCK_SIZE)
            {
                bbSort(dData + hBucketOffsets[i],hBucketSizes[i],listOrder);
            }
        }
    }

    // These were allocated with new[], so they must be released with delete[].
    delete[] hBucketOffsets;
    delete[] hBucketOfSlices;
    delete[] hSliceCounts;
    delete[] hBucketSizes;
    delete[] hSliceOffsetInBucket;
    CUDA_SAFE_CALL(cudaFree(dOffsetInSlice));
    CUDA_SAFE_CALL(cudaFree(dSliceCounts));
    CUDA_SAFE_CALL(cudaFree(dTmpData));
    cudaUnbindTexture( tBucketSizes );
    CUDA_SAFE_CALL(cudaFree(dBucketSizes));
    cudaUnbindTexture( tBucketOffsets );
    CUDA_SAFE_CALL(cudaFree(dBucketOffsets));
    cudaUnbindTexture( tBucketOfSlices );
    CUDA_SAFE_CALL(cudaFree(dBucketOfSlices));
    cudaUnbindTexture( tSliceOffsetInBucket );
    CUDA_SAFE_CALL(cudaFree(dSliceOffsetInBucket));
}
/************************************************************************************
Uncomment your desired function definition here.
Please note that only one type of bbSort() can be used in a program, because the NVCC compiler does not support overriding kernel functions.
float, double, int, uint, short, and ushort are originally supported; if you want to use bbSort() with double,
please follow the readme.txt.
Also note that you need compute capability 1.3 (use arch=sm_13 in your compile command) to sort doubles.
*************************************************************************************/
/**
 * Exported specialization of bbSort for int2 elements; it forwards to the
 * generic implementation in bbSortBody.
 */
template<>
void OPENMMCUDA_EXPORT bbSort(int2* dData,int size,int listOrder)
{
    bbSortBody<int2>(dData,size,listOrder);
}
//void bbSort(float* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//void bbSort(int* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//
//void bbSort(unsigned int* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//
//void bbSort(double* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#ifndef _BBSORT_H_
#define _BBSORT_H_
#include "windowsExportCuda.h"
// Threads per block used for the bbsort kernel launches.
#define BLOCK_SIZE 512
// Values accepted for the listOrder argument of bbSort().
// AUTO_EVALUATE asks the sorter to classify the input itself as
// NEARLY_SORTED or DISORDERLY before sorting.
#define DISORDERLY 0
#define NEARLY_SORTED 1
#define AUTO_EVALUATE 2
// Sorts the device array dData (number elements). Only declared here;
// the available element types are the explicit specializations provided
// by the implementation file.
template <typename T>
void OPENMMCUDA_EXPORT bbSort(T* dData,int number,int listOrder=AUTO_EVALUATE);
#endif // _BBSORT_H_
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#ifndef _BBSORT_KERNEL_H_
#define _BBSORT_KERNEL_H_
#include "bbsort.h"
#include "math_constants.h"
// Texture references for the bucket tables; the kernels below read them
// with tex1Dfetch.
// tBucketSizes / tBucketOffsets are indexed by bucket,
// tBucketOfSlices / tSliceOffsetInBucket are indexed by slice.
texture<unsigned int, 1, cudaReadModeElementType> tBucketSizes;
texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
// Sort key of an int2 element: the y component.
static __device__ int dGetValue(int2 v){
    const int key = v.y;
    return key;
}
// Sort key of any other element type: the element itself.
template <typename T>
static __device__ T dGetValue(T v){
    const T key = v;
    return key;
}
// Overwrite v with the padding element used for unused sort lanes (int2).
// NOTE(review): the key written to y is 0x4fffffff, not INT_MAX, so a real
// key above it would order after the padding - confirm the expected key range.
static __device__ void dPad(int2& v){
    v.y=0x4fffffff;
    v.x=0x3fffffff;
}
// Overwrite v with the padding value 0x7fffffff (generic element types).
template <typename T>
static __device__ void dPad(T & v){
    const int padding = 0x7fffffff;
    v=padding;
}
// One pass of the max reduction: every thread whose partner (index + step)
// lies inside [0, length) keeps the element with the larger key in its slot.
template <typename T>
__global__ static void reduceMaxD(T * dData,int step,int length)
{
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    const int partner = self + step;
    if(partner < length)
    {
        // The current element survives only when its key is strictly greater.
        if(!(dGetValue(dData[self]) > dGetValue(dData[partner])))
            dData[self] = dData[partner];
    }
}
// One pass of the min reduction: every thread whose partner (index + step)
// lies inside [0, length) keeps the element with the smaller key in its slot.
template <typename T>
__global__ static void reduceMinD(T * dData,int step,int length)
{
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    const int partner = self + step;
    if(partner < length)
    {
        // The current element survives only when its key is strictly smaller.
        if(!(dGetValue(dData[self]) < dGetValue(dData[partner])))
            dData[self] = dData[partner];
    }
}
// One pass of the sum reduction: adds element index + step onto element
// index for every index whose partner lies inside [0, length).
__global__ static void reduceSumD(float * dDiffData,int step,int length)
{
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    const int partner = self + step;
    if(partner < length)
        dDiffData[self] += dDiffData[partner];
}
// Writes |key[i+1] - key[i]| into dDiffData[i] for neighbouring elements that
// belong to the same thread block. The last thread of a block and the last
// element of the array have no in-block successor and get 0.
//
// dData      input elements
// dDiffData  output, one float per input element
// size       number of elements
//
// Requires blockDim.x * sizeof(T) bytes of dynamic shared memory.
template <typename T>
__global__ static void calDifferenceD(T * dData,float * dDiffData,int size)
{
    extern __shared__ T sData[];
    const unsigned int tid = threadIdx.x;
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = index < size;
    if(inRange)
        sData[tid]=dData[index];
    // Every thread of the block must reach the barrier, so out-of-range
    // threads may only leave after it.
    __syncthreads();
    if(!inRange)
        return ;
    // index + 1 < size: the successor slot in shared memory was actually
    // loaded (it is not in a partially filled last block).
    if(tid < blockDim.x -1 && index + 1 < size)
        dDiffData[index] = abs(dGetValue(sData[tid+1]) - dGetValue(sData[tid]));
    else
        dDiffData[index] =0;
}
// Exchange the values of a and b.
template <typename T>
__device__ inline void dSwap(T & a, T & b)
{
    T held = b;
    b = a;
    a = held;
}
// Sorts one bucket per thread block with a bitonic network in shared memory.
// The bucket's size and start offset are fetched from tBucketSizes and
// tBucketOffsets using the block index. Lanes beyond the bucket size are
// filled with dPad() values; only the first `count` lanes are written back.
//
// Requires BLOCK_SIZE * sizeof(T) bytes of dynamic shared memory; the network
// is hard-wired to BLOCK_SIZE lanes.
template <typename T>
__global__ static void bitonicSortD(T * datas)
{
    extern __shared__ T shared[];
    __shared__ unsigned int count;
    __shared__ unsigned int offset;
    const unsigned int lane = threadIdx.x;
    const unsigned int bucket = blockIdx.x;

    // One thread looks up the bucket description for the whole block.
    if(lane == 0)
    {
        count=tex1Dfetch(tBucketSizes,bucket);
        offset=tex1Dfetch(tBucketOffsets,bucket);
    }
    __syncthreads();

    const bool holdsData = lane < count;
    if(holdsData)
        shared[lane] = datas[lane+offset];
    else
        dPad(shared[lane]);
    __syncthreads();

    for (unsigned int k = 2; k <= BLOCK_SIZE; k *= 2)
    {
        for (unsigned int j = k / 2; j>0; j /= 2)
        {
            const unsigned int peer = lane ^ j;
            // Only the lower lane of each pair does the compare-exchange.
            if (peer > lane)
            {
                // (lane & k) == 0 marks an ascending run, otherwise descending.
                const bool ascending = (lane & k) == 0;
                const bool misordered = ascending
                    ? (dGetValue(shared[lane]) > dGetValue(shared[peer]))
                    : (dGetValue(shared[lane]) < dGetValue(shared[peer]));
                if (misordered)
                    dSwap(shared[lane], shared[peer]);
            }
            __syncthreads();
        }
    }

    if(holdsData)
        datas[lane+offset] = shared[lane];
}
// For every element: compute its slice from its key, atomically bump that
// slice's counter and remember the counter's previous value as the element's
// position inside the slice.
// NOTE(review): the atomicInc wrap limit is 0xFFFFFFF (seven F digits) -
// confirm whether 0xFFFFFFFF was intended. sliceSize is not used here.
template <typename T>
__global__ void assignElementToSlicesD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize)
{
    const unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
    // Same bounds test as before, inverted; the unsigned/int comparison is
    // kept in its original form on purpose.
    if(!(index > number-1))
    {
        const unsigned int slice=((dGetValue(dDatas[index]) - minValue)/ step);
        dOffsetInSlice[index] = atomicInc(dSliceCounts + slice,0xFFFFFFF);
    }
}
// Same work as assignElementToSlicesD, but the element index is
// blockIdx.x + blockCount * threadIdx.x, so the threads of one block touch
// elements that are blockCount apart instead of adjacent ones.
// NOTE(review): the atomicInc wrap limit is 0xFFFFFFF (seven F digits) -
// confirm whether 0xFFFFFFFF was intended. sliceSize is not used here.
template <typename T>
__global__ void assignElementToSlicesNearlySortedD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize,int blockCount)
{
    const unsigned int index= blockIdx.x + blockCount * threadIdx.x;
    // Same bounds test as before, inverted; the unsigned/int comparison is
    // kept in its original form on purpose.
    if(!(index > number-1))
    {
        const unsigned int slice=((dGetValue(dDatas[index]) - minValue)/ step);
        dOffsetInSlice[index] = atomicInc(dSliceCounts + slice,0xFFFFFFF);
    }
}
// Copies every element of dDatas to its final bucket position in dNewDatas:
// bucket start + start of the element's slice inside the bucket + the
// element's position inside the slice (dOffsetInSlice).
template <typename T>
__global__ void assignElementToBucketD(T* dDatas,T* dNewDatas,int number,unsigned int* dOffsetInSlice,float minValue,float step)
{
    const unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
    // Same bounds test as before, inverted; the unsigned/int comparison is
    // kept in its original form on purpose.
    if(!(index > number-1))
    {
        const unsigned int slice=((dGetValue(dDatas[index]) - minValue)/ step);
        const unsigned int bucket=tex1Dfetch(tBucketOfSlices,slice);
        unsigned int destination=tex1Dfetch(tBucketOffsets,bucket);
        destination+=tex1Dfetch(tSliceOffsetInBucket,slice);
        destination+=dOffsetInSlice[index];
        dNewDatas[destination] =dDatas[index];
    }
}
#endif // _BBSORT_KERNEL_H_
/* Code for CUDA stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variants of phase III, as they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: Imran Haque (ihaque@cs.stanford.edu)
Affiliation: Stanford University
License: Public Domain
*/
#include "cudaCompact.h"
typedef unsigned int T;
// Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk
// Sums the 128 entries of arr (one per thread) and returns the total to
// every thread. arr is overwritten with partial sums.
//
// Assumes exactly 128 threads per block, that all of them call this function,
// and that the caller has synchronized after filling arr.
//
// Every halving step is followed by a block-wide barrier instead of relying
// on the last 32 threads running in lockstep, and the total is read into a
// register before a final barrier so that callers may overwrite arr
// immediately after the call without racing against slower threads.
__device__ unsigned int sumReduce128(volatile unsigned int* arr) {
    for (unsigned int stride = 64; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) arr[threadIdx.x] += arr[threadIdx.x+stride];
        __syncthreads();
    }
    const unsigned int total = arr[0];
    // Nobody may touch arr again until every thread has read the total.
    __syncthreads();
    return total;
}
// Phase 1 kernel: block b counts the non-zero dgValid entries in its slice
// [b*eltsPerBlock, min(len, (b+1)*eltsPerBlock)) and stores the count in
// dgBlockCounts[b]. Expects 128 threads per block.
__global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) {
    __shared__ volatile unsigned int dsCount[128];
    const size_t sliceEnd = (blockIdx.x+1)*eltsPerBlock;
    // The last slice may extend past the end of the input; clamp it.
    const size_t ub = (len < sliceEnd) ? len : sliceEnd;

    // Per-thread tally, published to shared memory once the walk is done.
    unsigned int tally = 0;
    for (int base = blockIdx.x * eltsPerBlock; base < sliceEnd; base += blockDim.x) {
        if ((base + threadIdx.x) < ub && dgValid[base+threadIdx.x])
            tally++;
    }
    dsCount[threadIdx.x] = tally;
    __syncthreads();

    unsigned int blockCount = sumReduce128(dsCount);
    if (threadIdx.x == 0) dgBlockCounts[blockIdx.x] = blockCount;
}
// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
// Exclusive prefix scan over 128 elements
// Assumes 128 threads
// Taken from cuda SDK "scan" sample for naive scan, with small modifications
// Exclusive prefix sum over the 128 entries of `in`, one per thread.
// outAndTemp must hold 256 entries: its two halves are used as ping-pong
// buffers, and after the seven doubling steps the scan result sits in
// outAndTemp[0..127]. Returns the sum of all 128 inputs.
__device__ int exclusivePrescan128(const unsigned int* in,unsigned int* outAndTemp) {
    const int n=128;
    const unsigned int lane = threadIdx.x;
    unsigned int* writeBuf = outAndTemp + n;   // upper half is written first
    unsigned int* readBuf = outAndTemp;

    // Exclusive scan: shift the input right by one and seed lane 0 with 0.
    writeBuf[lane] = (lane > 0) ? in[lane-1] : 0;
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2)
    {
        // Swap the roles of the two halves.
        unsigned int* previous = writeBuf;
        writeBuf = readBuf;
        readBuf = previous;
        __syncthreads();
        unsigned int value = readBuf[lane];
        if (lane >= offset)
            value += readBuf[lane - offset];
        writeBuf[lane] = value;
    }
    __syncthreads();
    // Exclusive sum at the last lane plus the last input = grand total.
    return outAndTemp[127]+in[127];
}
// Packs the elements of dsData whose dsValid entry is non-zero to the front
// of dsCompact, keeping their order, and returns the value reported by
// exclusivePrescan128 (the sum of the dsValid entries). Expects 128 threads.
__device__ int compactSIMDPrefixSum(const T* dsData,const unsigned int* dsValid,T* dsCompact) {
    __shared__ unsigned int dsLocalIndex[256];
    const int numValid = exclusivePrescan128(dsValid,dsLocalIndex);
    const unsigned int lane = threadIdx.x;
    if (dsValid[lane]) {
        // The scan result is this element's slot in the packed output.
        const unsigned int slot = dsLocalIndex[lane];
        dsCompact[slot] = dsData[lane];
    }
    return numValid;
}
// Phase 2/3 kernel, staged-write variant (expects 128 threads per block).
// Each block
//   1. sums dgBlockCounts[0 .. blockIdx.x) to find where its output starts,
//   2. walks its slice of the input 128 elements at a time and packs the
//      elements whose dgValid entry is non-zero into shared compactBlock,
//   3. copies each packed run to dgCompact at the running output offset.
// The last block also stores its final offset (the total number of valid
// elements) in *dNumValidElements.
__global__ void moveValidElementsStaged(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
__shared__ T inBlock[128];
__shared__ unsigned int validBlock[128];
__shared__ T compactBlock[128];
int blockOutOffset=0;
// Sum up the blockCounts before us to find our offset
// This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
// Paper implements this as a prefix sum kernel in phase II
// May still be faster than an extra kernel invocation?
for (int base = 0; base < blockIdx.x; base += blockDim.x) {
// Load up the count of valid elements for each block before us in batches of 128
if ((base + threadIdx.x) < blockIdx.x) {
validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
} else {
validBlock[threadIdx.x] = 0;
}
__syncthreads();
// Parallel reduce these counts
// Accumulate in the final offset variable
blockOutOffset += sumReduce128(validBlock);
}
// ub: end of this block's slice, clamped to the end of the input
size_t ub;
ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
// Stage one batch of 128 elements and flags; out-of-range lanes are marked invalid
if ((base + threadIdx.x) < ub) {
validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
inBlock[threadIdx.x] = dgData[base+threadIdx.x];
} else {
validBlock[threadIdx.x] = 0;
}
__syncthreads();
int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
// compactBlock must be completely written before it is copied out
__syncthreads();
if (threadIdx.x < numValidBlock) {
dgCompact[blockOutOffset + threadIdx.x] = compactBlock[threadIdx.x];
}
blockOutOffset += numValidBlock;
}
// Only the last block knows the grand total
if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
*dNumValidElements = blockOutOffset;
}
}
// Phase 2/3 kernel, scattered-write variant (expects 128 threads per block).
// Each block
//   1. sums dgBlockCounts[0 .. blockIdx.x) to find where its output starts,
//   2. walks its slice of the input 128 elements at a time and writes the
//      elements whose dgValid entry is non-zero straight into dgCompact.
// The last block also stores its final offset (the total number of valid
// elements) in *dNumValidElements.
__global__ void moveValidElementsScattered(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
    __shared__ T inBlock[128];
    __shared__ unsigned int validBlock[128];
    T* compactBlock=dgCompact;
    size_t blockOutOffset = 0;
    // Sum up the blockCounts before us to find our offset
    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
    // Paper implements this as a prefix sum kernel in phase II
    // May still be faster than an extra kernel invocation?
    for (int base = 0; base < blockIdx.x; base += blockDim.x) {
        // Load up the count of valid elements for each block before us in batches of 128
        if ((base + threadIdx.x) < blockIdx.x) {
            validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
        } else {
            validBlock[threadIdx.x] = 0;
        }
        __syncthreads();
        // Parallel reduce these counts
        // Accumulate in the final offset variable
        blockOutOffset += sumReduce128(validBlock);
    }
    compactBlock += blockOutOffset;
    // ub: end of this block's slice, clamped to the end of the input
    size_t ub;
    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
        // Stage one batch of 128 elements and flags; out-of-range lanes are marked invalid
        if ((base + threadIdx.x) < ub) {
            validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
            inBlock[threadIdx.x] = dgData[base+threadIdx.x];
        } else {
            validBlock[threadIdx.x] = 0;
        }
        __syncthreads();
        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
        // The compaction helper reads validBlock[127] for its return value.
        // Without this barrier (which the staged variant already has) thread
        // 127 could overwrite that entry for the next batch while other
        // threads are still reading it.
        __syncthreads();
        blockOutOffset += numValidBlock;
        compactBlock += numValidBlock;
    }
    // Only the last block knows the grand total
    if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
        *dNumValidElements = blockOutOffset;
    }
}
// Prepares a compaction plan for the current CUDA device: picks the number
// of thread blocks (16 per multiprocessor) and allocates one counter per
// block on the device.
//
// d            plan to initialize
// stageOutput  true selects the staged-write kernel, false the scattered one
//
// d.valid is true only if the device query and the allocation succeeded;
// compactStream refuses to run on a plan that is not valid.
void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput) {
    d.valid = false;
    d.dgBlockCounts = 0;
    d.nThreadBlocks = 0;
    d.stageOutput = stageOutput;

    int device;
    if (cudaGetDevice(&device) != cudaSuccess)
        return;
    cudaDeviceProp deviceProp;
    if (cudaGetDeviceProperties(&deviceProp, device) != cudaSuccess)
        return;

    d.nThreadBlocks = 16*deviceProp.multiProcessorCount;
    if (cudaMalloc((void**)&(d.dgBlockCounts), d.nThreadBlocks*sizeof(unsigned int)) != cudaSuccess) {
        d.dgBlockCounts = 0;
        return;
    }
    d.valid = true;
}
// Releases the device memory owned by a valid plan and marks the plan
// invalid, so destroying it twice (or using it afterwards) cannot free or
// touch the released buffer again.
void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d) {
    if (d.valid) {
        cudaFree(d.dgBlockCounts);
        d.dgBlockCounts = 0;
        d.valid = false;
    }
}
// Copies the elements of dIn whose dValid entry is non-zero to the front of
// dOut, preserving their order, and stores their number in *dNumValid.
//
// d          plan created by planCompaction
// dOut       device output array
// dIn        device input array (len elements)
// dValid     device array of flags, one per input element
// len        number of input elements
// dNumValid  device pointer that receives the number of elements written
//
// Returns 0 on success and -1 if the plan is unusable (or, for an empty
// input, if the zero count could not be stored).
int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,T* dOut,const T* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid) {
    if (!d.valid) {
        return -1;
    }
    // Empty input: no kernel would run (and the block arithmetic below would
    // divide by zero), so publish the zero count directly.
    if (len == 0) {
        return (cudaMemset(dNumValid, 0, sizeof(size_t)) == cudaSuccess) ? 0 : -1;
    }
    // Figure out # elements per block
    unsigned int numBlocks = d.nThreadBlocks;
    if (numBlocks*128 > len)
        numBlocks = (len+127)/128;
    // A plan with zero thread blocks cannot process anything.
    if (numBlocks == 0) {
        return -1;
    }
    const size_t eltsPerBlock = len/numBlocks + ((len % numBlocks) ? 1 : 0);
    // TODO: implement loop over blocks of 10M
    // Phase 1: Calculate number of valid elements per thread block
    countElts<<<numBlocks,128>>>(d.dgBlockCounts,dValid,eltsPerBlock,len);
    // Phase 2/3: Move valid elements using SIMD compaction
    if (d.stageOutput) {
        moveValidElementsStaged<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
    } else {
        moveValidElementsScattered<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
    }
    return 0;
}
#ifndef __OPENMM_CUDACOMPACT_H__
#define __OPENMM_CUDACOMPACT_H__
/* Code for CUDA stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variants of phase III, as they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: Imran Haque (ihaque@cs.stanford.edu)
Affiliation: Stanford University
License: Public Domain
*/
#include "windowsExportCuda.h"
// State shared by the stream-compaction entry points declared below.
struct compactionPlan {
// set by planCompaction; compactStream and destroyCompactionPlan check it
bool valid;
// device array with one counter per thread block
unsigned int* dgBlockCounts;
// upper bound on the number of thread blocks a compaction launches
unsigned int nThreadBlocks;
// true: staged-write kernel, false: scattered-write kernel
bool stageOutput;
};
// Initializes d for the current CUDA device; stageOutput chooses the
// output-writing strategy used later by compactStream.
extern "C"
void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput=true);
// Frees the device memory held by a valid plan.
extern "C"
void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d);
// Copies the elements of dIn whose dValid flag is non-zero to dOut and stores
// their number in *dNumValid. All four pointers refer to device memory.
// Returns -1 if the plan is not valid, 0 otherwise.
extern "C"
int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,unsigned int* dOut,const unsigned int* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid);
#endif // __OPENMM_CUDACOMPACT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
// Initialization
// Entry points that take only the gpuContext. Those marked OPENMMCUDA_EXPORT
// are exported from the library; the others are visible only inside it.
// NOTE(review): behavior is inferred from the names only (clear / calculate /
// reduce of forces, energy and Born sums) - see the definitions to confirm.
extern void OPENMMCUDA_EXPORT kClearForces(gpuContext gpu);
extern void kClearEnergy(gpuContext gpu);
extern void kClearBornSumAndForces(gpuContext gpu);
extern void kClearObcGbsaBornSum(gpuContext gpu);
extern void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu);
extern void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu);
extern void kCalculateGBVIBornSum(gpuContext gpu);
extern void kReduceGBVIBornSum(gpuContext gpu);
extern void kClearGBVIBornSum( gpuContext gpu );
extern void kGenerateRandoms(gpuContext gpu);
// Main loop
// Force-calculation, constraint and integrator-step entry points. Most take
// only the gpuContext; a few take extra CUDAStream arguments or scalars.
// NOTE(review): what each call computes is inferred from its name only -
// see the definitions to confirm.
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJGBVIForces1(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCMAPTorsionForces(gpuContext gpu, CUDAStream<float4>& coefficients, CUDAStream<int2>& mapPositions, CUDAStream<int4>& torsionIndices, CUDAStream<int>& torsionMaps);
extern void kCalculateCustomBondForces(gpuContext gpu);
extern void kCalculateCustomAngleForces(gpuContext gpu);
extern void kCalculateCustomTorsionForces(gpuContext gpu);
extern void kCalculateCustomExternalForces(gpuContext gpu);
extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateGBVIForces2(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups);
extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kApplyShake(gpuContext gpu);
extern void kApplyCCMA(gpuContext gpu);
extern void kApplySettle(gpuContext gpu);
extern void kLangevinUpdatePart1(gpuContext gpu);
extern void kLangevinUpdatePart2(gpuContext gpu);
extern void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep);
extern void kSetVelocitiesFromPositions(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu);
extern void kSelectVerletStepSize(gpuContext gpu, float maxTimeStep);
extern void kBrownianUpdatePart1(gpuContext gpu);
extern void kBrownianUpdatePart2(gpuContext gpu);
extern void kScaleAtomCoordinates(gpuContext gpu, float scale, CUDAStream<int>& moleculeAtoms, CUDAStream<int>& moleculeStartIndex);
extern void kApplyConstraints(gpuContext gpu);
// Extras
// kReduceForces is exported from the library; kReduceEnergy returns a double.
// NOTE(review): presumably the reduced total energy - confirm in the definition.
extern void OPENMMCUDA_EXPORT kReduceForces(gpuContext gpu);
extern double kReduceEnergy(gpuContext gpu);
// Initializers
// One Set...Sim / Get...Sim pair per kernel family, each taking the gpuContext.
// NOTE(review): presumably these copy the simulation constants between the
// host context and the device for that kernel family - confirm in the
// definitions.
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCustomBondForcesSim(gpuContext gpu);
extern void GetCalculateCustomBondForcesSim(gpuContext gpu);
extern void SetCalculateCustomAngleForcesSim(gpuContext gpu);
extern void GetCalculateCustomAngleForcesSim(gpuContext gpu);
extern void SetCalculateCustomTorsionForcesSim(gpuContext gpu);
extern void GetCalculateCustomTorsionForcesSim(gpuContext gpu);
extern void SetCalculateCustomExternalForcesSim(gpuContext gpu);
extern void GetCalculateCustomExternalForcesSim(gpuContext gpu);
extern void SetCalculateCustomNonbondedForcesSim(gpuContext gpu);
extern void GetCalculateCustomNonbondedForcesSim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void SetCalculateGBVIBornSumSim(gpuContext gpu);
extern void GetCalculateGBVIBornSumSim(gpuContext gpu);
extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateGBVIForces2Sim(gpuContext gpu);
extern void GetCalculateGBVIForces2Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetCalculatePMESim(gpuContext gpu);
extern void GetCalculatePMESim(gpuContext gpu);
extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu);
extern void SetShakeHSim(gpuContext gpu);
extern void GetShakeHSim(gpuContext gpu);
extern void SetLangevinUpdateSim(gpuContext gpu);
extern void GetLangevinUpdateSim(gpuContext gpu);
extern void SetSettleSim(gpuContext gpu);
extern void GetSettleSim(gpuContext gpu);
extern void SetCCMASim(gpuContext gpu);
extern void GetCCMASim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu);
extern void GetBrownianUpdateSim(gpuContext gpu);
extern void SetRandomSim(gpuContext gpu);
extern void GetRandomSim(gpuContext gpu);
// Setters for the custom forces: for each of bond, angle, torsion, external
// and nonbonded there is a force expression, an energy expression and a
// vector of global parameter values. Expressions are Expression<256>
// objects; the external force takes one expression per axis (X, Y, Z).
extern void SetCustomBondForceExpression(const Expression<256>& expression);
extern void SetCustomBondEnergyExpression(const Expression<256>& expression);
extern void SetCustomBondGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomAngleForceExpression(const Expression<256>& expression);
extern void SetCustomAngleEnergyExpression(const Expression<256>& expression);
extern void SetCustomAngleGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomTorsionForceExpression(const Expression<256>& expression);
extern void SetCustomTorsionEnergyExpression(const Expression<256>& expression);
extern void SetCustomTorsionGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomExternalForceExpressions(const Expression<256>& expressionX, const Expression<256>& expressionY, const Expression<256>& expressionZ);
extern void SetCustomExternalEnergyExpression(const Expression<256>& expression);
extern void SetCustomExternalGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomNonbondedForceExpression(const Expression<256>& expression);
extern void SetCustomNonbondedEnergyExpression(const Expression<256>& expression);
extern void SetCustomNonbondedGlobalParams(const std::vector<float>& paramValues);
// Both take a caller-supplied id string, a call number and a FILE* log.
// NOTE(review): presumably diagnostic dumps of GB/VI and OBC state - confirm
// in the definitions.
extern void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log);
extern void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log);
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment