Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
abb8cb4b
Commit
abb8cb4b
authored
Jun 04, 2012
by
Peter Eastman
Browse files
Very early beginnings of new CUDA platform
parent
18501459
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4779 additions
and
0 deletions
+4779
-0
platforms/cuda2/CMakeLists.txt
platforms/cuda2/CMakeLists.txt
+102
-0
platforms/cuda2/EncodeCUDAFiles.cmake
platforms/cuda2/EncodeCUDAFiles.cmake
+27
-0
platforms/cuda2/include/CudaKernelFactory.h
platforms/cuda2/include/CudaKernelFactory.h
+45
-0
platforms/cuda2/include/CudaPlatform.h
platforms/cuda2/include/CudaPlatform.h
+112
-0
platforms/cuda2/sharedTarget/CMakeLists.txt
platforms/cuda2/sharedTarget/CMakeLists.txt
+24
-0
platforms/cuda2/src/CudaArray.cpp
platforms/cuda2/src/CudaArray.cpp
+79
-0
platforms/cuda2/src/CudaArray.h
platforms/cuda2/src/CudaArray.h
+134
-0
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+972
-0
platforms/cuda2/src/CudaContext.h
platforms/cuda2/src/CudaContext.h
+531
-0
platforms/cuda2/src/CudaExpressionUtilities.cpp
platforms/cuda2/src/CudaExpressionUtilities.cpp
+369
-0
platforms/cuda2/src/CudaExpressionUtilities.h
platforms/cuda2/src/CudaExpressionUtilities.h
+126
-0
platforms/cuda2/src/CudaForceInfo.cpp
platforms/cuda2/src/CudaForceInfo.cpp
+46
-0
platforms/cuda2/src/CudaForceInfo.h
platforms/cuda2/src/CudaForceInfo.h
+72
-0
platforms/cuda2/src/CudaKernelFactory.cpp
platforms/cuda2/src/CudaKernelFactory.cpp
+129
-0
platforms/cuda2/src/CudaKernelSources.cpp.in
platforms/cuda2/src/CudaKernelSources.cpp.in
+31
-0
platforms/cuda2/src/CudaKernelSources.h.in
platforms/cuda2/src/CudaKernelSources.h.in
+48
-0
platforms/cuda2/src/CudaKernels.h
platforms/cuda2/src/CudaKernels.h
+1197
-0
platforms/cuda2/src/CudaPlatform.cpp
platforms/cuda2/src/CudaPlatform.cpp
+176
-0
platforms/cuda2/src/kernels/utilities.cu
platforms/cuda2/src/kernels/utilities.cu
+102
-0
platforms/cuda2/src/kernels/vectorOps.cu
platforms/cuda2/src/kernels/vectorOps.cu
+457
-0
No files found.
platforms/cuda2/CMakeLists.txt
0 → 100644
View file @
abb8cb4b
#---------------------------------------------------
# OpenMM CUDA Platform
#
# Creates OpenMM library, base name=OpenMMCUDA.
# Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d).
#
# Windows:
# OpenMMCUDA[_d].dll
# OpenMMCUDA[_d].lib
# OpenMMCUDA_static[_d].lib
# Unix:
# libOpenMMCUDA[_d].so
# libOpenMMCUDA_static[_d].a
#----------------------------------------------------
IF
(
APPLE
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.6"
)
SET
(
CMAKE_OSX_SYSROOT
"/Developer/SDKs/MacOSX10.6.sdk"
)
ENDIF
(
APPLE
)
set
(
OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL
"Whether to build CUDA test cases"
)
if
(
OPENMM_BUILD_CUDA_TESTS
)
SUBDIRS
(
tests
)
endif
(
OPENMM_BUILD_CUDA_TESTS
)
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET
(
OPENMM_SOURCE_SUBDIRS .
)
# Collect up information about the version of the OpenMM library we're building
# and make it available to the code so it can be built into the binaries.
SET
(
OPENMMCUDA_LIBRARY_NAME OpenMMCUDA
)
SET
(
SHARED_TARGET
${
OPENMMCUDA_LIBRARY_NAME
}
)
SET
(
STATIC_TARGET
${
OPENMMCUDA_LIBRARY_NAME
}
_static
)
# Ensure that debug libraries have "_d" appended to their names.
# CMake gets this right on Windows automatically with this definition.
IF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
SET
(
CMAKE_DEBUG_POSTFIX
"_d"
CACHE INTERNAL
""
FORCE
)
ENDIF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
# But on Unix or Cygwin we have to add the suffix manually
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
SHARED_TARGET
${
SHARED_TARGET
}
_d
)
SET
(
STATIC_TARGET
${
STATIC_TARGET
}
_d
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
# These are all the places to search for header files which are
# to be part of the API.
SET
(
API_INCLUDE_DIRS
)
# start empty
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
# append
SET
(
API_INCLUDE_DIRS
${
API_INCLUDE_DIRS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include/internal
)
ENDFOREACH
(
subdir
)
# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
# and absolute pathnames.
SET
(
API_REL_INCLUDE_FILES
)
# start these out empty
SET
(
API_ABS_INCLUDE_FILES
)
FOREACH
(
dir
${
API_INCLUDE_DIRS
}
)
FILE
(
GLOB fullpaths
${
dir
}
/*.h
)
# returns full pathnames
SET
(
API_ABS_INCLUDE_FILES
${
API_ABS_INCLUDE_FILES
}
${
fullpaths
}
)
FOREACH
(
pathname
${
fullpaths
}
)
GET_FILENAME_COMPONENT
(
filename
${
pathname
}
NAME
)
SET
(
API_REL_INCLUDE_FILES
${
API_REL_INCLUDE_FILES
}
${
dir
}
/
${
filename
}
)
ENDFOREACH
(
pathname
)
ENDFOREACH
(
dir
)
# collect up source files
SET
(
SOURCE_FILES
)
# empty
SET
(
SOURCE_INCLUDE_FILES
)
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
FILE
(
GLOB_RECURSE src_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.cpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.c
)
FILE
(
GLOB incl_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.h
)
SET
(
SOURCE_FILES
${
SOURCE_FILES
}
${
src_files
}
)
#append
SET
(
SOURCE_INCLUDE_FILES
${
SOURCE_INCLUDE_FILES
}
${
incl_files
}
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
ENDFOREACH
(
subdir
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
# Set variables needed for encoding kernel sources into a C++ class
SET
(
CUDA_SOURCE_DIR
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
SET
(
CUDA_SOURCE_CLASS CudaKernelSources
)
SET
(
CUDA_KERNELS_CPP
${
CMAKE_CURRENT_BINARY_DIR
}
/src/
${
CUDA_SOURCE_CLASS
}
.cpp
)
SET
(
CUDA_KERNELS_H
${
CMAKE_CURRENT_BINARY_DIR
}
/src/
${
CUDA_SOURCE_CLASS
}
.h
)
SET
(
SOURCE_FILES
${
SOURCE_FILES
}
${
CUDA_KERNELS_CPP
}
${
CUDA_KERNELS_H
}
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_BINARY_DIR
}
/src
)
SUBDIRS
(
sharedTarget
)
platforms/cuda2/EncodeCUDAFiles.cmake
0 → 100644
View file @
abb8cb4b
FILE
(
GLOB CUDA_KERNELS
${
CUDA_SOURCE_DIR
}
/kernels/*.cu
)
SET
(
CUDA_FILE_DECLARATIONS
)
SET
(
CUDA_FILE_DEFINITIONS
)
CONFIGURE_FILE
(
${
CUDA_SOURCE_DIR
}
/
${
CUDA_SOURCE_CLASS
}
.cpp.in
${
CUDA_KERNELS_CPP
}
)
FOREACH
(
file
${
CUDA_KERNELS
}
)
# Load the file contents and process it.
FILE
(
STRINGS
${
file
}
file_content NEWLINE_CONSUME
)
# Replace all backslashes by double backslashes as they are being put in a C string.
# Be careful not to replace the backslash before a semicolon as that is the CMAKE
# internal escaping of a semicolon to prevent it from acting as a list seperator.
STRING
(
REGEX REPLACE
"
\\\\
([^;])"
"
\\\\\\\\\\
1"
file_content
"
${
file_content
}
"
)
# Escape double quotes as being put in a C string.
STRING
(
REPLACE
"
\"
"
"
\\\"
"
file_content
"
${
file_content
}
"
)
# Split in separate C strings for each line.
STRING
(
REPLACE
"
\n
"
"
\\
n
\"\n\"
"
file_content
"
${
file_content
}
"
)
# Determine a name for the variable that will contain this file's contents
FILE
(
RELATIVE_PATH filename
${
CUDA_SOURCE_DIR
}
/kernels
${
file
}
)
STRING
(
LENGTH
${
filename
}
filename_length
)
MATH
(
EXPR filename_length
${
filename_length
}
-3
)
STRING
(
SUBSTRING
${
filename
}
0
${
filename_length
}
variable_name
)
# Record the variable declaration and definition.
SET
(
CUDA_FILE_DECLARATIONS
${
CUDA_FILE_DECLARATIONS
}
static\ const\ std::string\
${
variable_name
}
;\n
)
FILE
(
APPEND
${
CUDA_KERNELS_CPP
}
const\ string\
${
CUDA_SOURCE_CLASS
}
::
${
variable_name
}
\ =\ \"
${
file_content
}
\"\;\n
)
ENDFOREACH
(
file
)
CONFIGURE_FILE
(
${
CUDA_SOURCE_DIR
}
/
${
CUDA_SOURCE_CLASS
}
.h.in
${
CUDA_KERNELS_H
}
)
platforms/cuda2/include/CudaKernelFactory.h
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDAKERNELFACTORY_H_
#define OPENMM_CUDAKERNELFACTORY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/KernelFactory.h"
namespace
OpenMM
{
/**
* This KernelFactory creates all kernels for CudaPlatform.
*/
class
CudaKernelFactory
:
public
KernelFactory
{
public:
KernelImpl
*
createKernelImpl
(
std
::
string
name
,
const
Platform
&
platform
,
ContextImpl
&
context
)
const
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CUDAKERNELFACTORY_H_*/
platforms/cuda2/include/CudaPlatform.h
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDAPLATFORM_H_
#define OPENMM_CUDAPLATFORM_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/Platform.h"
#include "openmm/System.h"
namespace
OpenMM
{
class
CudaContext
;
/**
* This Platform subclass uses CUDA implementations of the OpenMM kernels.
*/
class
OPENMM_EXPORT
CudaPlatform
:
public
Platform
{
public:
class
PlatformData
;
CudaPlatform
();
const
std
::
string
&
getName
()
const
{
static
const
std
::
string
name
=
"CUDA"
;
return
name
;
}
double
getSpeed
()
const
{
return
100
;
}
bool
supportsDoublePrecision
()
const
;
const
std
::
string
&
getPropertyValue
(
const
Context
&
context
,
const
std
::
string
&
property
)
const
;
void
setPropertyValue
(
Context
&
context
,
const
std
::
string
&
property
,
const
std
::
string
&
value
)
const
;
void
contextCreated
(
ContextImpl
&
context
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
properties
)
const
;
void
contextDestroyed
(
ContextImpl
&
context
)
const
;
/**
* This is the name of the parameter for selecting which CUDA device or devices to use.
*/
static
const
std
::
string
&
CudaDeviceIndex
()
{
static
const
std
::
string
key
=
"CudaDeviceIndex"
;
return
key
;
}
/**
* This is the name of the parameter for selecting whether CUDA should sync or spin loop while waiting for results.
*/
static
const
std
::
string
&
CudaUseBlockingSync
()
{
static
const
std
::
string
key
=
"CudaUseBlockingSync"
;
return
key
;
}
/**
* This is the name of the parameter for selecting what numerical precision to use.
*/
static
const
std
::
string
&
CudaPrecision
()
{
static
const
std
::
string
key
=
"CudaPrecision"
;
return
key
;
}
/**
* This is the name of the parameter for specifying the path to the CUDA compiler.
*/
static
const
std
::
string
&
CudaCompiler
()
{
static
const
std
::
string
key
=
"CudaCompiler"
;
return
key
;
}
/**
* This is the name of the parameter for specifying the path to the directory for creating temporary files.
*/
static
const
std
::
string
&
CudaTempDirectory
()
{
static
const
std
::
string
key
=
"CudaTempDirectory"
;
return
key
;
}
};
class
OPENMM_EXPORT
CudaPlatform
::
PlatformData
{
public:
PlatformData
(
const
System
&
system
,
const
std
::
string
&
deviceIndexProperty
,
const
std
::
string
&
blockingProperty
,
const
std
::
string
&
precisionProperty
,
const
std
::
string
&
compilerProperty
,
const
std
::
string
&
tempProperty
);
~
PlatformData
();
void
initializeContexts
(
const
System
&
system
);
void
syncContexts
();
std
::
vector
<
CudaContext
*>
contexts
;
std
::
vector
<
double
>
contextEnergy
;
bool
removeCM
;
int
cmMotionFrequency
;
int
stepCount
,
computeForceCount
;
double
time
;
std
::
map
<
std
::
string
,
std
::
string
>
propertyValues
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CUDAPLATFORM_H_*/
platforms/cuda2/sharedTarget/CMakeLists.txt
0 → 100644
View file @
abb8cb4b
#
# Include CUDA related files.
#
INCLUDE
(
FindCUDA
)
INCLUDE_DIRECTORIES
(
${
CUDA_INCLUDE_DIRS
}
)
FILE
(
GLOB CUDA_KERNELS
${
CUDA_SOURCE_DIR
}
/kernels/*.cu
)
ADD_CUSTOM_COMMAND
(
OUTPUT
${
CUDA_KERNELS_CPP
}
${
CUDA_KERNELS_H
}
COMMAND
${
CMAKE_COMMAND
}
ARGS -D CUDA_SOURCE_DIR=
${
CUDA_SOURCE_DIR
}
-D CUDA_KERNELS_CPP=
${
CUDA_KERNELS_CPP
}
-D CUDA_KERNELS_H=
${
CUDA_KERNELS_H
}
-D CUDA_SOURCE_CLASS=
${
CUDA_SOURCE_CLASS
}
-P
${
CMAKE_CURRENT_SOURCE_DIR
}
/../EncodeCUDAFiles.cmake
DEPENDS
${
CUDA_KERNELS
}
)
SET_SOURCE_FILES_PROPERTIES
(
${
CUDA_KERNELS_CPP
}
${
CUDA_KERNELS_H
}
PROPERTIES GENERATED TRUE
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
_d
)
ELSE
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
CUDA_LIBRARIES
}
${
PTHREADS_LIB
}
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES COMPILE_FLAGS
"-DOPENMM_BUILDING_SHARED_LIBRARY"
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
platforms/cuda2/src/CudaArray.cpp
0 → 100644
View file @
abb8cb4b
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include <iostream>
#include <sstream>
#include <vector>
using
namespace
OpenMM
;
CudaArray
::
CudaArray
(
int
size
,
int
elementSize
,
const
std
::
string
&
name
)
:
size
(
size
),
elementSize
(
elementSize
),
name
(
name
),
ownsMemory
(
true
)
{
CUresult
result
=
cuMemAlloc
(
&
pointer
,
size
*
elementSize
);
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error creating array "
<<
name
<<
": "
<<
result
;
throw
OpenMMException
(
str
.
str
());
}
}
CudaArray
::~
CudaArray
()
{
if
(
ownsMemory
)
{
CUresult
result
=
cuMemFree
(
pointer
);
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error deleting array "
<<
name
<<
": "
<<
result
;
throw
OpenMMException
(
str
.
str
());
}
}
}
void
CudaArray
::
upload
(
void
*
data
,
bool
blocking
)
{
CUresult
result
;
if
(
blocking
)
result
=
cuMemcpyHtoD
(
pointer
,
data
,
size
*
elementSize
);
else
result
=
cuMemcpyHtoDAsync
(
pointer
,
data
,
size
*
elementSize
,
0
);
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error uploading array "
<<
name
<<
": "
<<
result
;
throw
OpenMMException
(
str
.
str
());
}
}
void
CudaArray
::
download
(
void
*
data
,
bool
blocking
)
const
{
CUresult
result
;
if
(
blocking
)
result
=
cuMemcpyDtoH
(
data
,
pointer
,
size
*
elementSize
);
else
result
=
cuMemcpyDtoHAsync
(
data
,
pointer
,
size
*
elementSize
,
0
);
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error downloading array "
<<
name
<<
": "
<<
result
;
throw
OpenMMException
(
str
.
str
());
}
}
platforms/cuda2/src/CudaArray.h
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDAARRAY_H_
#define OPENMM_CUDAARRAY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/OpenMMException.h"
#include <cuda.h>
#include <iostream>
#include <sstream>
#include <vector>
namespace
OpenMM
{
/**
* This class encapsulates a block of CUDA device memory. It provides a simplified API
* for working with it and for copying data to and from device memory.
*/
class
CudaArray
{
public:
/**
* Create a CudaArray object. The object is allocated on the heap with the "new" operator.
* The template argument is the data type of each array element.
*
* @param size the number of elements in the array
* @param name the name of the array
*/
template
<
class
T
>
static
CudaArray
*
create
(
int
size
,
const
std
::
string
&
name
)
{
return
new
CudaArray
(
size
,
sizeof
(
T
),
name
);
}
/**
* Create a CudaArray object.
*
* @param size the number of elements in the array
* @param elementSize the size of each element in bytes
* @param name the name of the array
*/
CudaArray
(
int
size
,
int
elementSize
,
const
std
::
string
&
name
);
~
CudaArray
();
/**
* Get the number of elements in the array.
*/
int
getSize
()
const
{
return
size
;
}
/**
* Get the size of each element in bytes.
*/
int
getElementSize
()
const
{
return
elementSize
;
}
/**
* Get the name of the array.
*/
const
std
::
string
&
getName
()
const
{
return
name
;
}
/**
* Get a pointer to the device memory.
*/
CUdeviceptr
getDevicePointer
()
{
return
pointer
;
}
/**
* Copy the values in a vector to the device memory.
*/
template
<
class
T
>
void
upload
(
std
::
vector
<
T
>&
data
)
{
if
(
sizeof
(
T
)
!=
elementSize
||
data
.
size
()
!=
size
)
throw
OpenMMException
(
"Error uploading array "
+
name
+
": The specified vector does not match the size of the array"
);
upload
(
&
data
[
0
],
true
);
}
/**
* Copy the values in the Buffer to a vector.
*/
template
<
class
T
>
void
download
(
std
::
vector
<
T
>&
data
)
const
{
if
(
sizeof
(
T
)
!=
elementSize
)
throw
OpenMMException
(
"Error downloading array "
+
name
+
": The specified vector has the wrong element size"
);
if
(
data
.
size
()
!=
size
)
data
.
resize
(
size
);
download
(
&
data
[
0
],
true
);
}
/**
* Copy the values in an array to the device memory.
*
* @param data the data to copy
* @param blocking if true, this call will block until the transfer is complete. If false,
* the source array must be in page-locked memory.
*/
void
upload
(
void
*
data
,
bool
blocking
=
true
);
/**
* Copy the values in the device memory to an array.
*
* @param data the array to copy the memory to
* @param blocking if true, this call will block until the transfer is complete. If false,
* the destination array must be in page-locked memory.
*/
void
download
(
void
*
data
,
bool
blocking
=
true
)
const
;
private:
CUdeviceptr
pointer
;
int
size
,
elementSize
;
bool
ownsMemory
;
std
::
string
name
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CUDAARRAY_H_*/
platforms/cuda2/src/CudaContext.cpp
0 → 100644
View file @
abb8cb4b
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include <cmath>
#include "CudaContext.h"
#include "CudaArray.h"
//#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
//#include "CudaNonbondedUtilities.h"
#include "hilbert.h"
#include "openmm/OpenMMException.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <typeinfo>
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<result<<" ("<<__FILE__<<": "<<__LINE__<<")"; \
throw OpenMMException(m.str());\
}
using
namespace
OpenMM
;
using
namespace
std
;
const
int
CudaContext
::
ThreadBlockSize
=
64
;
const
int
CudaContext
::
TileSize
=
32
;
bool
CudaContext
::
hasInitializedCuda
=
false
;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
const
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
compiler
(
compiler
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
posq
(
NULL
),
velm
(
NULL
),
/*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/
thread
(
NULL
)
{
if
(
!
hasInitializedCuda
)
{
CHECK_RESULT2
(
cuInit
(
0
),
"Error initializing CUDA"
);
hasInitializedCuda
=
true
;
}
if
(
precision
==
"single"
)
{
useDoublePrecision
=
false
;
accumulateInDouble
=
false
;
}
else
if
(
precision
==
"mixed"
)
{
useDoublePrecision
=
false
;
accumulateInDouble
=
true
;
}
else
if
(
precision
==
"double"
)
{
useDoublePrecision
=
true
;
accumulateInDouble
=
true
;
}
else
throw
OpenMMException
(
"Illegal value for CudaPrecision: "
+
precision
);
#ifdef WIN32
this
->
tempDir
=
tempDir
+
"
\"
;
#else
this->tempDir = tempDir+"
/
";
#endif
contextIndex = platformData.contexts.size();
int numDevices;
string errorMessage = "
Error
initializing
Context
";
CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < 0 || deviceIndex >= numDevices) {
// Try to figure out which device is the fastest.
int bestSpeed = -1;
int bestCompute = -1;
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i));
int major, minor, clock, multiprocessors;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
if (major == 1 && minor < 2)
continue; // 1.0 and 1.1 are not supported
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int speed = clock*multiprocessors;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
deviceIndex = i;
bestSpeed = speed;
bestCompute = major;
}
}
}
if (deviceIndex == -1)
throw OpenMMException("
No
compatible
CUDA
device
is
available
");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex;
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
compilationDefines["
WORK_GROUP_SIZE
"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
defaultOptimizationOptions = "
--
use_fast_math
";
int numThreadBlocksPerComputeUnit = 6;
CHECK_RESULT(cuCtxCreate(&context, 0, device));
contextIsValid = true;
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
// bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this);
posq = CudaArray::create<float4>(paddedNumAtoms, "
posq
");
velm = CudaArray::create<float4>(paddedNumAtoms, "
velm
");
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
// Create utility kernels that are used in multiple places.
CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
cuModuleGetFunction(&clearBufferKernel, utilities, "
clearBuffer
");
cuModuleGetFunction(&clearTwoBuffersKernel, utilities, "
clearTwoBuffers
");
cuModuleGetFunction(&clearThreeBuffersKernel, utilities, "
clearThreeBuffers
");
cuModuleGetFunction(&clearFourBuffersKernel, utilities, "
clearFourBuffers
");
cuModuleGetFunction(&clearFiveBuffersKernel, utilities, "
clearFiveBuffers
");
cuModuleGetFunction(&clearSixBuffersKernel, utilities, "
clearSixBuffers
");
cuModuleGetFunction(&reduceFloat4Kernel, utilities, "
reduceFloat4Buffer
");
cuModuleGetFunction(&reduceForcesKernel, utilities, "
reduceForces
");
// Set defines based on the requested precision.
compilationDefines["
SQRT
"] = useDoublePrecision ? "
sqrt
" : "
sqrtf
";
compilationDefines["
RSQRT
"] = useDoublePrecision ? "
rsqrt
" : "
rsqrtf
";
compilationDefines["
RECIP
"] = useDoublePrecision ? "
1.0
/
" : "
1.0
f
/
";
compilationDefines["
EXP
"] = useDoublePrecision ? "
exp
" : "
expf
";
compilationDefines["
LOG
"] = useDoublePrecision ? "
log
" : "
logf
";
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
//
// // Create the integration utilities object.
//
// integration = new CudaIntegrationUtilities(*this, system);
}
CudaContext::~CudaContext() {
for (int i = 0; i < (int) forces.size(); i++)
delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i];
if (posq != NULL)
delete posq;
if (velm != NULL)
delete velm;
// if (force != NULL)
// delete force;
// if (forceBuffers != NULL)
// delete forceBuffers;
// if (longForceBuffer != NULL)
// delete longForceBuffer;
// if (energyBuffer != NULL)
// delete energyBuffer;
// if (atomIndex != NULL)
// delete atomIndex;
// if (integration != NULL)
// delete integration;
// if (bonded != NULL)
// delete bonded;
// if (nonbonded != NULL)
// delete nonbonded;
if (thread != NULL)
delete thread;
string errorMessage = "
Error
deleting
Context
";
if (contextIsValid)
CHECK_RESULT(cuCtxDestroy(context));
}
//void CudaContext::initialize() {
// for (int i = 0; i < numAtoms; i++) {
// double mass = system.getParticleMass(i);
// (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
// }
// velm->upload();
// bonded->initialize(system);
// numForceBuffers = platformData.contexts.size();
// numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
// for (int i = 0; i < (int) forces.size(); i++)
// numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
// forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "
forceBuffers
", false);
// if (supports64BitGlobalAtomics) {
// longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "
longForceBuffer
", false);
// reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
// reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
// reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
// reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
// addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
// }
// addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
// force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "
force
", true);
// energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "
energyBuffer
", true);
// addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
// atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "
atomIndex
", true);
// for (int i = 0; i < paddedNumAtoms; ++i)
// (*atomIndex)[i] = i;
// atomIndex->upload();
// findMoleculeGroups();
// moleculesInvalid = false;
// nonbonded->initialize(system);
//}
void CudaContext::addForce(CudaForceInfo* force) {
forces.push_back(force);
}
string CudaContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
string result = input;
for (map<string, string>::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
int index = -1;
do {
index = result.find(iter->first);
if (index != result.npos)
result.replace(index, iter->first.size(), iter->second);
} while (index != result.npos);
}
return result;
}
CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
return createModule(source, map<string, string>(), optimizationFlags);
}
CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src;
if (!options.empty())
src << "
// Compilation Options: " << options << endl << endl;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
compilationDefines
.
begin
();
iter
!=
compilationDefines
.
end
();
++
iter
)
{
src
<<
"#define "
<<
iter
->
first
;
if
(
!
iter
->
second
.
empty
())
src
<<
" "
<<
iter
->
second
;
src
<<
endl
;
}
if
(
!
compilationDefines
.
empty
())
src
<<
endl
;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
src
<<
"#define "
<<
iter
->
first
;
if
(
!
iter
->
second
.
empty
())
src
<<
" "
<<
iter
->
second
;
src
<<
endl
;
}
if
(
!
defines
.
empty
())
src
<<
endl
;
src
<<
source
<<
endl
;
// Write out the source to a temporary file.
stringstream
tempFileName
;
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
string
inputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".cu"
);
string
outputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".ptx"
);
string
logFile
=
(
tempDir
+
tempFileName
.
str
()
+
".log"
);
ofstream
out
(
inputFile
.
c_str
());
out
<<
src
.
str
();
out
.
close
();
#ifdef WIN32
#else
string
command
=
"
\"
"
+
compiler
+
"
\"
--ptx -arch=compute_"
+
gpuArchitecture
+
" -o
\"
"
+
outputFile
+
"
\"
"
+
options
+
"
\"
"
+
inputFile
+
"
\"
2>
\"
"
+
logFile
+
"
\"
"
;
int
res
=
std
::
system
(
command
.
c_str
());
#endif
try
{
if
(
res
!=
0
)
{
// Load the error log.
stringstream
error
;
error
<<
"Error launching CUDA compiler: "
<<
res
;
ifstream
log
(
logFile
.
c_str
());
if
(
log
.
is_open
())
{
string
line
;
while
(
!
log
.
eof
())
{
getline
(
log
,
line
);
error
<<
'\n'
<<
line
;
}
log
.
close
();
}
throw
OpenMMException
(
error
.
str
());
}
CUmodule
module
;
CUresult
result
=
cuModuleLoad
(
&
module
,
outputFile
.
c_str
());
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
m
;
m
<<
"Error loading CUDA module: "
<<
result
;
throw
OpenMMException
(
m
.
str
());
}
remove
(
inputFile
.
c_str
());
remove
(
outputFile
.
c_str
());
remove
(
logFile
.
c_str
());
return
module
;
}
catch
(...)
{
remove
(
inputFile
.
c_str
());
remove
(
outputFile
.
c_str
());
remove
(
logFile
.
c_str
());
throw
;
}
//
// // Get length before using c_str() to avoid length() call invalidating the c_str() value.
// string src_string = src.str();
// ::size_t src_length = src_string.length();
// cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
// cl::Program program(context, sources);
// try {
// program.build(vector<cl::Device>(1, device), options.c_str());
// } catch (cl::Error err) {
// throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
// }
}
//
//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
// if (blockSize == -1)
// blockSize = ThreadBlockSize;
// int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
// try {
// queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
// }
// catch (cl::Error err) {
// stringstream str;
// str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
// throw OpenMMException(str.str());
// }
//}
//
//void CudaContext::clearBuffer(CudaArray<float>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize());
//}
//
//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
//}
//
//void CudaContext::clearBuffer(cl::Memory& memory, int size) {
// clearBufferKernel.setArg<cl::Memory>(0, memory);
// clearBufferKernel.setArg<cl_int>(1, size);
// executeKernel(clearBufferKernel, size, 128);
//}
//
//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) {
// autoclearBuffers.push_back(&memory);
// autoclearBufferSizes.push_back(size);
//}
//
//void CudaContext::clearAutoclearBuffers() {
// int base = 0;
// int total = autoclearBufferSizes.size();
// while (total-base >= 6) {
// clearSixBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearSixBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearSixBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearSixBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearSixBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearSixBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// clearSixBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
// clearSixBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
// clearSixBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
// clearSixBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
// clearSixBuffersKernel.setArg<cl::Memory>(10, *autoclearBuffers[base+5]);
// clearSixBuffersKernel.setArg<cl_int>(11, autoclearBufferSizes[base+5]);
// executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
// base += 6;
// }
// if (total-base == 5) {
// clearFiveBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearFiveBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearFiveBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearFiveBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearFiveBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearFiveBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// clearFiveBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
// clearFiveBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
// clearFiveBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
// clearFiveBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
// executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
// }
// else if (total-base == 4) {
// clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
// clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
// executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
// }
// else if (total-base == 3) {
// clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
// }
// else if (total-base == 2) {
// clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
// }
// else if (total-base == 1) {
// clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
// }
//}
//
//void CudaContext::reduceForces() {
// if (supports64BitGlobalAtomics)
// executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
// else
// reduceBuffer(*forceBuffers, numForceBuffers);
//}
//
//void CudaContext::reduceBuffer(CudaArray<mm_float4>& array, int numBuffers) {
// int bufferSize = array.getSize()/numBuffers;
// reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
// reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
// reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
//}
//
//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// // Recursively tag atoms as belonging to a particular molecule.
//
// atomMolecule[atom] = molecule;
// for (int i = 0; i < (int) atomBonds[atom].size(); i++)
// if (atomMolecule[atomBonds[atom][i]] == -1)
// tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
//}
//
///**
// * This class ensures that atom reordering doesn't break virtual sites.
// */
//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
//public:
// VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
// for (int i = 0; i < system.getNumParticles(); i++) {
// if (system.isVirtualSite(i)) {
// siteTypes.push_back(&typeid(system.getVirtualSite(i)));
// vector<int> particles;
// particles.push_back(i);
// for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
// particles.push_back(system.getVirtualSite(i).getParticle(j));
// siteParticles.push_back(particles);
// vector<double> weights;
// if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A two particle average.
//
// const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// }
// else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A three particle average.
//
// const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// weights.push_back(site.getWeight(2));
// }
// else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// // An out of plane site.
//
// const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight12());
// weights.push_back(site.getWeight13());
// weights.push_back(site.getWeightCross());
// }
// siteWeights.push_back(weights);
// }
// }
// }
// int getNumParticleGroups() {
// return siteTypes.size();
// }
// void getParticlesInGroup(int index, std::vector<int>& particles) {
// particles = siteParticles[index];
// }
// bool areGroupsIdentical(int group1, int group2) {
// if (siteTypes[group1] != siteTypes[group2])
// return false;
// int numParticles = siteWeights[group1].size();
// if (siteWeights[group2].size() != numParticles)
// return false;
// for (int i = 0; i < numParticles; i++)
// if (siteWeights[group1][i] != siteWeights[group2][i])
// return false;
// return true;
// }
//private:
// vector<const type_info*> siteTypes;
// vector<vector<int> > siteParticles;
// vector<vector<double> > siteWeights;
//};
//
//
//void CudaContext::findMoleculeGroups() {
// // The first time this is called, we need to identify all the molecules in the system.
//
// if (moleculeGroups.size() == 0) {
// // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
//
// addForce(new VirtualSiteInfo(system));
//
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
//
// vector<vector<int> > atomBonds(system.getNumParticles());
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// atomBonds[particle1].push_back(particle2);
// atomBonds[particle2].push_back(particle1);
// }
// for (int i = 0; i < (int) forces.size(); i++) {
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// for (int k = 0; k < (int) particles.size(); k++)
// for (int m = 0; m < (int) particles.size(); m++)
// if (k != m)
// atomBonds[particles[k]].push_back(particles[m]);
// }
// }
//
// // Now tag atoms by which molecule they belong to.
//
// vector<int> atomMolecule(numAtoms, -1);
// int numMolecules = 0;
// for (int i = 0; i < numAtoms; i++)
// if (atomMolecule[i] == -1)
// tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
// vector<vector<int> > atomIndices(numMolecules);
// for (int i = 0; i < numAtoms; i++)
// atomIndices[atomMolecule[i]].push_back(i);
//
// // Construct a description of each molecule.
//
// molecules.resize(numMolecules);
// for (int i = 0; i < numMolecules; i++) {
// molecules[i].atoms = atomIndices[i];
// molecules[i].groups.resize(forces.size());
// }
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// molecules[atomMolecule[particle1]].constraints.push_back(i);
// }
// for (int i = 0; i < (int) forces.size(); i++)
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
// }
// }
//
// // Sort them into groups of identical molecules.
//
// vector<Molecule> uniqueMolecules;
// vector<vector<int> > moleculeInstances;
// vector<vector<int> > moleculeOffsets;
// for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
// Molecule& mol = molecules[molIndex];
//
// // See if it is identical to another molecule.
//
// bool isNew = true;
// for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
// Molecule& mol2 = uniqueMolecules[j];
// bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
//
// // See if the atoms are identical.
//
// int atomOffset = mol2.atoms[0]-mol.atoms[0];
// for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
// if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
// identical = false;
// for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
// identical = false;
// }
//
// // See if the constraints are identical.
//
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
// int c1particle1, c1particle2, c2particle1, c2particle2;
// double distance1, distance2;
// system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
// system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
// if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
// identical = false;
// }
//
// // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && identical; i++) {
// if (mol.groups[i].size() != mol2.groups[i].size())
// identical = false;
// for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
// if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
// identical = false;
// }
// if (identical) {
// moleculeInstances[j].push_back(molIndex);
// moleculeOffsets[j].push_back(mol.atoms[0]);
// isNew = false;
// }
// }
// if (isNew) {
// uniqueMolecules.push_back(mol);
// moleculeInstances.push_back(vector<int>());
// moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
// moleculeOffsets.push_back(vector<int>());
// moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
// }
// }
// moleculeGroups.resize(moleculeInstances.size());
// for (int i = 0; i < (int) moleculeInstances.size(); i++)
// {
// moleculeGroups[i].instances = moleculeInstances[i];
// moleculeGroups[i].offsets = moleculeOffsets[i];
// vector<int>& atoms = uniqueMolecules[i].atoms;
// moleculeGroups[i].atoms.resize(atoms.size());
// for (int j = 0; j < (int) atoms.size(); j++)
// moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
// }
//}
//
//void CudaContext::invalidateMolecules() {
// moleculesInvalid = true;
//}
//
//
//void OpenCLContext::validateMolecules() {
// moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
// return;
// bool valid = true;
// for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
// MoleculeGroup& mol = moleculeGroups[group];
// vector<int>& instances = mol.instances;
// vector<int>& offsets = mol.offsets;
// vector<int>& atoms = mol.atoms;
// int numMolecules = instances.size();
// Molecule& m1 = molecules[instances[0]];
// int offset1 = offsets[0];
// for (int j = 1; valid && j < numMolecules; j++) {
// // See if the atoms are identical.
//
// Molecule& m2 = molecules[instances[j]];
// int offset2 = offsets[j];
// for (int i = 0; i < (int) atoms.size() && valid; i++) {
// for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
// valid = false;
// }
//
// // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && valid; i++) {
// for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
// if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
// valid = false;
// }
// }
// }
// if (valid)
// return;
//
// // The list of which molecules are identical is no longer valid. We need to restore the
// // atoms to their original order, rebuild the list of identical molecules, and sort them
// // again.
//
// vector<mm_float4> newPosq(numAtoms);
// vector<mm_float4> newVelm(numAtoms);
// vector<mm_int4> newCellOffsets(numAtoms);
// posq->download();
// velm->download();
// for (int i = 0; i < numAtoms; i++) {
// int index = atomIndex->get(i);
// newPosq[index] = posq->get(i);
// newVelm[index] = velm->get(i);
// newCellOffsets[index] = posCellOffsets[i];
// }
// for (int i = 0; i < numAtoms; i++) {
// posq->set(i, newPosq[i]);
// velm->set(i, newVelm[i]);
// atomIndex->set(i, i);
// posCellOffsets[i] = newCellOffsets[i];
// }
// posq->upload();
// velm->upload();
// atomIndex->upload();
// findMoleculeGroups();
// for (int i = 0; i < (int) reorderListeners.size(); i++)
// reorderListeners[i]->execute();
//}
//
//void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
// return;
// if (moleculesInvalid)
// validateMolecules();
// atomsWereReordered = true;
//
// // Find the range of positions and the number of bins along each axis.
//
// posq->download();
// velm->download();
// float minx = posq->get(0).x, maxx = posq->get(0).x;
// float miny = posq->get(0).y, maxy = posq->get(0).y;
// float minz = posq->get(0).z, maxz = posq->get(0).z;
// if (nonbonded->getUsePeriodic()) {
// minx = miny = minz = 0.0;
// maxx = periodicBoxSize.x;
// maxy = periodicBoxSize.y;
// maxz = periodicBoxSize.z;
// }
// else {
// for (int i = 1; i < numAtoms; i++) {
// const mm_float4& pos = posq->get(i);
// minx = min(minx, pos.x);
// maxx = max(maxx, pos.x);
// miny = min(miny, pos.y);
// maxy = max(maxy, pos.y);
// minz = min(minz, pos.z);
// maxz = max(maxz, pos.z);
// }
// }
//
// // Loop over each group of identical molecules and reorder them.
//
// vector<int> originalIndex(numAtoms);
// vector<mm_float4> newPosq(numAtoms);
// vector<mm_float4> newVelm(numAtoms);
// vector<mm_int4> newCellOffsets(numAtoms);
// for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// // Find the center of each molecule.
//
// MoleculeGroup& mol = moleculeGroups[group];
// int numMolecules = mol.offsets.size();
// vector<int>& atoms = mol.atoms;
// vector<mm_float4> molPos(numMolecules);
// float invNumAtoms = 1.0f/atoms.size();
// for (int i = 0; i < numMolecules; i++) {
// molPos[i].x = 0.0f;
// molPos[i].y = 0.0f;
// molPos[i].z = 0.0f;
// for (int j = 0; j < (int)atoms.size(); j++) {
// int atom = atoms[j]+mol.offsets[i];
// const mm_float4& pos = posq->get(atom);
// molPos[i].x += pos.x;
// molPos[i].y += pos.y;
// molPos[i].z += pos.z;
// }
// molPos[i].x *= invNumAtoms;
// molPos[i].y *= invNumAtoms;
// molPos[i].z *= invNumAtoms;
// }
// if (nonbonded->getUsePeriodic()) {
// // Move each molecule position into the same box.
//
// for (int i = 0; i < numMolecules; i++) {
// int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
// int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
// int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
// float dx = xcell*periodicBoxSize.x;
// float dy = ycell*periodicBoxSize.y;
// float dz = zcell*periodicBoxSize.z;
// if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
// molPos[i].x -= dx;
// molPos[i].y -= dy;
// molPos[i].z -= dz;
// if (enforcePeriodic) {
// for (int j = 0; j < (int) atoms.size(); j++) {
// int atom = atoms[j]+mol.offsets[i];
// mm_float4 p = posq->get(atom);
// p.x -= dx;
// p.y -= dy;
// p.z -= dz;
// posq->set(atom, p);
// posCellOffsets[atom].x -= xcell;
// posCellOffsets[atom].y -= ycell;
// posCellOffsets[atom].z -= zcell;
// }
// }
// }
// }
// }
//
// // Select a bin for each molecule, then sort them by bin.
//
// bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
// float binWidth;
// if (useHilbert)
// binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
// else
// binWidth = (float)(0.2*nonbonded->getCutoffDistance());
// float invBinWidth = 1.0f/binWidth;
// int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
// int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
// vector<pair<int, int> > molBins(numMolecules);
// bitmask_t coords[3];
// for (int i = 0; i < numMolecules; i++) {
// int x = (int) ((molPos[i].x-minx)*invBinWidth);
// int y = (int) ((molPos[i].y-miny)*invBinWidth);
// int z = (int) ((molPos[i].z-minz)*invBinWidth);
// int bin;
// if (useHilbert) {
// coords[0] = x;
// coords[1] = y;
// coords[2] = z;
// bin = (int) hilbert_c2i(3, 8, coords);
// }
// else {
// int yodd = y&1;
// int zodd = z&1;
// bin = z*xbins*ybins;
// bin += (zodd ? ybins-y : y)*xbins;
// bin += (yodd ? xbins-x : x);
// }
// molBins[i] = pair<int, int>(bin, i);
// }
// sort(molBins.begin(), molBins.end());
//
// // Reorder the atoms.
//
// for (int i = 0; i < numMolecules; i++) {
// for (int j = 0; j < (int)atoms.size(); j++) {
// int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
// int newIndex = mol.offsets[i]+atoms[j];
// originalIndex[newIndex] = atomIndex->get(oldIndex);
// newPosq[newIndex] = posq->get(oldIndex);
// newVelm[newIndex] = velm->get(oldIndex);
// newCellOffsets[newIndex] = posCellOffsets[oldIndex];
// }
// }
// }
//
// // Update the streams.
//
// for (int i = 0; i < numAtoms; i++) {
// posq->set(i, newPosq[i]);
// velm->set(i, newVelm[i]);
// atomIndex->set(i, originalIndex[i]);
// posCellOffsets[i] = newCellOffsets[i];
// }
// posq->upload();
// velm->upload();
// atomIndex->upload();
// for (int i = 0; i < (int) reorderListeners.size(); i++)
// reorderListeners[i]->execute();
//}
struct
CudaContext
::
WorkThread
::
ThreadData
{
ThreadData
(
std
::
queue
<
CudaContext
::
WorkTask
*>&
tasks
,
bool
&
waiting
,
bool
&
finished
,
pthread_mutex_t
&
queueLock
,
pthread_cond_t
&
waitForTaskCondition
,
pthread_cond_t
&
queueEmptyCondition
)
:
tasks
(
tasks
),
waiting
(
waiting
),
finished
(
finished
),
queueLock
(
queueLock
),
waitForTaskCondition
(
waitForTaskCondition
),
queueEmptyCondition
(
queueEmptyCondition
)
{
}
std
::
queue
<
CudaContext
::
WorkTask
*>&
tasks
;
bool
&
waiting
;
bool
&
finished
;
pthread_mutex_t
&
queueLock
;
pthread_cond_t
&
waitForTaskCondition
;
pthread_cond_t
&
queueEmptyCondition
;
};
static
void
*
threadBody
(
void
*
args
)
{
CudaContext
::
WorkThread
::
ThreadData
&
data
=
*
reinterpret_cast
<
CudaContext
::
WorkThread
::
ThreadData
*>
(
args
);
while
(
!
data
.
finished
||
data
.
tasks
.
size
()
>
0
)
{
pthread_mutex_lock
(
&
data
.
queueLock
);
while
(
data
.
tasks
.
empty
()
&&
!
data
.
finished
)
{
data
.
waiting
=
true
;
pthread_cond_signal
(
&
data
.
queueEmptyCondition
);
pthread_cond_wait
(
&
data
.
waitForTaskCondition
,
&
data
.
queueLock
);
}
CudaContext
::
WorkTask
*
task
=
NULL
;
if
(
!
data
.
tasks
.
empty
())
{
data
.
waiting
=
false
;
task
=
data
.
tasks
.
front
();
data
.
tasks
.
pop
();
}
pthread_mutex_unlock
(
&
data
.
queueLock
);
if
(
task
!=
NULL
)
{
task
->
execute
();
delete
task
;
}
}
data
.
waiting
=
true
;
pthread_cond_signal
(
&
data
.
queueEmptyCondition
);
delete
&
data
;
return
0
;
}
CudaContext
::
WorkThread
::
WorkThread
()
:
waiting
(
true
),
finished
(
false
)
{
pthread_mutex_init
(
&
queueLock
,
NULL
);
pthread_cond_init
(
&
waitForTaskCondition
,
NULL
);
pthread_cond_init
(
&
queueEmptyCondition
,
NULL
);
ThreadData
*
data
=
new
ThreadData
(
tasks
,
waiting
,
finished
,
queueLock
,
waitForTaskCondition
,
queueEmptyCondition
);
pthread_create
(
&
thread
,
NULL
,
threadBody
,
data
);
}
CudaContext
::
WorkThread
::~
WorkThread
()
{
pthread_mutex_lock
(
&
queueLock
);
finished
=
true
;
pthread_cond_broadcast
(
&
waitForTaskCondition
);
pthread_mutex_unlock
(
&
queueLock
);
pthread_join
(
thread
,
NULL
);
pthread_mutex_destroy
(
&
queueLock
);
pthread_cond_destroy
(
&
waitForTaskCondition
);
pthread_cond_destroy
(
&
queueEmptyCondition
);
}
void
CudaContext
::
WorkThread
::
addTask
(
CudaContext
::
WorkTask
*
task
)
{
pthread_mutex_lock
(
&
queueLock
);
tasks
.
push
(
task
);
waiting
=
false
;
pthread_cond_signal
(
&
waitForTaskCondition
);
pthread_mutex_unlock
(
&
queueLock
);
}
bool
CudaContext
::
WorkThread
::
isWaiting
()
{
return
waiting
;
}
bool
CudaContext
::
WorkThread
::
isFinished
()
{
return
finished
;
}
void
CudaContext
::
WorkThread
::
flush
()
{
pthread_mutex_lock
(
&
queueLock
);
while
(
!
waiting
)
pthread_cond_wait
(
&
queueEmptyCondition
,
&
queueLock
);
pthread_mutex_unlock
(
&
queueLock
);
}
platforms/cuda2/src/CudaContext.h
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDACONTEXT_H_
#define OPENMM_CUDACONTEXT_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <map>
#include <queue>
#include <string>
#include <pthread.h>
#define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code.
#define NOMINMAX
#endif
#include <cuda.h>
#include <builtin_types.h>
#include <vector_functions.h>
#include "openmm/internal/windowsExport.h"
#include "CudaPlatform.h"
namespace
OpenMM
{
class
CudaArray
;
class
CudaForceInfo
;
class
CudaIntegrationUtilities
;
class
CudaBondedUtilities
;
class
CudaNonbondedUtilities
;
class
System
;
/**
* This class contains the information associated with a Context by the CUDA Platform. Each CudaContext is
* specific to a particular device, and manages data structures and kernels for that device. When running a simulation
* in parallel on multiple devices, there is a separate CudaContext for each one. The list of all contexts is
* stored in the CudaPlatform::PlatformData.
* <p>
* In addition, a worker thread is created for each CudaContext. This is used for parallel computations, so that
* blocking calls to one device will not block other devices. When only a single device is being used, the worker
* thread is not used and calculations are performed on the main application thread.
*/
class
OPENMM_EXPORT
CudaContext
{
public:
class
WorkTask
;
class
WorkThread
;
class
ReorderListener
;
static
const
int
ThreadBlockSize
;
static
const
int
TileSize
;
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
const
std
::
string
&
compiler
,
const
std
::
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
);
~
CudaContext
();
// /**
// * This is called to initialize internal data structures after all Forces in the system
// * have been initialized.
// */
// void initialize();
/**
* Add a CudaForce to this context.
*/
void
addForce
(
CudaForceInfo
*
force
);
/**
* Get the CUcontext associated with this object.
*/
CUcontext
getContext
()
{
return
context
;
}
/**
* Get the CUdevice associated with this object.
*/
CUdevice
getDevice
()
{
return
device
;
}
/**
* Get the index of the CUdevice associated with this object.
*/
int
getDeviceIndex
()
{
return
deviceIndex
;
}
/**
* Get the PlatformData object this context is part of.
*/
CudaPlatform
::
PlatformData
&
getPlatformData
()
{
return
platformData
;
}
/**
* Get the index of this context in the list stored in the PlatformData.
*/
int
getContextIndex
()
const
{
return
contextIndex
;
}
/**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/
CudaArray
&
getPosq
()
{
return
*
posq
;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
CudaArray
&
getVelm
()
{
return
*
velm
;
}
// /**
// * Get the array which contains the force on each atom.
// */
// CudaArray<mm_float4>& getForce() {
// return *force;
// }
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
// CudaArray<mm_float4>& getForceBuffers() {
// return *forceBuffers;
// }
// /**
// * Get the array which contains a contribution to each force represented as 64 bit fixed point.
// */
// CudaArray<cl_long>& getLongForceBuffer() {
// return *longForceBuffer;
// }
// /**
// * Get the array which contains the buffer in which energy is computed.
// */
// CudaArray<cl_float>& getEnergyBuffer() {
// return *energyBuffer;
// }
// /**
// * Get the array which contains the index of each atom.
// */
// CudaArray<cl_int>& getAtomIndex() {
// return *atomIndex;
// }
// /**
// * Get the number of cells by which the positions are offset.
// */
// std::vector<mm_int4>& getPosCellOffsets() {
// return posCellOffsets;
// }
/**
* Replace all occurrences of a list of substrings.
*
* @param input a string to process
* @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string
* @return a new string produced by performing the replacements
*/
std
::
string
replaceStrings
(
const
std
::
string
&
input
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
replacements
)
const
;
/**
* Create a CUDA module from source code.
*
* @param source the source code of the module
* @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is
* omitted, a default set of options will be used
*/
CUmodule
createModule
(
const
std
::
string
source
,
const
char
*
optimizationFlags
=
NULL
);
/**
* Create a CUDA module from source code.
*
* @param source the source code of the module
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
* @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is
* omitted, a default set of options will be used
*/
CUmodule
createModule
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
,
const
char
*
optimizationFlags
=
NULL
);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<float>& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<mm_float4>& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
// void clearAutoclearBuffers();
// /**
// * Given a collection of buffers packed into an array, sum them and store
// * the sum in the first buffer.
// *
// * @param array the array containing the buffers to reduce
// * @param numBuffers the number of buffers packed into the array
// */
// void reduceBuffer(CudaArray<mm_float4>& array, int numBuffers);
// /**
// * Sum the buffesr containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
// /**
// * Get the size of the periodic box.
// */
// mm_float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
// mm_float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
// * Get the CudaIntegrationUtilities for this context.
// */
// CudaIntegrationUtilities& getIntegrationUtilities() {
// return *integration;
// }
// /**
// * Get the CudaBondedUtilities for this context.
// */
// CudaBondedUtilities& getBondedUtilities() {
// return *bonded;
// }
// /**
// * Get the CudaNonbondedUtilities for this context.
// */
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
// /**
// * Get the thread used by this context for executing parallel computations.
// */
// WorkThread& getWorkThread() {
// return *thread;
// }
// /**
// * Get whether atoms were reordered during the most recent force/energy computation.
// */
// bool getAtomsWereReordered() const {
// return atomsWereReordered;
// }
// /**
// * Set whether atoms were reordered during the most recent force/energy computation.
// */
// void setAtomsWereReordered(bool wereReordered) {
// atomsWereReordered = wereReordered;
// }
// /**
// * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
// * together in the arrays.
// *
// * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
// */
// void reorderAtoms(bool enforcePeriodic);
// /**
// * Add a listener that should be called whenever atoms get reordered. The CudaContext
// * assumes ownership of the object, and deletes it when the context itself is deleted.
// */
// void addReorderListener(ReorderListener* listener);
// /**
// * Get the list of ReorderListeners.
// */
// std::vector<ReorderListener*>& getReorderListeners() {
// return reorderListeners;
// }
// /**
// * Mark that the current molecule definitions (and hence the atom order) may be invalid.
// * This should be called whenever force field parameters change. It will cause the definitions
// * and order to be revalidated the next to reorderAtoms() is called.
// */
// void invalidateMolecules();
// /**
// * Get whether the current molecule definitions are valid.
// */
// bool getMoleculesAreInvalid() {
// return moleculesInvalid;
// }
private:
struct
Molecule
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
// void findMoleculeGroups();
// static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
// /**
// * Ensure that all molecules marked as "identical" really are identical. This should be
// * called whenever force field parameters change. If necessary, it will rebuild the list
// * of molecules and resort the atoms.
// */
// void validateMolecules();
static
bool
hasInitializedCuda
;
const
System
&
system
;
double
time
;
CudaPlatform
::
PlatformData
&
platformData
;
int
deviceIndex
;
int
contextIndex
;
int
stepCount
;
int
computeForceCount
;
int
numAtoms
;
int
paddedNumAtoms
;
int
numAtomBlocks
;
int
numThreadBlocks
;
// int numForceBuffers;
// int simdWidth;
bool
useBlockingSync
,
useDoublePrecision
,
accumulateInDouble
,
contextIsValid
,
atomsWereReordered
,
moleculesInvalid
;
std
::
string
compiler
,
tempDir
,
gpuArchitecture
;
float4
periodicBoxSize
;
float4
invPeriodicBoxSize
;
std
::
string
defaultOptimizationOptions
;
std
::
map
<
std
::
string
,
std
::
string
>
compilationDefines
;
CUcontext
context
;
CUdevice
device
;
CUfunction
clearBufferKernel
;
CUfunction
clearTwoBuffersKernel
;
CUfunction
clearThreeBuffersKernel
;
CUfunction
clearFourBuffersKernel
;
CUfunction
clearFiveBuffersKernel
;
CUfunction
clearSixBuffersKernel
;
CUfunction
reduceFloat4Kernel
;
CUfunction
reduceForcesKernel
;
std
::
vector
<
CudaForceInfo
*>
forces
;
std
::
vector
<
Molecule
>
molecules
;
std
::
vector
<
MoleculeGroup
>
moleculeGroups
;
std
::
vector
<
int4
>
posCellOffsets
;
CudaArray
*
posq
;
CudaArray
*
velm
;
// CudaArray<mm_float4>* force;
// CudaArray<mm_float4>* forceBuffers;
// CudaArray<cl_long>* longForceBuffer;
// CudaArray<cl_float>* energyBuffer;
// CudaArray<cl_int>* atomIndex;
// std::vector<cl::Memory*> autoclearBuffers;
// std::vector<int> autoclearBufferSizes;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
// CudaNonbondedUtilities* nonbonded;
WorkThread
*
thread
;
};
struct
CudaContext
::
Molecule
{
std
::
vector
<
int
>
atoms
;
std
::
vector
<
int
>
constraints
;
std
::
vector
<
std
::
vector
<
int
>
>
groups
;
};
struct
CudaContext
::
MoleculeGroup
{
std
::
vector
<
int
>
atoms
;
std
::
vector
<
int
>
instances
;
std
::
vector
<
int
>
offsets
;
};
/**
* This abstract class defines a task to be executed on the worker thread.
*/
class
CudaContext
::
WorkTask
{
public:
virtual
void
execute
()
=
0
;
virtual
~
WorkTask
()
{
}
};
class
CudaContext
::
WorkThread
{
public:
struct
ThreadData
;
WorkThread
();
~
WorkThread
();
/**
* Request that a task be executed on the worker thread. The argument should have been allocated on the
* heap with the "new" operator. After its execute() method finishes, the object will be deleted automatically.
*/
void
addTask
(
CudaContext
::
WorkTask
*
task
);
/**
* Get whether the worker thread is idle, waiting for a task to be added.
*/
bool
isWaiting
();
/**
* Get whether the worker thread has exited.
*/
bool
isFinished
();
/**
* Block until all tasks have finished executing and the worker thread is idle.
*/
void
flush
();
private:
std
::
queue
<
CudaContext
::
WorkTask
*>
tasks
;
bool
waiting
,
finished
;
pthread_mutex_t
queueLock
;
pthread_cond_t
waitForTaskCondition
,
queueEmptyCondition
;
pthread_t
thread
;
};
/**
* This abstract class defines a function to be executed whenever atoms get reordered.
* Objects that need to know when reordering happens should create a reorderListener
* and register it by calling addReorderListener().
*/
class
CudaContext
::
ReorderListener
{
public:
virtual
void
execute
()
=
0
;
virtual
~
ReorderListener
()
{
}
};
}
// namespace OpenMM
#endif
/*OPENMM_CUDACONTEXT_H_*/
platforms/cuda2/src/CudaExpressionUtilities.cpp
0 → 100644
View file @
abb8cb4b
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaExpressionUtilities.h"
#include "openmm/OpenMMException.h"
#include "openmm/internal/SplineFitter.h"
#include "lepton/Operation.h"
using
namespace
OpenMM
;
using
namespace
Lepton
;
using
namespace
std
;
string
CudaExpressionUtilities
::
doubleToString
(
double
value
)
{
stringstream
s
;
s
.
precision
(
8
);
s
<<
scientific
<<
value
<<
"f"
;
return
s
.
str
();
}
string
CudaExpressionUtilities
::
intToString
(
int
value
)
{
stringstream
s
;
s
<<
value
;
return
s
.
str
();
}
string
CudaExpressionUtilities
::
createExpressions
(
const
map
<
string
,
ParsedExpression
>&
expressions
,
const
map
<
string
,
string
>&
variables
,
const
vector
<
pair
<
string
,
string
>
>&
functions
,
const
string
&
prefix
,
const
string
&
functionParams
,
const
string
&
tempType
)
{
vector
<
pair
<
ExpressionTreeNode
,
string
>
>
variableNodes
;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
variables
.
begin
();
iter
!=
variables
.
end
();
++
iter
)
variableNodes
.
push_back
(
make_pair
(
ExpressionTreeNode
(
new
Operation
::
Variable
(
iter
->
first
)),
iter
->
second
));
return
createExpressions
(
expressions
,
variableNodes
,
functions
,
prefix
,
functionParams
,
tempType
);
}
string
CudaExpressionUtilities
::
createExpressions
(
const
map
<
string
,
ParsedExpression
>&
expressions
,
const
vector
<
pair
<
ExpressionTreeNode
,
string
>
>&
variables
,
const
vector
<
pair
<
string
,
string
>
>&
functions
,
const
string
&
prefix
,
const
string
&
functionParams
,
const
string
&
tempType
)
{
stringstream
out
;
vector
<
ParsedExpression
>
allExpressions
;
for
(
map
<
string
,
ParsedExpression
>::
const_iterator
iter
=
expressions
.
begin
();
iter
!=
expressions
.
end
();
++
iter
)
allExpressions
.
push_back
(
iter
->
second
);
vector
<
pair
<
ExpressionTreeNode
,
string
>
>
temps
=
variables
;
for
(
map
<
string
,
ParsedExpression
>::
const_iterator
iter
=
expressions
.
begin
();
iter
!=
expressions
.
end
();
++
iter
)
{
processExpression
(
out
,
iter
->
second
.
getRootNode
(),
temps
,
functions
,
prefix
,
functionParams
,
allExpressions
,
tempType
);
out
<<
iter
->
first
<<
getTempName
(
iter
->
second
.
getRootNode
(),
temps
)
<<
";
\n
"
;
}
return
out
.
str
();
}
void
CudaExpressionUtilities
::
processExpression
(
stringstream
&
out
,
const
ExpressionTreeNode
&
node
,
vector
<
pair
<
ExpressionTreeNode
,
string
>
>&
temps
,
const
vector
<
pair
<
string
,
string
>
>&
functions
,
const
string
&
prefix
,
const
string
&
functionParams
,
const
vector
<
ParsedExpression
>&
allExpressions
,
const
string
&
tempType
)
{
for
(
int
i
=
0
;
i
<
(
int
)
temps
.
size
();
i
++
)
if
(
temps
[
i
].
first
==
node
)
return
;
for
(
int
i
=
0
;
i
<
(
int
)
node
.
getChildren
().
size
();
i
++
)
processExpression
(
out
,
node
.
getChildren
()[
i
],
temps
,
functions
,
prefix
,
functionParams
,
allExpressions
,
tempType
);
string
name
=
prefix
+
intToString
(
temps
.
size
());
bool
hasRecordedNode
=
false
;
out
<<
tempType
<<
" "
<<
name
<<
" = "
;
switch
(
node
.
getOperation
().
getId
())
{
case
Operation
::
CONSTANT
:
out
<<
doubleToString
(
dynamic_cast
<
const
Operation
::
Constant
*>
(
&
node
.
getOperation
())
->
getValue
());
break
;
case
Operation
::
VARIABLE
:
throw
OpenMMException
(
"Unknown variable in expression: "
+
node
.
getOperation
().
getName
());
case
Operation
::
CUSTOM
:
{
int
i
;
for
(
i
=
0
;
i
<
(
int
)
functions
.
size
()
&&
functions
[
i
].
first
!=
node
.
getOperation
().
getName
();
i
++
)
;
if
(
i
==
functions
.
size
())
throw
OpenMMException
(
"Unknown function in expression: "
+
node
.
getOperation
().
getName
());
bool
isDeriv
=
(
dynamic_cast
<
const
Operation
::
Custom
*>
(
&
node
.
getOperation
())
->
getDerivOrder
()[
0
]
==
1
);
out
<<
"0.0f;
\n
"
;
temps
.
push_back
(
make_pair
(
node
,
name
));
hasRecordedNode
=
true
;
// If both the value and derivative of the function are needed, it's faster to calculate them both
// at once, so check to see if both are needed.
const
ExpressionTreeNode
*
valueNode
=
NULL
;
const
ExpressionTreeNode
*
derivNode
=
NULL
;
for
(
int
j
=
0
;
j
<
(
int
)
allExpressions
.
size
();
j
++
)
findRelatedTabulatedFunctions
(
node
,
allExpressions
[
j
].
getRootNode
(),
valueNode
,
derivNode
);
string
valueName
=
name
;
string
derivName
=
name
;
if
(
valueNode
!=
NULL
&&
derivNode
!=
NULL
)
{
string
name2
=
prefix
+
intToString
(
temps
.
size
());
out
<<
tempType
<<
" "
<<
name2
<<
" = 0.0f;
\n
"
;
if
(
isDeriv
)
{
valueName
=
name2
;
temps
.
push_back
(
make_pair
(
*
valueNode
,
name2
));
}
else
{
derivName
=
name2
;
temps
.
push_back
(
make_pair
(
*
derivNode
,
name2
));
}
}
out
<<
"{
\n
"
;
out
<<
"float4 params = "
<<
functionParams
<<
"["
<<
i
<<
"];
\n
"
;
out
<<
"float x = "
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
";
\n
"
;
out
<<
"if (x >= params.x && x <= params.y) {
\n
"
;
out
<<
"x = (x-params.x)*params.z;
\n
"
;
out
<<
"int index = (int) (floor(x));
\n
"
;
out
<<
"index = min(index, (int) params.w);
\n
"
;
out
<<
"float4 coeff = "
<<
functions
[
i
].
second
<<
"[index];
\n
"
;
out
<<
"float b = x-index;
\n
"
;
out
<<
"float a = 1.0f-b;
\n
"
;
if
(
valueNode
!=
NULL
)
out
<<
valueName
<<
" = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);
\n
"
;
if
(
derivNode
!=
NULL
)
out
<<
derivName
<<
" = (coeff.y-coeff.x)*params.z+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/params.z;
\n
"
;
out
<<
"}
\n
"
;
out
<<
"}"
;
break
;
}
case
Operation
::
ADD
:
out
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
"+"
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
);
break
;
case
Operation
::
SUBTRACT
:
out
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
"-"
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
);
break
;
case
Operation
::
MULTIPLY
:
out
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
"*"
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
);
break
;
case
Operation
::
DIVIDE
:
{
bool
haveReciprocal
=
false
;
for
(
int
i
=
0
;
i
<
(
int
)
temps
.
size
();
i
++
)
if
(
temps
[
i
].
first
.
getOperation
().
getId
()
==
Operation
::
RECIPROCAL
&&
temps
[
i
].
first
.
getChildren
()[
0
]
==
node
.
getChildren
()[
1
])
{
haveReciprocal
=
true
;
out
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
"*"
<<
temps
[
i
].
second
;
}
if
(
!
haveReciprocal
)
out
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
"/"
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
);
break
;
}
case
Operation
::
POWER
:
out
<<
"pow("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
", "
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
")"
;
break
;
case
Operation
::
NEGATE
:
out
<<
"-"
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
);
break
;
case
Operation
::
SQRT
:
out
<<
"sqrt("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
EXP
:
out
<<
"EXP("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
LOG
:
out
<<
"LOG("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
SIN
:
out
<<
"sin("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
COS
:
out
<<
"cos("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
SEC
:
out
<<
"1.0f/cos("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
CSC
:
out
<<
"1.0f/sin("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
TAN
:
out
<<
"tan("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
COT
:
out
<<
"1.0f/tan("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
ASIN
:
out
<<
"asin("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
ACOS
:
out
<<
"acos("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
ATAN
:
out
<<
"atan("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
SINH
:
out
<<
"sinh("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
COSH
:
out
<<
"cosh("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
TANH
:
out
<<
"tanh("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
ERF
:
out
<<
"erf("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
ERFC
:
out
<<
"erfc("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
STEP
:
out
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
" >= 0.0f ? 1.0f : 0.0f"
;
break
;
case
Operation
::
DELTA
:
out
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
" == 0.0f ? 1.0f : 0.0f"
;
break
;
case
Operation
::
SQUARE
:
{
string
arg
=
getTempName
(
node
.
getChildren
()[
0
],
temps
);
out
<<
arg
<<
"*"
<<
arg
;
break
;
}
case
Operation
::
CUBE
:
{
string
arg
=
getTempName
(
node
.
getChildren
()[
0
],
temps
);
out
<<
arg
<<
"*"
<<
arg
<<
"*"
<<
arg
;
break
;
}
case
Operation
::
RECIPROCAL
:
out
<<
"RECIP("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
ADD_CONSTANT
:
out
<<
doubleToString
(
dynamic_cast
<
const
Operation
::
AddConstant
*>
(
&
node
.
getOperation
())
->
getValue
())
<<
"+"
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
);
break
;
case
Operation
::
MULTIPLY_CONSTANT
:
out
<<
doubleToString
(
dynamic_cast
<
const
Operation
::
MultiplyConstant
*>
(
&
node
.
getOperation
())
->
getValue
())
<<
"*"
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
);
break
;
case
Operation
::
POWER_CONSTANT
:
{
double
exponent
=
dynamic_cast
<
const
Operation
::
PowerConstant
*>
(
&
node
.
getOperation
())
->
getValue
();
if
(
exponent
==
0.0
)
out
<<
"1.0f"
;
else
if
(
exponent
==
(
int
)
exponent
)
{
out
<<
"0.0f;
\n
"
;
temps
.
push_back
(
make_pair
(
node
,
name
));
hasRecordedNode
=
true
;
// If multiple integral powers of the same base are needed, it's faster to calculate all of them
// at once, so check to see if others are also needed.
map
<
int
,
const
ExpressionTreeNode
*>
powers
;
powers
[(
int
)
exponent
]
=
&
node
;
for
(
int
j
=
0
;
j
<
(
int
)
allExpressions
.
size
();
j
++
)
findRelatedPowers
(
node
,
allExpressions
[
j
].
getRootNode
(),
powers
);
vector
<
int
>
exponents
;
vector
<
string
>
names
;
vector
<
bool
>
hasAssigned
(
powers
.
size
(),
false
);
exponents
.
push_back
((
int
)
fabs
(
exponent
));
names
.
push_back
(
name
);
for
(
map
<
int
,
const
ExpressionTreeNode
*>::
const_iterator
iter
=
powers
.
begin
();
iter
!=
powers
.
end
();
++
iter
)
{
if
(
iter
->
first
!=
exponent
)
{
exponents
.
push_back
(
iter
->
first
>=
0
?
iter
->
first
:
-
iter
->
first
);
string
name2
=
prefix
+
intToString
(
temps
.
size
());
names
.
push_back
(
name2
);
temps
.
push_back
(
make_pair
(
*
iter
->
second
,
name2
));
out
<<
tempType
<<
" "
<<
name2
<<
" = 0.0f;
\n
"
;
}
}
out
<<
"{
\n
"
;
out
<<
"float multiplier = "
<<
(
exponent
<
0.0
?
"1.0f/"
:
""
)
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
";
\n
"
;
bool
done
=
false
;
while
(
!
done
)
{
done
=
true
;
for
(
int
i
=
0
;
i
<
(
int
)
exponents
.
size
();
i
++
)
{
if
(
exponents
[
i
]
%
2
==
1
)
{
if
(
!
hasAssigned
[
i
])
out
<<
names
[
i
]
<<
" = multiplier;
\n
"
;
else
out
<<
names
[
i
]
<<
" *= multiplier;
\n
"
;
hasAssigned
[
i
]
=
true
;
}
exponents
[
i
]
>>=
1
;
if
(
exponents
[
i
]
!=
0
)
done
=
false
;
}
if
(
!
done
)
out
<<
"multiplier *= multiplier;
\n
"
;
}
out
<<
"}"
;
}
else
out
<<
"pow("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
", "
<<
doubleToString
(
exponent
)
<<
")"
;
break
;
}
case
Operation
::
MIN
:
out
<<
"min("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
", "
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
")"
;
break
;
case
Operation
::
MAX
:
out
<<
"max("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
", "
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
")"
;
break
;
case
Operation
::
ABS
:
out
<<
"fabs("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
default:
throw
OpenMMException
(
"Internal error: Unknown operation in user-defined expression: "
+
node
.
getOperation
().
getName
());
}
out
<<
";
\n
"
;
if
(
!
hasRecordedNode
)
temps
.
push_back
(
make_pair
(
node
,
name
));
}
string
CudaExpressionUtilities
::
getTempName
(
const
ExpressionTreeNode
&
node
,
const
vector
<
pair
<
ExpressionTreeNode
,
string
>
>&
temps
)
{
for
(
int
i
=
0
;
i
<
(
int
)
temps
.
size
();
i
++
)
if
(
temps
[
i
].
first
==
node
)
return
temps
[
i
].
second
;
stringstream
out
;
out
<<
"Internal error: No temporary variable for expression node: "
<<
node
;
throw
OpenMMException
(
out
.
str
());
}
void
CudaExpressionUtilities
::
findRelatedTabulatedFunctions
(
const
ExpressionTreeNode
&
node
,
const
ExpressionTreeNode
&
searchNode
,
const
ExpressionTreeNode
*&
valueNode
,
const
ExpressionTreeNode
*&
derivNode
)
{
if
(
searchNode
.
getOperation
().
getId
()
==
Operation
::
CUSTOM
&&
node
.
getChildren
()[
0
]
==
searchNode
.
getChildren
()[
0
])
{
if
(
dynamic_cast
<
const
Operation
::
Custom
*>
(
&
searchNode
.
getOperation
())
->
getDerivOrder
()[
0
]
==
0
)
valueNode
=
&
searchNode
;
else
derivNode
=
&
searchNode
;
}
else
for
(
int
i
=
0
;
i
<
(
int
)
searchNode
.
getChildren
().
size
();
i
++
)
findRelatedTabulatedFunctions
(
node
,
searchNode
.
getChildren
()[
i
],
valueNode
,
derivNode
);
}
void
CudaExpressionUtilities
::
findRelatedPowers
(
const
ExpressionTreeNode
&
node
,
const
ExpressionTreeNode
&
searchNode
,
map
<
int
,
const
ExpressionTreeNode
*>&
powers
)
{
if
(
searchNode
.
getOperation
().
getId
()
==
Operation
::
POWER_CONSTANT
&&
node
.
getChildren
()[
0
]
==
searchNode
.
getChildren
()[
0
])
{
double
realPower
=
dynamic_cast
<
const
Operation
::
PowerConstant
*>
(
&
searchNode
.
getOperation
())
->
getValue
();
int
power
=
(
int
)
realPower
;
if
(
power
!=
realPower
)
return
;
// We are only interested in integer powers.
if
(
powers
.
find
(
power
)
!=
powers
.
end
())
return
;
// This power is already in the map.
if
(
powers
.
begin
()
->
first
*
power
<
0
)
return
;
// All powers must have the same sign.
powers
[
power
]
=
&
searchNode
;
}
else
for
(
int
i
=
0
;
i
<
(
int
)
searchNode
.
getChildren
().
size
();
i
++
)
findRelatedPowers
(
node
,
searchNode
.
getChildren
()[
i
],
powers
);
}
vector
<
float4
>
CudaExpressionUtilities
::
computeFunctionCoefficients
(
const
vector
<
double
>&
values
,
double
min
,
double
max
)
{
// Compute the spline coefficients.
int
numValues
=
values
.
size
();
vector
<
double
>
x
(
numValues
),
derivs
;
for
(
int
i
=
0
;
i
<
numValues
;
i
++
)
x
[
i
]
=
min
+
i
*
(
max
-
min
)
/
(
numValues
-
1
);
SplineFitter
::
createNaturalSpline
(
x
,
values
,
derivs
);
vector
<
float4
>
f
(
numValues
-
1
);
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
()
-
1
;
i
++
)
f
[
i
]
=
make_float4
((
float
)
values
[
i
],
(
float
)
values
[
i
+
1
],
(
float
)
(
derivs
[
i
]
/
6.0
),
(
float
)
(
derivs
[
i
+
1
]
/
6.0
));
return
f
;
}
platforms/cuda2/src/CudaExpressionUtilities.h
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDAEXPRESSIONUTILITIES_H_
#define OPENMM_CUDAEXPRESSIONUTILITIES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaContext.h"
#include "lepton/CustomFunction.h"
#include "lepton/ExpressionTreeNode.h"
#include "lepton/ParsedExpression.h"
#include <map>
#include <sstream>
#include <string>
#include <utility>
namespace
OpenMM
{
/**
* This class is used by various classes to generate CUDA source code implementing
* user defined mathematical expressions.
*/
class
OPENMM_EXPORT
CudaExpressionUtilities
{
public:
/**
* Generate the source code for calculating a set of expressions.
*
* @param expressions the expressions to generate code for (keys are the variables to store the output values in)
* @param variables defines the source code to generate for each variable that may appear in the expressions. Keys are
* variable names, and the values are the code to generate for them.
* @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
*/
static
std
::
string
createExpressions
(
const
std
::
map
<
std
::
string
,
Lepton
::
ParsedExpression
>&
expressions
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
variables
,
const
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>
>&
functions
,
const
std
::
string
&
prefix
,
const
std
::
string
&
functionParams
,
const
std
::
string
&
tempType
=
"float"
);
/**
* Generate the source code for calculating a set of expressions.
*
* @param expressions the expressions to generate code for (keys are the variables to store the output values in)
* @param variables defines the source code to generate for each variable or precomputed sub-expression that may appear in the expressions.
* Each entry is an ExpressionTreeNode, and the code to generate wherever an identical node appears.
* @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
*/
static
std
::
string
createExpressions
(
const
std
::
map
<
std
::
string
,
Lepton
::
ParsedExpression
>&
expressions
,
const
std
::
vector
<
std
::
pair
<
Lepton
::
ExpressionTreeNode
,
std
::
string
>
>&
variables
,
const
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>
>&
functions
,
const
std
::
string
&
prefix
,
const
std
::
string
&
functionParams
,
const
std
::
string
&
tempType
=
"float"
);
/**
* Calculate the spline coefficients for a tabulated function that appears in expressions.
*
* @param values the tabulated values of the function
* @param min the value of the independent variable corresponding to the first element of values
* @param max the value of the independent variable corresponding to the last element of values
* @return the spline coefficients
*/
static
std
::
vector
<
float4
>
computeFunctionCoefficients
(
const
std
::
vector
<
double
>&
values
,
double
min
,
double
max
);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static
std
::
string
doubleToString
(
double
value
);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static
std
::
string
intToString
(
int
value
);
class
FunctionPlaceholder
;
private:
static
void
processExpression
(
std
::
stringstream
&
out
,
const
Lepton
::
ExpressionTreeNode
&
node
,
std
::
vector
<
std
::
pair
<
Lepton
::
ExpressionTreeNode
,
std
::
string
>
>&
temps
,
const
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>
>&
functions
,
const
std
::
string
&
prefix
,
const
std
::
string
&
functionParams
,
const
std
::
vector
<
Lepton
::
ParsedExpression
>&
allExpressions
,
const
std
::
string
&
tempType
);
static
std
::
string
getTempName
(
const
Lepton
::
ExpressionTreeNode
&
node
,
const
std
::
vector
<
std
::
pair
<
Lepton
::
ExpressionTreeNode
,
std
::
string
>
>&
temps
);
static
void
findRelatedTabulatedFunctions
(
const
Lepton
::
ExpressionTreeNode
&
node
,
const
Lepton
::
ExpressionTreeNode
&
searchNode
,
const
Lepton
::
ExpressionTreeNode
*&
valueNode
,
const
Lepton
::
ExpressionTreeNode
*&
derivNode
);
static
void
findRelatedPowers
(
const
Lepton
::
ExpressionTreeNode
&
node
,
const
Lepton
::
ExpressionTreeNode
&
searchNode
,
std
::
map
<
int
,
const
Lepton
::
ExpressionTreeNode
*>&
powers
);
};
/**
* This class serves as a placeholder for custom functions in expressions.
*/
class
CudaExpressionUtilities
::
FunctionPlaceholder
:
public
Lepton
::
CustomFunction
{
public:
int
getNumArguments
()
const
{
return
1
;
}
double
evaluate
(
const
double
*
arguments
)
const
{
return
0.0
;
}
double
evaluateDerivative
(
const
double
*
arguments
,
const
int
*
derivOrder
)
const
{
return
0.0
;
}
CustomFunction
*
clone
()
const
{
return
new
FunctionPlaceholder
();
}
};
}
// namespace OpenMM
#endif
/*OPENMM_CUDAEXPRESSIONUTILITIES_H_*/
platforms/cuda2/src/CudaForceInfo.cpp
0 → 100644
View file @
abb8cb4b
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaForceInfo.h"
using
namespace
OpenMM
;
using
namespace
std
;
bool
CudaForceInfo
::
areParticlesIdentical
(
int
particle1
,
int
particle2
)
{
return
true
;
}
int
CudaForceInfo
::
getNumParticleGroups
()
{
return
0
;
}
void
CudaForceInfo
::
getParticlesInGroup
(
int
index
,
vector
<
int
>&
particles
)
{
return
;
}
bool
CudaForceInfo
::
areGroupsIdentical
(
int
group1
,
int
group2
)
{
return
true
;
}
platforms/cuda2/src/CudaForceInfo.h
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDAFORCEINFO_H_
#define OPENMM_CUDAFORCEINFO_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include <vector>
namespace
OpenMM
{
/**
* This class is used by the Cuda implementation of a Force class to convey information
* about the behavior and requirements of that force.
*/
class
OPENMM_EXPORT
CudaForceInfo
{
public:
CudaForceInfo
(
int
requiredForceBuffers
)
:
requiredForceBuffers
(
requiredForceBuffers
)
{
}
/**
* Get the number of force buffers this force requires.
*/
int
getRequiredForceBuffers
()
{
return
requiredForceBuffers
;
}
/**
* Get whether or not two particles have identical force field parameters.
*/
virtual
bool
areParticlesIdentical
(
int
particle1
,
int
particle2
);
/**
* Get the number of particle groups defined by this force.
*/
virtual
int
getNumParticleGroups
();
/**
* Get the list of particles in a particular group.
*/
virtual
void
getParticlesInGroup
(
int
index
,
std
::
vector
<
int
>&
particles
);
/**
* Get whether two particle groups are identical.
*/
virtual
bool
areGroupsIdentical
(
int
group1
,
int
group2
);
private:
int
requiredForceBuffers
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CUDAFORCEINFO_H_*/
platforms/cuda2/src/CudaKernelFactory.cpp
0 → 100644
View file @
abb8cb4b
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernelFactory.h"
//#include "CudaParallelKernels.h"
#include "CudaPlatform.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
using
namespace
OpenMM
;
KernelImpl
*
CudaKernelFactory
::
createKernelImpl
(
std
::
string
name
,
const
Platform
&
platform
,
ContextImpl
&
context
)
const
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
// if (data.contexts.size() > 1) {
// // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
//
// if (name == CalcForcesAndEnergyKernel::Name())
// return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
// if (name == CalcHarmonicBondForceKernel::Name())
// return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomBondForceKernel::Name())
// return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
// if (name == CalcHarmonicAngleForceKernel::Name())
// return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomAngleForceKernel::Name())
// return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
// if (name == CalcPeriodicTorsionForceKernel::Name())
// return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcRBTorsionForceKernel::Name())
// return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCMAPTorsionForceKernel::Name())
// return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomTorsionForceKernel::Name())
// return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcNonbondedForceKernel::Name())
// return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomNonbondedForceKernel::Name())
// return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomExternalForceKernel::Name())
// return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomHbondForceKernel::Name())
// return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomCompoundBondForceKernel::Name())
// return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
// }
// CudaContext& cl = *data.contexts[0];
// if (name == CalcForcesAndEnergyKernel::Name())
// return new CudaCalcForcesAndEnergyKernel(name, platform, cl);
// if (name == UpdateStateDataKernel::Name())
// return new CudaUpdateStateDataKernel(name, platform, cl);
// if (name == ApplyConstraintsKernel::Name())
// return new CudaApplyConstraintsKernel(name, platform, cl);
// if (name == VirtualSitesKernel::Name())
// return new CudaVirtualSitesKernel(name, platform, cl);
// if (name == CalcHarmonicBondForceKernel::Name())
// return new CudaCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomBondForceKernel::Name())
// return new CudaCalcCustomBondForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcHarmonicAngleForceKernel::Name())
// return new CudaCalcHarmonicAngleForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomAngleForceKernel::Name())
// return new CudaCalcCustomAngleForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcPeriodicTorsionForceKernel::Name())
// return new CudaCalcPeriodicTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcRBTorsionForceKernel::Name())
// return new CudaCalcRBTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCMAPTorsionForceKernel::Name())
// return new CudaCalcCMAPTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomTorsionForceKernel::Name())
// return new CudaCalcCustomTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcNonbondedForceKernel::Name())
// return new CudaCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomNonbondedForceKernel::Name())
// return new CudaCalcCustomNonbondedForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcGBSAOBCForceKernel::Name())
// return new CudaCalcGBSAOBCForceKernel(name, platform, cl);
// if (name == CalcCustomGBForceKernel::Name())
// return new CudaCalcCustomGBForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomExternalForceKernel::Name())
// return new CudaCalcCustomExternalForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomHbondForceKernel::Name())
// return new CudaCalcCustomHbondForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomCompoundBondForceKernel::Name())
// return new CudaCalcCustomCompoundBondForceKernel(name, platform, cl, context.getSystem());
// if (name == IntegrateVerletStepKernel::Name())
// return new CudaIntegrateVerletStepKernel(name, platform, cl);
// if (name == IntegrateLangevinStepKernel::Name())
// return new CudaIntegrateLangevinStepKernel(name, platform, cl);
// if (name == IntegrateBrownianStepKernel::Name())
// return new CudaIntegrateBrownianStepKernel(name, platform, cl);
// if (name == IntegrateVariableVerletStepKernel::Name())
// return new CudaIntegrateVariableVerletStepKernel(name, platform, cl);
// if (name == IntegrateVariableLangevinStepKernel::Name())
// return new CudaIntegrateVariableLangevinStepKernel(name, platform, cl);
// if (name == IntegrateCustomStepKernel::Name())
// return new CudaIntegrateCustomStepKernel(name, platform, cl);
// if (name == ApplyAndersenThermostatKernel::Name())
// return new CudaApplyAndersenThermostatKernel(name, platform, cl);
// if (name == ApplyMonteCarloBarostatKernel::Name())
// return new CudaApplyMonteCarloBarostatKernel(name, platform, cl);
// if (name == CalcKineticEnergyKernel::Name())
// return new CudaCalcKineticEnergyKernel(name, platform, cl);
// if (name == RemoveCMMotionKernel::Name())
// return new CudaRemoveCMMotionKernel(name, platform, cl);
throw
OpenMMException
((
std
::
string
(
"Tried to create kernel with illegal kernel name '"
)
+
name
+
"'"
).
c_str
());
}
platforms/cuda2/src/CudaKernelSources.cpp.in
0 → 100644
View file @
abb8cb4b
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernelSources.h"
using namespace OpenMM;
using namespace std;
platforms/cuda2/src/CudaKernelSources.h.in
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDAKERNELSOURCES_H_
#define OPENMM_CUDAKERNELSOURCES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include <string>
namespace OpenMM {
/**
* This class is a central holding place for the source code of CUDA kernels.
* The CMake build script inserts declarations into it based on the .cu files in the
* kernels subfolder.
*/
class OPENMM_EXPORT CudaKernelSources {
public:
@CUDA_FILE_DECLARATIONS@
};
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELSOURCES_H_*/
platforms/cuda2/src/CudaKernels.h
0 → 100644
View file @
abb8cb4b
#ifndef OPENMM_CUDAKERNELS_H_
#define OPENMM_CUDAKERNELS_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaPlatform.h"
//#include "CudaArray.h"
#include "CudaContext.h"
//#include "CudaFFT3D.h"
//#include "CudaParameterSet.h"
//#include "CudaSort.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
namespace
OpenMM
{
///**
// * This kernel is invoked at the beginning and end of force and energy computations. It gives the
// * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
// * necessary work at the end to determine the final results.
// */
//class CudaCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
//public:
// CudaCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcForcesAndEnergyKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
// * any ForceImpl.
// *
// * @param context the context in which to execute this kernel
// * @param includeForce true if forces should be computed
// * @param includeEnergy true if potential energy should be computed
// * @param groups a set of bit flags for which force groups to include
// */
// void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
// /**
// * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
// * every ForceImpl.
// *
// * @param context the context in which to execute this kernel
// * @param includeForce true if forces should be computed
// * @param includeEnergy true if potential energy should be computed
// * @param groups a set of bit flags for which force groups to include
// * @return the potential energy of the system. This value is added to all values returned by ForceImpls'
// * calcForcesAndEnergy() methods. That is, each force kernel may <i>either</i> return its contribution to the
// * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
// */
// double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
//private:
// CudaContext& cl;
//};
//
///**
// * This kernel provides methods for setting and retrieving various state data: time, positions,
// * velocities, and forces.
// */
//class CudaUpdateStateDataKernel : public UpdateStateDataKernel {
//public:
// CudaUpdateStateDataKernel(std::string name, const Platform& platform, CudaContext& cl) : UpdateStateDataKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Get the current time (in picoseconds).
// *
// * @param context the context in which to execute this kernel
// */
// double getTime(const ContextImpl& context) const;
// /**
// * Set the current time (in picoseconds).
// *
// * @param context the context in which to execute this kernel
// */
// void setTime(ContextImpl& context, double time);
// /**
// * Get the positions of all particles.
// *
// * @param positions on exit, this contains the particle positions
// */
// void getPositions(ContextImpl& context, std::vector<Vec3>& positions);
// /**
// * Set the positions of all particles.
// *
// * @param positions a vector containg the particle positions
// */
// void setPositions(ContextImpl& context, const std::vector<Vec3>& positions);
// /**
// * Get the velocities of all particles.
// *
// * @param velocities on exit, this contains the particle velocities
// */
// void getVelocities(ContextImpl& context, std::vector<Vec3>& velocities);
// /**
// * Set the velocities of all particles.
// *
// * @param velocities a vector containg the particle velocities
// */
// void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
// /**
// * Get the current forces on all particles.
// *
// * @param forces on exit, this contains the forces
// */
// void getForces(ContextImpl& context, std::vector<Vec3>& forces);
// /**
// * Get the current periodic box vectors.
// *
// * @param a on exit, this contains the vector defining the first edge of the periodic box
// * @param b on exit, this contains the vector defining the second edge of the periodic box
// * @param c on exit, this contains the vector defining the third edge of the periodic box
// */
// void getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const;
// /**
// * Set the current periodic box vectors.
// *
// * @param a the vector defining the first edge of the periodic box
// * @param b the vector defining the second edge of the periodic box
// * @param c the vector defining the third edge of the periodic box
// */
// void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
// /**
// * Create a checkpoint recording the current state of the Context.
// *
// * @param stream an output stream the checkpoint data should be written to
// */
// void createCheckpoint(ContextImpl& context, std::ostream& stream);
// /**
// * Load a checkpoint that was written by createCheckpoint().
// *
// * @param stream an input stream the checkpoint data should be read from
// */
// void loadCheckpoint(ContextImpl& context, std::istream& stream);
//private:
// CudaContext& cl;
//};
//
///**
// * This kernel modifies the positions of particles to enforce distance constraints.
// */
//class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
//public:
// CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyConstraintsKernel(name, platform),
// cl(cl), hasInitializedKernel(false) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Update particle positions to enforce constraints.
// *
// * @param context the context in which to execute this kernel
// * @param tol the distance tolerance within which constraints must be satisfied.
// */
// void apply(ContextImpl& context, double tol);
//private:
// CudaContext& cl;
// bool hasInitializedKernel;
// cl::Kernel applyDeltasKernel;
//};
//
///**
// * This kernel recomputes the positions of virtual sites.
// */
//class CudaVirtualSitesKernel : public VirtualSitesKernel {
//public:
// CudaVirtualSitesKernel(std::string name, const Platform& platform, CudaContext& cl) : VirtualSitesKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Compute the virtual site locations.
// *
// * @param context the context in which to execute this kernel
// */
// void computePositions(ContextImpl& context);
//private:
// CudaContext& cl;
//};
//
///**
// * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
//public:
// CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicBondForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcHarmonicBondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the HarmonicBondForce this kernel will be used for
// */
// void initialize(const System& system, const HarmonicBondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numBonds;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float2>* params;
//};
//
///**
// * This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
//public:
// CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomBondForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomBondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomBondForce this kernel will be used for
// */
// void initialize(const System& system, const CustomBondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numBonds;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by HarmonicAngleForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
//public:
// CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicAngleForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcHarmonicAngleForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the HarmonicAngleForce this kernel will be used for
// */
// void initialize(const System& system, const HarmonicAngleForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numAngles;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float2>* params;
//};
//
///**
// * This kernel is invoked by CustomAngleForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
//public:
// CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomAngleForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomAngleForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomAngleForce this kernel will be used for
// */
// void initialize(const System& system, const CustomAngleForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numAngles;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
//public:
// CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcPeriodicTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcPeriodicTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the PeriodicTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const PeriodicTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float4>* params;
//};
//
///**
// * This kernel is invoked by RBTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
//public:
// CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcRBTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcRBTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the RBTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const RBTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float8>* params;
//};
//
///**
// * This kernel is invoked by CMAPTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
//public:
// CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCMAPTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), coefficients(NULL), mapPositions(NULL), torsionMaps(NULL) {
// }
// ~CudaCalcCMAPTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CMAPTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const CMAPTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float4>* coefficients;
// CudaArray<mm_int2>* mapPositions;
// CudaArray<cl_int>* torsionMaps;
//};
//
///**
// * This kernel is invoked by CustomTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
//public:
// CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const CustomTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
// */
//class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
//public:
// CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
// pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
// pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
// }
// ~CudaCalcNonbondedForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the NonbondedForce this kernel will be used for
// */
// void initialize(const System& system, const NonbondedForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @param includeDirect true if direct space interactions should be included
// * @param includeReciprocal true if reciprocal space interactions should be included
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
// /**
// * Copy changed parameters over to a context.
// *
// * @param context the context to copy parameters to
// * @param force the NonbondedForce to copy the parameters from
// */
// void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
//private:
// struct SortTrait {
// typedef mm_int2 DataType;
// typedef cl_int KeyType;
// static const char* clDataType() {return "int2";}
// static const char* clKeyType() {return "int";}
// static const char* clMinKey() {return "INT_MIN";}
// static const char* clMaxKey() {return "INT_MAX";}
// static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
// static const char* clSortKey() {return "value.y";}
// };
// CudaContext& cl;
// bool hasInitializedKernel;
// CudaArray<mm_float2>* sigmaEpsilon;
// CudaArray<mm_float4>* exceptionParams;
// CudaArray<mm_float2>* cosSinSums;
// CudaArray<mm_float2>* pmeGrid;
// CudaArray<mm_float2>* pmeGrid2;
// CudaArray<cl_float>* pmeBsplineModuliX;
// CudaArray<cl_float>* pmeBsplineModuliY;
// CudaArray<cl_float>* pmeBsplineModuliZ;
// CudaArray<mm_float4>* pmeBsplineTheta;
// CudaArray<mm_float4>* pmeBsplineDTheta;
// CudaArray<cl_int>* pmeAtomRange;
// CudaArray<mm_int2>* pmeAtomGridIndex;
// CudaSort<SortTrait>* sort;
// CudaFFT3D* fft;
// cl::Kernel ewaldSumsKernel;
// cl::Kernel ewaldForcesKernel;
// cl::Kernel pmeGridIndexKernel;
// cl::Kernel pmeAtomRangeKernel;
// cl::Kernel pmeZIndexKernel;
// cl::Kernel pmeUpdateBsplinesKernel;
// cl::Kernel pmeSpreadChargeKernel;
// cl::Kernel pmeFinishSpreadChargeKernel;
// cl::Kernel pmeConvolutionKernel;
// cl::Kernel pmeInterpolateForceKernel;
// std::map<std::string, std::string> pmeDefines;
// double ewaldSelfEnergy, dispersionCoefficient, alpha;
// int interpolateForceThreads;
// bool hasCoulomb, hasLJ;
// static const int PmeOrder = 5;
//};
//
///**
// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
//public:
// CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomNonbondedForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomNonbondedForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomNonbondedForce this kernel will be used for
// */
// void initialize(const System& system, const CustomNonbondedForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// bool hasInitializedKernel;
// CudaContext& cl;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// System& system;
//};
//
///**
// * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
// */
//class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
//public:
// CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcGBSAOBCForceKernel(name, platform), cl(cl),
// hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
// longBornForce(NULL), obcChain(NULL) {
// }
// ~CudaCalcGBSAOBCForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the GBSAOBCForce this kernel will be used for
// */
// void initialize(const System& system, const GBSAOBCForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// double prefactor;
// bool hasCreatedKernels;
// int maxTiles;
// CudaContext& cl;
// CudaArray<mm_float2>* params;
// CudaArray<cl_float>* bornSum;
// CudaArray<cl_long>* longBornSum;
// CudaArray<cl_float>* bornRadii;
// CudaArray<cl_float>* bornForce;
// CudaArray<cl_long>* longBornForce;
// CudaArray<cl_float>* obcChain;
// cl::Kernel computeBornSumKernel;
// cl::Kernel reduceBornSumKernel;
// cl::Kernel force1Kernel;
// cl::Kernel reduceBornForceKernel;
//};
//
///**
// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
//public:
// CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomGBForceKernel(name, platform),
// hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
// valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomGBForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomGBForce this kernel will be used for
// */
// void initialize(const System& system, const CustomGBForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// bool hasInitializedKernels, needParameterGradient;
// int maxTiles, numComputedValues;
// CudaContext& cl;
// CudaParameterSet* params;
// CudaParameterSet* computedValues;
// CudaParameterSet* energyDerivs;
// CudaArray<cl_long>* longEnergyDerivs;
// CudaArray<cl_float>* globals;
// CudaArray<cl_float>* valueBuffers;
// CudaArray<cl_long>* longValueBuffers;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
// System& system;
// cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
//};
//
///**
// * This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
//public:
// CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomExternalForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomExternalForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomExternalForce this kernel will be used for
// */
// void initialize(const System& system, const CustomExternalForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numParticles;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
//public:
// CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomHbondForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
// donorBufferIndices(NULL), acceptorBufferIndices(NULL), globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL),
// tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomHbondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomHbondForce this kernel will be used for
// */
// void initialize(const System& system, const CustomHbondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numDonors, numAcceptors;
// bool hasInitializedKernel;
// CudaContext& cl;
// CudaParameterSet* donorParams;
// CudaParameterSet* acceptorParams;
// CudaArray<cl_float>* globals;
// CudaArray<mm_int4>* donors;
// CudaArray<mm_int4>* acceptors;
// CudaArray<mm_int4>* donorBufferIndices;
// CudaArray<mm_int4>* acceptorBufferIndices;
// CudaArray<mm_int4>* donorExclusions;
// CudaArray<mm_int4>* acceptorExclusions;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// System& system;
// cl::Kernel donorKernel, acceptorKernel;
//};
//
///**
// * This kernel is invoked by CustomCompoundBondForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomCompoundBondForceKernel : public CalcCustomCompoundBondForceKernel {
//public:
// CudaCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomCompoundBondForceKernel(name, platform),
// cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomCompoundBondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomCompoundBondForce this kernel will be used for
// */
// void initialize(const System& system, const CustomCompoundBondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numBonds;
// CudaContext& cl;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// System& system;
//};
//
///**
// * This kernel is invoked by VerletIntegrator to take one time step.
// */
//class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
//public:
// CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVerletStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false) {
// }
// ~CudaIntegrateVerletStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the VerletIntegrator this kernel will be used for
// */
// void initialize(const System& system, const VerletIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the VerletIntegrator this kernel is being used for
// */
// void execute(ContextImpl& context, const VerletIntegrator& integrator);
//private:
// CudaContext& cl;
// double prevStepSize;
// bool hasInitializedKernels;
// cl::Kernel kernel1, kernel2;
//};
//
///**
// * This kernel is invoked by LangevinIntegrator to take one time step.
// */
//class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
//public:
// CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateLangevinStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), params(NULL) {
// }
// ~CudaIntegrateLangevinStepKernel();
// /**
// * Initialize the kernel, setting up the particle masses.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the LangevinIntegrator this kernel will be used for
// */
// void initialize(const System& system, const LangevinIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the LangevinIntegrator this kernel is being used for
// */
// void execute(ContextImpl& context, const LangevinIntegrator& integrator);
//private:
// CudaContext& cl;
// double prevTemp, prevFriction, prevStepSize;
// bool hasInitializedKernels;
// CudaArray<cl_float>* params;
// cl::Kernel kernel1, kernel2;
//};
//
///**
// * This kernel is invoked by BrownianIntegrator to take one time step.
// */
//class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
//public:
// CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateBrownianStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), prevTemp(-1), prevFriction(-1), prevStepSize(-1) {
// }
// ~CudaIntegrateBrownianStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the BrownianIntegrator this kernel will be used for
// */
// void initialize(const System& system, const BrownianIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the BrownianIntegrator this kernel is being used for
// */
// void execute(ContextImpl& context, const BrownianIntegrator& integrator);
//private:
// CudaContext& cl;
// double prevTemp, prevFriction, prevStepSize;
// bool hasInitializedKernels;
// cl::Kernel kernel1, kernel2;
//};
//
///**
// * This kernel is invoked by VariableVerletIntegrator to take one time step.
// */
//class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
//public:
// CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableVerletStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false) {
// }
// ~CudaIntegrateVariableVerletStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the VerletIntegrator this kernel will be used for
// */
// void initialize(const System& system, const VariableVerletIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the VerletIntegrator this kernel is being used for
// * @param maxTime the maximum time beyond which the simulation should not be advanced
// * @return the size of the step that was taken
// */
// double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int blockSize;
// cl::Kernel kernel1, kernel2, selectSizeKernel;
//};
//
///**
// * This kernel is invoked by VariableLangevinIntegrator to take one time step.
// */
//class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
//public:
// CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableLangevinStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), params(NULL) {
// }
// ~CudaIntegrateVariableLangevinStepKernel();
// /**
// * Initialize the kernel, setting up the particle masses.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the VariableLangevinIntegrator this kernel will be used for
// */
// void initialize(const System& system, const VariableLangevinIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the VariableLangevinIntegrator this kernel is being used for
// * @param maxTime the maximum time beyond which the simulation should not be advanced
// * @return the size of the step that was taken
// */
// double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int blockSize;
// CudaArray<cl_float>* params;
// cl::Kernel kernel1, kernel2, selectSizeKernel;
// double prevTemp, prevFriction, prevErrorTol;
//};
//
///**
// * This kernel is invoked by CustomIntegrator to take one time step.
// */
//class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
//public:
// CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
// uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
// }
// ~CudaIntegrateCustomStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the CustomIntegrator this kernel will be used for
// */
// void initialize(const System& system, const CustomIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the CustomIntegrator this kernel is being used for
// * @param forcesAreValid if the context has been modified since the last time step, this will be
// * false to show that cached forces are invalid and must be recalculated.
// * On exit, this should specify whether the cached forces are valid at the
// * end of the step.
// */
// void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
// /**
// * Get the values of all global variables.
// *
// * @param context the context in which to execute this kernel
// * @param values on exit, this contains the values
// */
// void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
// /**
// * Set the values of all global variables.
// *
// * @param context the context in which to execute this kernel
// * @param values a vector containing the values
// */
// void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
// /**
// * Get the values of a per-DOF variable.
// *
// * @param context the context in which to execute this kernel
// * @param variable the index of the variable to get
// * @param values on exit, this contains the values
// */
// void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
// /**
// * Set the values of a per-DOF variable.
// *
// * @param context the context in which to execute this kernel
// * @param variable the index of the variable to get
// * @param values a vector containing the values
// */
// void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
//private:
// class ReorderListener;
// std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
// std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
// void recordChangedParameters(ContextImpl& context);
// CudaContext& cl;
// double prevStepSize;
// int numGlobalVariables;
// bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
// mutable bool localValuesAreCurrent;
// CudaArray<cl_float>* globalValues;
// CudaArray<cl_float>* contextParameterValues;
// CudaArray<cl_float>* sumBuffer;
// CudaArray<cl_float>* energy;
// CudaArray<mm_float4>* uniformRandoms;
// CudaArray<mm_int4>* randomSeed;
// CudaParameterSet* perDofValues;
// mutable std::vector<std::vector<cl_float> > localPerDofValues;
// std::vector<std::vector<cl::Kernel> > kernels;
// cl::Kernel sumEnergyKernel, randomKernel;
// std::vector<CustomIntegrator::ComputationType> stepType;
// std::vector<bool> needsForces;
// std::vector<bool> needsEnergy;
// std::vector<bool> invalidatesForces;
// std::vector<bool> merged;
// std::vector<int> forceGroup;
// std::vector<int> requiredGaussian;
// std::vector<int> requiredUniform;
// std::vector<std::string> parameterNames;
//};
//
///**
// * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.
// */
//class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
//public:
// CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyAndersenThermostatKernel(name, platform), cl(cl),
// hasInitializedKernels(false), atomGroups(NULL) {
// }
// ~CudaApplyAndersenThermostatKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param thermostat the AndersenThermostat this kernel will be used for
// */
// void initialize(const System& system, const AndersenThermostat& thermostat);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// */
// void execute(ContextImpl& context);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int randomSeed;
// CudaArray<cl_int>* atomGroups;
// cl::Kernel kernel;
//};
//
///**
// * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
// */
//class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
//public:
// CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
// hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
// }
// ~CudaApplyMonteCarloBarostatKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param barostat the MonteCarloBarostat this kernel will be used for
// */
// void initialize(const System& system, const MonteCarloBarostat& barostat);
// /**
// * Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
// * This is called BEFORE the periodic box size is modified. It should begin by translating each particle
// * or cluster into the first periodic box, so that coordinates will still be correct after the box size
// * is changed.
// *
// * @param context the context in which to execute this kernel
// * @param scale the scale factor by which to multiply particle positions
// */
// void scaleCoordinates(ContextImpl& context, double scale);
// /**
// * Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
// * scaleCoordinates() was last called.
// *
// * @param context the context in which to execute this kernel
// */
// void restoreCoordinates(ContextImpl& context);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int numMolecules;
// CudaArray<mm_float4>* savedPositions;
// CudaArray<cl_int>* moleculeAtoms;
// CudaArray<cl_int>* moleculeStartIndex;
// cl::Kernel kernel;
//};
//
///**
// * This kernel is invoked to calculate the kinetic energy of the system.
// */
//class CudaCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
//public:
// CudaCalcKineticEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcKineticEnergyKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// */
// double execute(ContextImpl& context);
//private:
// CudaContext& cl;
// std::vector<double> masses;
//};
//
///**
// * This kernel is invoked to remove center of mass motion from the system.
// */
//class CudaRemoveCMMotionKernel : public RemoveCMMotionKernel {
//public:
// CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaContext& cl) : RemoveCMMotionKernel(name, platform), cl(cl), cmMomentum(NULL) {
// }
// ~CudaRemoveCMMotionKernel();
// /**
// * Initialize the kernel, setting up the particle masses.
// *
// * @param system the System this kernel will be applied to
// * @param force the CMMotionRemover this kernel will be used for
// */
// void initialize(const System& system, const CMMotionRemover& force);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// */
// void execute(ContextImpl& context);
//private:
// CudaContext& cl;
// int frequency;
// CudaArray<mm_float4>* cmMomentum;
// cl::Kernel kernel1, kernel2;
//};
}
// namespace OpenMM
#endif
/*OPENMM_CUDAKERNELS_H_*/
platforms/cuda2/src/CudaPlatform.cpp
0 → 100644
View file @
abb8cb4b
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaContext.h"
#include "CudaExpressionUtilities.h"
#include "CudaPlatform.h"
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/Context.h"
#include "openmm/System.h"
#include <algorithm>
#include <cctype>
#include <sstream>
using
namespace
OpenMM
;
using
namespace
std
;
extern
"C"
OPENMM_EXPORT
void
registerPlatforms
()
{
Platform
::
registerPlatform
(
new
CudaPlatform
());
}
CudaPlatform
::
CudaPlatform
()
{
CudaKernelFactory
*
factory
=
new
CudaKernelFactory
();
registerKernelFactory
(
CalcForcesAndEnergyKernel
::
Name
(),
factory
);
registerKernelFactory
(
UpdateStateDataKernel
::
Name
(),
factory
);
registerKernelFactory
(
ApplyConstraintsKernel
::
Name
(),
factory
);
registerKernelFactory
(
VirtualSitesKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcHarmonicBondForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomBondForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcHarmonicAngleForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomAngleForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcPeriodicTorsionForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcRBTorsionForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCMAPTorsionForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomTorsionForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcNonbondedForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomNonbondedForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcGBSAOBCForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomGBForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomExternalForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomHbondForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcCustomCompoundBondForceKernel
::
Name
(),
factory
);
registerKernelFactory
(
IntegrateVerletStepKernel
::
Name
(),
factory
);
registerKernelFactory
(
IntegrateLangevinStepKernel
::
Name
(),
factory
);
registerKernelFactory
(
IntegrateBrownianStepKernel
::
Name
(),
factory
);
registerKernelFactory
(
IntegrateVariableVerletStepKernel
::
Name
(),
factory
);
registerKernelFactory
(
IntegrateVariableLangevinStepKernel
::
Name
(),
factory
);
registerKernelFactory
(
IntegrateCustomStepKernel
::
Name
(),
factory
);
registerKernelFactory
(
ApplyAndersenThermostatKernel
::
Name
(),
factory
);
registerKernelFactory
(
ApplyMonteCarloBarostatKernel
::
Name
(),
factory
);
registerKernelFactory
(
CalcKineticEnergyKernel
::
Name
(),
factory
);
registerKernelFactory
(
RemoveCMMotionKernel
::
Name
(),
factory
);
platformProperties
.
push_back
(
CudaDeviceIndex
());
platformProperties
.
push_back
(
CudaUseBlockingSync
());
platformProperties
.
push_back
(
CudaPrecision
());
platformProperties
.
push_back
(
CudaCompiler
());
platformProperties
.
push_back
(
CudaTempDirectory
());
setPropertyDefaultValue
(
CudaDeviceIndex
(),
""
);
setPropertyDefaultValue
(
CudaUseBlockingSync
(),
"true"
);
setPropertyDefaultValue
(
CudaPrecision
(),
"single"
);
#ifdef _MSC_VER
setPropertyDefaultValue
(
CudaCompiler
(),
"nvcc"
);
setPropertyDefaultValue
(
CudaTempDirectory
(),
string
(
getenv
(
"TEMP"
)));
#else
setPropertyDefaultValue
(
CudaCompiler
(),
"/usr/local/cuda/bin/nvcc"
);
setPropertyDefaultValue
(
CudaTempDirectory
(),
string
(
getenv
(
"TMPDIR"
)));
#endif
}
bool
CudaPlatform
::
supportsDoublePrecision
()
const
{
return
false
;
}
const
string
&
CudaPlatform
::
getPropertyValue
(
const
Context
&
context
,
const
string
&
property
)
const
{
const
ContextImpl
&
impl
=
getContextImpl
(
context
);
const
PlatformData
*
data
=
reinterpret_cast
<
const
PlatformData
*>
(
impl
.
getPlatformData
());
map
<
string
,
string
>::
const_iterator
value
=
data
->
propertyValues
.
find
(
property
);
if
(
value
!=
data
->
propertyValues
.
end
())
return
value
->
second
;
return
Platform
::
getPropertyValue
(
context
,
property
);
}
void
CudaPlatform
::
setPropertyValue
(
Context
&
context
,
const
string
&
property
,
const
string
&
value
)
const
{
}
void
CudaPlatform
::
contextCreated
(
ContextImpl
&
context
,
const
map
<
string
,
string
>&
properties
)
const
{
const
string
&
devicePropValue
=
(
properties
.
find
(
CudaDeviceIndex
())
==
properties
.
end
()
?
getPropertyDefaultValue
(
CudaDeviceIndex
())
:
properties
.
find
(
CudaDeviceIndex
())
->
second
);
string
blockingPropValue
=
(
properties
.
find
(
CudaUseBlockingSync
())
==
properties
.
end
()
?
getPropertyDefaultValue
(
CudaUseBlockingSync
())
:
properties
.
find
(
CudaUseBlockingSync
())
->
second
);
string
precisionPropValue
=
(
properties
.
find
(
CudaPrecision
())
==
properties
.
end
()
?
getPropertyDefaultValue
(
CudaPrecision
())
:
properties
.
find
(
CudaPrecision
())
->
second
);
const
string
&
compilerPropValue
=
(
properties
.
find
(
CudaCompiler
())
==
properties
.
end
()
?
getPropertyDefaultValue
(
CudaCompiler
())
:
properties
.
find
(
CudaCompiler
())
->
second
);
const
string
&
tempPropValue
=
(
properties
.
find
(
CudaTempDirectory
())
==
properties
.
end
()
?
getPropertyDefaultValue
(
CudaTempDirectory
())
:
properties
.
find
(
CudaTempDirectory
())
->
second
);
transform
(
blockingPropValue
.
begin
(),
blockingPropValue
.
end
(),
blockingPropValue
.
begin
(),
::
tolower
);
transform
(
precisionPropValue
.
begin
(),
precisionPropValue
.
end
(),
precisionPropValue
.
begin
(),
::
tolower
);
context
.
setPlatformData
(
new
PlatformData
(
context
.
getSystem
(),
devicePropValue
,
blockingPropValue
,
precisionPropValue
,
compilerPropValue
,
tempPropValue
));
}
void
CudaPlatform
::
contextDestroyed
(
ContextImpl
&
context
)
const
{
PlatformData
*
data
=
reinterpret_cast
<
PlatformData
*>
(
context
.
getPlatformData
());
delete
data
;
}
CudaPlatform
::
PlatformData
::
PlatformData
(
const
System
&
system
,
const
string
&
deviceIndexProperty
,
const
string
&
blockingProperty
,
const
string
&
precisionProperty
,
const
string
&
compilerProperty
,
const
string
&
tempProperty
)
:
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
{
bool
blocking
=
(
blockingProperty
==
"true"
);
vector
<
string
>
devices
;
size_t
searchPos
=
0
,
nextPos
;
while
((
nextPos
=
deviceIndexProperty
.
find_first_of
(
", "
,
searchPos
))
!=
string
::
npos
)
{
devices
.
push_back
(
deviceIndexProperty
.
substr
(
searchPos
,
nextPos
-
searchPos
));
searchPos
=
nextPos
+
1
;
}
devices
.
push_back
(
deviceIndexProperty
.
substr
(
searchPos
));
for
(
int
i
=
0
;
i
<
(
int
)
devices
.
size
();
i
++
)
{
if
(
devices
[
i
].
length
()
>
0
)
{
unsigned
int
deviceIndex
;
stringstream
(
devices
[
i
])
>>
deviceIndex
;
contexts
.
push_back
(
new
CudaContext
(
system
,
deviceIndex
,
blocking
,
precisionProperty
,
compilerProperty
,
tempProperty
,
*
this
));
}
}
if
(
contexts
.
size
()
==
0
)
contexts
.
push_back
(
new
CudaContext
(
system
,
-
1
,
blocking
,
precisionProperty
,
compilerProperty
,
tempProperty
,
*
this
));
stringstream
device
;
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
{
if
(
i
>
0
)
device
<<
','
;
device
<<
contexts
[
i
]
->
getDeviceIndex
();
}
propertyValues
[
CudaPlatform
::
CudaDeviceIndex
()]
=
device
.
str
();
propertyValues
[
CudaPlatform
::
CudaPrecision
()]
=
precisionProperty
;
propertyValues
[
CudaPlatform
::
CudaCompiler
()]
=
compilerProperty
;
propertyValues
[
CudaPlatform
::
CudaTempDirectory
()]
=
tempProperty
;
contextEnergy
.
resize
(
contexts
.
size
());
}
CudaPlatform
::
PlatformData
::~
PlatformData
()
{
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
delete
contexts
[
i
];
}
void
CudaPlatform
::
PlatformData
::
initializeContexts
(
const
System
&
system
)
{
// for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->initialize();
}
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
// for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->getWorkThread().flush();
}
platforms/cuda2/src/kernels/utilities.cu
0 → 100644
View file @
abb8cb4b
/**
* This is called by the various functions below to clear a buffer.
*/
__device__
void
clearSingleBuffer
(
int
*
__restrict__
buffer
,
int
size
)
{
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int4
*
buffer4
=
(
int4
*
)
buffer
;
int
sizeDiv4
=
size
/
4
;
while
(
index
<
sizeDiv4
)
{
buffer4
[
index
]
=
make_int4
(
0
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
}
if
(
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
==
0
)
for
(
int
i
=
sizeDiv4
*
4
;
i
<
size
;
i
++
)
buffer
[
i
]
=
0
;
}
/**
* Fill a buffer with 0.
*/
__global__
void
clearBuffer
(
int
*
__restrict__
buffer
,
int
size
)
{
clearSingleBuffer
(
buffer
,
size
);
}
/**
* Fill two buffers with 0.
*/
__global__
void
clearTwoBuffers
(
int
*
__restrict__
buffer1
,
int
size1
,
int
*
__restrict__
buffer2
,
int
size2
)
{
clearSingleBuffer
(
buffer1
,
size1
);
clearSingleBuffer
(
buffer2
,
size2
);
}
/**
* Fill three buffers with 0.
*/
__global__
void
clearThreeBuffers
(
int
*
__restrict__
buffer1
,
int
size1
,
int
*
__restrict__
buffer2
,
int
size2
,
int
*
__restrict__
buffer3
,
int
size3
)
{
clearSingleBuffer
(
buffer1
,
size1
);
clearSingleBuffer
(
buffer2
,
size2
);
clearSingleBuffer
(
buffer3
,
size3
);
}
/**
* Fill four buffers with 0.
*/
__global__
void
clearFourBuffers
(
int
*
__restrict__
buffer1
,
int
size1
,
int
*
__restrict__
buffer2
,
int
size2
,
int
*
__restrict__
buffer3
,
int
size3
,
int
*
__restrict__
buffer4
,
int
size4
)
{
clearSingleBuffer
(
buffer1
,
size1
);
clearSingleBuffer
(
buffer2
,
size2
);
clearSingleBuffer
(
buffer3
,
size3
);
clearSingleBuffer
(
buffer4
,
size4
);
}
/**
* Fill five buffers with 0.
*/
__global__
void
clearFiveBuffers
(
int
*
__restrict__
buffer1
,
int
size1
,
int
*
__restrict__
buffer2
,
int
size2
,
int
*
__restrict__
buffer3
,
int
size3
,
int
*
__restrict__
buffer4
,
int
size4
,
int
*
__restrict__
buffer5
,
int
size5
)
{
clearSingleBuffer
(
buffer1
,
size1
);
clearSingleBuffer
(
buffer2
,
size2
);
clearSingleBuffer
(
buffer3
,
size3
);
clearSingleBuffer
(
buffer4
,
size4
);
clearSingleBuffer
(
buffer5
,
size5
);
}
/**
* Fill six buffers with 0.
*/
__global__
void
clearSixBuffers
(
int
*
__restrict__
buffer1
,
int
size1
,
int
*
__restrict__
buffer2
,
int
size2
,
int
*
__restrict__
buffer3
,
int
size3
,
int
*
__restrict__
buffer4
,
int
size4
,
int
*
__restrict__
buffer5
,
int
size5
,
int
*
__restrict__
buffer6
,
int
size6
)
{
clearSingleBuffer
(
buffer1
,
size1
);
clearSingleBuffer
(
buffer2
,
size2
);
clearSingleBuffer
(
buffer3
,
size3
);
clearSingleBuffer
(
buffer4
,
size4
);
clearSingleBuffer
(
buffer5
,
size5
);
clearSingleBuffer
(
buffer6
,
size6
);
}
/**
* Sum a collection of buffers into the first one.
*/
__global__
void
reduceFloat4Buffer
(
float4
*
__restrict__
buffer
,
int
bufferSize
,
int
numBuffers
)
{
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
totalSize
=
bufferSize
*
numBuffers
;
while
(
index
<
bufferSize
)
{
float4
sum
=
buffer
[
index
];
for
(
int
i
=
index
+
bufferSize
;
i
<
totalSize
;
i
+=
bufferSize
)
sum
+=
buffer
[
i
];
buffer
[
index
]
=
sum
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
/**
* Sum the various buffers containing forces.
*/
__global__
void
reduceForces
(
const
long
*
__restrict__
longBuffer
,
float4
*
__restrict__
buffer
,
int
bufferSize
,
int
numBuffers
)
{
int
totalSize
=
bufferSize
*
numBuffers
;
float
scale
=
1.0
f
/
(
float
)
0xFFFFFFFF
;
for
(
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
bufferSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float4
sum
=
make_float4
(
scale
*
longBuffer
[
index
],
scale
*
longBuffer
[
index
+
bufferSize
],
scale
*
longBuffer
[
index
+
2
*
bufferSize
],
0.0
f
);
for
(
int
i
=
index
;
i
<
totalSize
;
i
+=
bufferSize
)
sum
+=
buffer
[
i
];
buffer
[
index
]
=
sum
;
}
}
platforms/cuda2/src/kernels/vectorOps.cu
0 → 100644
View file @
abb8cb4b
/**
* This file defines vector operations to simplify code elsewhere.
*/
// Versions of make_x() that take a single value and set all components to that.
inline
__device__
int2
make_int2
(
int
a
)
{
return
make_int2
(
a
,
a
);
}
inline
__device__
int3
make_int3
(
int
a
)
{
return
make_int3
(
a
,
a
,
a
);
}
inline
__device__
int4
make_int4
(
int
a
)
{
return
make_int4
(
a
,
a
,
a
,
a
);
}
inline
__device__
float2
make_float2
(
float
a
)
{
return
make_float2
(
a
,
a
);
}
inline
__device__
float3
make_float3
(
float
a
)
{
return
make_float3
(
a
,
a
,
a
);
}
inline
__device__
float4
make_float4
(
float
a
)
{
return
make_float4
(
a
,
a
,
a
,
a
);
}
inline
__device__
double2
make_double2
(
double
a
)
{
return
make_double2
(
a
,
a
);
}
inline
__device__
double3
make_double3
(
double
a
)
{
return
make_double3
(
a
,
a
,
a
);
}
inline
__device__
double4
make_double4
(
double
a
)
{
return
make_double4
(
a
,
a
,
a
,
a
);
}
// Negate a vector.
inline
__device__
int2
operator
*
(
int2
a
)
{
return
make_int2
(
-
a
.
x
,
-
a
.
y
);
}
inline
__device__
int3
operator
-
(
int3
a
)
{
return
make_int3
(
-
a
.
x
,
-
a
.
y
,
-
a
.
z
);
}
inline
__device__
int4
operator
-
(
int4
a
)
{
return
make_int4
(
-
a
.
x
,
-
a
.
y
,
-
a
.
z
,
-
a
.
w
);
}
inline
__device__
float2
operator
-
(
float2
a
)
{
return
make_float2
(
-
a
.
x
,
-
a
.
y
);
}
inline
__device__
float3
operator
-
(
float3
a
)
{
return
make_float3
(
-
a
.
x
,
-
a
.
y
,
-
a
.
z
);
}
inline
__device__
float4
operator
-
(
float4
a
)
{
return
make_float4
(
-
a
.
x
,
-
a
.
y
,
-
a
.
z
,
-
a
.
w
);
}
inline
__device__
double2
operator
-
(
double2
a
)
{
return
make_double2
(
-
a
.
x
,
-
a
.
y
);
}
inline
__device__
double3
operator
-
(
double3
a
)
{
return
make_double3
(
-
a
.
x
,
-
a
.
y
,
-
a
.
z
);
}
inline
__device__
double4
operator
-
(
double4
a
)
{
return
make_double4
(
-
a
.
x
,
-
a
.
y
,
-
a
.
z
,
-
a
.
w
);
}
// Add two vectors.
inline
__device__
int2
operator
+
(
int2
a
,
int2
b
)
{
return
make_int2
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
);
}
inline
__device__
int3
operator
+
(
int3
a
,
int3
b
)
{
return
make_int3
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
,
a
.
z
+
b
.
z
);
}
inline
__device__
int4
operator
+
(
int4
a
,
int4
b
)
{
return
make_int4
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
,
a
.
z
+
b
.
z
,
a
.
w
+
b
.
w
);
}
inline
__device__
float2
operator
+
(
float2
a
,
float2
b
)
{
return
make_float2
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
);
}
inline
__device__
float3
operator
+
(
float3
a
,
float3
b
)
{
return
make_float3
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
,
a
.
z
+
b
.
z
);
}
inline
__device__
float4
operator
+
(
float4
a
,
float4
b
)
{
return
make_float4
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
,
a
.
z
+
b
.
z
,
a
.
w
+
b
.
w
);
}
inline
__device__
double2
operator
+
(
double2
a
,
double2
b
)
{
return
make_double2
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
);
}
inline
__device__
double3
operator
+
(
double3
a
,
double3
b
)
{
return
make_double3
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
,
a
.
z
+
b
.
z
);
}
inline
__device__
double4
operator
+
(
double4
a
,
double4
b
)
{
return
make_double4
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
,
a
.
z
+
b
.
z
,
a
.
w
+
b
.
w
);
}
// Subtract two vectors.
inline
__device__
int2
operator
-
(
int2
a
,
int2
b
)
{
return
make_int2
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
);
}
inline
__device__
int3
operator
-
(
int3
a
,
int3
b
)
{
return
make_int3
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
,
a
.
z
-
b
.
z
);
}
inline
__device__
int4
operator
-
(
int4
a
,
int4
b
)
{
return
make_int4
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
,
a
.
z
-
b
.
z
,
a
.
w
-
b
.
w
);
}
inline
__device__
float2
operator
-
(
float2
a
,
float2
b
)
{
return
make_float2
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
);
}
inline
__device__
float3
operator
-
(
float3
a
,
float3
b
)
{
return
make_float3
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
,
a
.
z
-
b
.
z
);
}
inline
__device__
float4
operator
-
(
float4
a
,
float4
b
)
{
return
make_float4
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
,
a
.
z
-
b
.
z
,
a
.
w
-
b
.
w
);
}
inline
__device__
double2
operator
-
(
double2
a
,
double2
b
)
{
return
make_double2
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
);
}
inline
__device__
double3
operator
-
(
double3
a
,
double3
b
)
{
return
make_double3
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
,
a
.
z
-
b
.
z
);
}
inline
__device__
double4
operator
-
(
double4
a
,
double4
b
)
{
return
make_double4
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
,
a
.
z
-
b
.
z
,
a
.
w
-
b
.
w
);
}
// Multiply two vectors.
inline
__device__
int2
operator
*
(
int2
a
,
int2
b
)
{
return
make_int2
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
);
}
inline
__device__
int3
operator
*
(
int3
a
,
int3
b
)
{
return
make_int3
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
,
a
.
z
*
b
.
z
);
}
inline
__device__
int4
operator
*
(
int4
a
,
int4
b
)
{
return
make_int4
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
,
a
.
z
*
b
.
z
,
a
.
w
*
b
.
w
);
}
inline
__device__
float2
operator
*
(
float2
a
,
float2
b
)
{
return
make_float2
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
);
}
inline
__device__
float3
operator
*
(
float3
a
,
float3
b
)
{
return
make_float3
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
,
a
.
z
*
b
.
z
);
}
inline
__device__
float4
operator
*
(
float4
a
,
float4
b
)
{
return
make_float4
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
,
a
.
z
*
b
.
z
,
a
.
w
*
b
.
w
);
}
inline
__device__
double2
operator
*
(
double2
a
,
double2
b
)
{
return
make_double2
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
);
}
inline
__device__
double3
operator
*
(
double3
a
,
double3
b
)
{
return
make_double3
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
,
a
.
z
*
b
.
z
);
}
inline
__device__
double4
operator
*
(
double4
a
,
double4
b
)
{
return
make_double4
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
,
a
.
z
*
b
.
z
,
a
.
w
*
b
.
w
);
}
// Divide two vectors.
inline
__device__
int2
operator
/
(
int2
a
,
int2
b
)
{
return
make_int2
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
);
}
inline
__device__
int3
operator
/
(
int3
a
,
int3
b
)
{
return
make_int3
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
,
a
.
z
/
b
.
z
);
}
inline
__device__
int4
operator
/
(
int4
a
,
int4
b
)
{
return
make_int4
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
,
a
.
z
/
b
.
z
,
a
.
w
/
b
.
w
);
}
inline
__device__
float2
operator
/
(
float2
a
,
float2
b
)
{
return
make_float2
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
);
}
inline
__device__
float3
operator
/
(
float3
a
,
float3
b
)
{
return
make_float3
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
,
a
.
z
/
b
.
z
);
}
inline
__device__
float4
operator
/
(
float4
a
,
float4
b
)
{
return
make_float4
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
,
a
.
z
/
b
.
z
,
a
.
w
/
b
.
w
);
}
inline
__device__
double2
operator
/
(
double2
a
,
double2
b
)
{
return
make_double2
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
);
}
inline
__device__
double3
operator
/
(
double3
a
,
double3
b
)
{
return
make_double3
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
,
a
.
z
/
b
.
z
);
}
inline
__device__
double4
operator
/
(
double4
a
,
double4
b
)
{
return
make_double4
(
a
.
x
/
b
.
x
,
a
.
y
/
b
.
y
,
a
.
z
/
b
.
z
,
a
.
w
/
b
.
w
);
}
// += operator
inline
__device__
void
operator
+=
(
int2
&
a
,
int2
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
}
inline
__device__
void
operator
+=
(
int3
&
a
,
int3
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
a
.
z
+=
b
.
z
;
}
inline
__device__
void
operator
+=
(
int4
&
a
,
int4
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
a
.
z
+=
b
.
z
;
a
.
w
+=
b
.
w
;
}
inline
__device__
void
operator
+=
(
float2
&
a
,
float2
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
}
inline
__device__
void
operator
+=
(
float3
&
a
,
float3
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
a
.
z
+=
b
.
z
;
}
inline
__device__
void
operator
+=
(
float4
&
a
,
float4
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
a
.
z
+=
b
.
z
;
a
.
w
+=
b
.
w
;
}
inline
__device__
void
operator
+=
(
double2
&
a
,
double2
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
}
inline
__device__
void
operator
+=
(
double3
&
a
,
double3
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
a
.
z
+=
b
.
z
;
}
inline
__device__
void
operator
+=
(
double4
&
a
,
double4
b
)
{
a
.
x
+=
b
.
x
;
a
.
y
+=
b
.
y
;
a
.
z
+=
b
.
z
;
a
.
w
+=
b
.
w
;
}
// -= operator
inline
__device__
void
operator
-=
(
int2
&
a
,
int2
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
}
inline
__device__
void
operator
-=
(
int3
&
a
,
int3
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
a
.
z
-=
b
.
z
;
}
inline
__device__
void
operator
-=
(
int4
&
a
,
int4
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
a
.
z
-=
b
.
z
;
a
.
w
-=
b
.
w
;
}
inline
__device__
void
operator
-=
(
float2
&
a
,
float2
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
}
inline
__device__
void
operator
-=
(
float3
&
a
,
float3
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
a
.
z
-=
b
.
z
;
}
inline
__device__
void
operator
-=
(
float4
&
a
,
float4
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
a
.
z
-=
b
.
z
;
a
.
w
-=
b
.
w
;
}
inline
__device__
void
operator
-=
(
double2
&
a
,
double2
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
}
inline
__device__
void
operator
-=
(
double3
&
a
,
double3
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
a
.
z
-=
b
.
z
;
}
inline
__device__
void
operator
-=
(
double4
&
a
,
double4
b
)
{
a
.
x
-=
b
.
x
;
a
.
y
-=
b
.
y
;
a
.
z
-=
b
.
z
;
a
.
w
-=
b
.
w
;
}
// *= operator
inline
__device__
void
operator
*=
(
int2
&
a
,
int2
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
}
inline
__device__
void
operator
*=
(
int3
&
a
,
int3
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
a
.
z
*=
b
.
z
;
}
inline
__device__
void
operator
*=
(
int4
&
a
,
int4
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
a
.
z
*=
b
.
z
;
a
.
w
*=
b
.
w
;
}
inline
__device__
void
operator
*=
(
float2
&
a
,
float2
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
}
inline
__device__
void
operator
*=
(
float3
&
a
,
float3
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
a
.
z
*=
b
.
z
;
}
inline
__device__
void
operator
*=
(
float4
&
a
,
float4
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
a
.
z
*=
b
.
z
;
a
.
w
*=
b
.
w
;
}
inline
__device__
void
operator
*=
(
double2
&
a
,
double2
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
}
inline
__device__
void
operator
*=
(
double3
&
a
,
double3
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
a
.
z
*=
b
.
z
;
}
inline
__device__
void
operator
*=
(
double4
&
a
,
double4
b
)
{
a
.
x
*=
b
.
x
;
a
.
y
*=
b
.
y
;
a
.
z
*=
b
.
z
;
a
.
w
*=
b
.
w
;
}
// /= operator
inline
__device__
void
operator
/=
(
int2
&
a
,
int2
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
}
inline
__device__
void
operator
/=
(
int3
&
a
,
int3
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
a
.
z
/=
b
.
z
;
}
inline
__device__
void
operator
/=
(
int4
&
a
,
int4
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
a
.
z
/=
b
.
z
;
a
.
w
/=
b
.
w
;
}
inline
__device__
void
operator
/=
(
float2
&
a
,
float2
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
}
inline
__device__
void
operator
/=
(
float3
&
a
,
float3
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
a
.
z
/=
b
.
z
;
}
inline
__device__
void
operator
/=
(
float4
&
a
,
float4
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
a
.
z
/=
b
.
z
;
a
.
w
/=
b
.
w
;
}
inline
__device__
void
operator
/=
(
double2
&
a
,
double2
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
}
inline
__device__
void
operator
/=
(
double3
&
a
,
double3
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
a
.
z
/=
b
.
z
;
}
inline
__device__
void
operator
/=
(
double4
&
a
,
double4
b
)
{
a
.
x
/=
b
.
x
;
a
.
y
/=
b
.
y
;
a
.
z
/=
b
.
z
;
a
.
w
/=
b
.
w
;
}
// Multiply a vector by a constant.
inline
__device__
int2
operator
*
(
int2
a
,
int
b
)
{
return
make_int2
(
a
.
x
*
b
,
a
.
y
*
b
);
}
inline
__device__
int3
operator
*
(
int3
a
,
int
b
)
{
return
make_int3
(
a
.
x
*
b
,
a
.
y
*
b
,
a
.
z
*
b
);
}
inline
__device__
int4
operator
*
(
int4
a
,
int
b
)
{
return
make_int4
(
a
.
x
*
b
,
a
.
y
*
b
,
a
.
z
*
b
,
a
.
w
*
b
);
}
inline
__device__
int2
operator
*
(
int
a
,
int2
b
)
{
return
make_int2
(
a
*
b
.
x
,
a
*
b
.
y
);
}
inline
__device__
int3
operator
*
(
int
a
,
int3
b
)
{
return
make_int3
(
a
*
b
.
x
,
a
*
b
.
y
,
a
*
b
.
z
);
}
inline
__device__
int4
operator
*
(
int
a
,
int4
b
)
{
return
make_int4
(
a
*
b
.
x
,
a
*
b
.
y
,
a
*
b
.
z
,
a
*
b
.
w
);
}
inline
__device__
float2
operator
*
(
float2
a
,
float
b
)
{
return
make_float2
(
a
.
x
*
b
,
a
.
y
*
b
);
}
inline
__device__
float3
operator
*
(
float3
a
,
float
b
)
{
return
make_float3
(
a
.
x
*
b
,
a
.
y
*
b
,
a
.
z
*
b
);
}
inline
__device__
float4
operator
*
(
float4
a
,
float
b
)
{
return
make_float4
(
a
.
x
*
b
,
a
.
y
*
b
,
a
.
z
*
b
,
a
.
w
*
b
);
}
inline
__device__
float2
operator
*
(
float
a
,
float2
b
)
{
return
make_float2
(
a
*
b
.
x
,
a
*
b
.
y
);
}
inline
__device__
float3
operator
*
(
float
a
,
float3
b
)
{
return
make_float3
(
a
*
b
.
x
,
a
*
b
.
y
,
a
*
b
.
z
);
}
inline
__device__
float4
operator
*
(
float
a
,
float4
b
)
{
return
make_float4
(
a
*
b
.
x
,
a
*
b
.
y
,
a
*
b
.
z
,
a
*
b
.
w
);
}
inline
__device__
double2
operator
*
(
double2
a
,
double
b
)
{
return
make_double2
(
a
.
x
*
b
,
a
.
y
*
b
);
}
inline
__device__
double3
operator
*
(
double3
a
,
double
b
)
{
return
make_double3
(
a
.
x
*
b
,
a
.
y
*
b
,
a
.
z
*
b
);
}
inline
__device__
double4
operator
*
(
double4
a
,
double
b
)
{
return
make_double4
(
a
.
x
*
b
,
a
.
y
*
b
,
a
.
z
*
b
,
a
.
w
*
b
);
}
inline
__device__
double2
operator
*
(
double
a
,
double2
b
)
{
return
make_double2
(
a
*
b
.
x
,
a
*
b
.
y
);
}
inline
__device__
double3
operator
*
(
double
a
,
double3
b
)
{
return
make_double3
(
a
*
b
.
x
,
a
*
b
.
y
,
a
*
b
.
z
);
}
inline
__device__
double4
operator
*
(
double
a
,
double4
b
)
{
return
make_double4
(
a
*
b
.
x
,
a
*
b
.
y
,
a
*
b
.
z
,
a
*
b
.
w
);
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment