Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
1010df33
Commit
1010df33
authored
Feb 19, 2009
by
Peter Eastman
Browse files
Checking in Cuda implementation of explicit solvent
parent
df4b64cb
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1260 additions
and
1428 deletions
+1260
-1428
platforms/cuda/CMakeLists.txt
platforms/cuda/CMakeLists.txt
+97
-87
platforms/cuda/cudpp/mac/libcudpp.a
platforms/cuda/cudpp/mac/libcudpp.a
+0
-0
platforms/cuda/cudpp/mac/libcutil.a
platforms/cuda/cudpp/mac/libcutil.a
+0
-0
platforms/cuda/cudpp/win/cudpp.lib
platforms/cuda/cudpp/win/cudpp.lib
+0
-0
platforms/cuda/cudpp/win/cudpp32.dll
platforms/cuda/cudpp/win/cudpp32.dll
+0
-0
platforms/cuda/cudpp/win/cutil.lib
platforms/cuda/cudpp/win/cutil.lib
+0
-0
platforms/cuda/src/CudaStreamFactory.cpp
platforms/cuda/src/CudaStreamFactory.cpp
+9
-11
platforms/cuda/src/CudaStreamImpl.h
platforms/cuda/src/CudaStreamImpl.h
+14
-12
platforms/cuda/src/kernels/cudaKernels.h
platforms/cuda/src/kernels/cudaKernels.h
+4
-8
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+27
-2
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+567
-120
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+34
-9
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+93
-308
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+110
-374
platforms/cuda/src/kernels/kCalculateLocalForces.cu
platforms/cuda/src/kernels/kCalculateLocalForces.cu
+147
-41
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+76
-171
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+59
-279
platforms/cuda/src/kernels/kForces.cu
platforms/cuda/src/kernels/kForces.cu
+2
-2
platforms/cuda/src/kernels/kVerletUpdate.cu
platforms/cuda/src/kernels/kVerletUpdate.cu
+0
-3
platforms/reference/src/ReferenceKernels.cpp
platforms/reference/src/ReferenceKernels.cpp
+21
-1
No files found.
platforms/cuda/CMakeLists.txt
View file @
1010df33
#---------------------------------------------------
# OpenMM CUDA Platform
#
# Creates OpenMM library, base name=OpenMMCuda.
# Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d).
#
# Windows:
# OpenMMCuda[_d].dll
# OpenMMCuda[_d].lib
# OpenMMCuda_static[_d].lib
# Unix:
# libOpenMMCuda[_d].so
# libOpenMMCuda_static[_d].a
#----------------------------------------------------
SUBDIRS
(
tests
)
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET
(
OPENMM_SOURCE_SUBDIRS .
)
# Collect up information about the version of the OpenMM library we're building
# and make it available to the code so it can be built into the binaries.
SET
(
OPENMMCUDA_LIBRARY_NAME OpenMMCuda
)
SET
(
SHARED_TARGET
${
OPENMMCUDA_LIBRARY_NAME
}
)
SET
(
STATIC_TARGET
${
OPENMMCUDA_LIBRARY_NAME
}
_static
)
# Ensure that debug libraries have "_d" appended to their names.
# CMake gets this right on Windows automatically with this definition.
IF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
SET
(
CMAKE_DEBUG_POSTFIX
"_d"
CACHE INTERNAL
""
FORCE
)
ENDIF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
# But on Unix or Cygwin we have to add the suffix manually
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
SHARED_TARGET
${
SHARED_TARGET
}
_d
)
SET
(
STATIC_TARGET
${
STATIC_TARGET
}
_d
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
# These are all the places to search for header files which are
# to be part of the API.
SET
(
API_INCLUDE_DIRS
)
# start empty
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
# append
SET
(
API_INCLUDE_DIRS
${
API_INCLUDE_DIRS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include/internal
)
ENDFOREACH
(
subdir
)
# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
# and absolute pathnames.
SET
(
API_REL_INCLUDE_FILES
)
# start these out empty
SET
(
API_ABS_INCLUDE_FILES
)
FOREACH
(
dir
${
API_INCLUDE_DIRS
}
)
FILE
(
GLOB fullpaths
${
dir
}
/*.h
)
# returns full pathnames
SET
(
API_ABS_INCLUDE_FILES
${
API_ABS_INCLUDE_FILES
}
${
fullpaths
}
)
FOREACH
(
pathname
${
fullpaths
}
)
GET_FILENAME_COMPONENT
(
filename
${
pathname
}
NAME
)
SET
(
API_REL_INCLUDE_FILES
${
API_REL_INCLUDE_FILES
}
${
dir
}
/
${
filename
}
)
ENDFOREACH
(
pathname
)
ENDFOREACH
(
dir
)
# collect up source files
SET
(
SOURCE_FILES
)
# empty
SET
(
SOURCE_INCLUDE_FILES
)
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
FILE
(
GLOB src_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.cpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*/*.cpp
)
FILE
(
GLOB incl_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.h
)
SET
(
SOURCE_FILES
${
SOURCE_FILES
}
${
src_files
}
)
#append
SET
(
SOURCE_INCLUDE_FILES
${
SOURCE_INCLUDE_FILES
}
${
incl_files
}
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
ENDFOREACH
(
subdir
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
SET
(
FINDCUDA_DIR
${
CMAKE_CURRENT_SOURCE_DIR
}
/cuda-cmake
)
SUBDIRS
(
sharedTarget staticTarget
)
#---------------------------------------------------
# OpenMM CUDA Platform
#
# Creates OpenMM library, base name=OpenMMCuda.
# Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d).
#
# Windows:
# OpenMMCuda[_d].dll
# OpenMMCuda[_d].lib
# OpenMMCuda_static[_d].lib
# Unix:
# libOpenMMCuda[_d].so
# libOpenMMCuda_static[_d].a
#----------------------------------------------------
SUBDIRS
(
tests
)
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET
(
OPENMM_SOURCE_SUBDIRS .
)
# Collect up information about the version of the OpenMM library we're building
# and make it available to the code so it can be built into the binaries.
SET
(
OPENMMCUDA_LIBRARY_NAME OpenMMCuda
)
SET
(
SHARED_TARGET
${
OPENMMCUDA_LIBRARY_NAME
}
)
SET
(
STATIC_TARGET
${
OPENMMCUDA_LIBRARY_NAME
}
_static
)
# Ensure that debug libraries have "_d" appended to their names.
# CMake gets this right on Windows automatically with this definition.
IF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
SET
(
CMAKE_DEBUG_POSTFIX
"_d"
CACHE INTERNAL
""
FORCE
)
ENDIF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
# But on Unix or Cygwin we have to add the suffix manually
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
SHARED_TARGET
${
SHARED_TARGET
}
_d
)
SET
(
STATIC_TARGET
${
STATIC_TARGET
}
_d
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
# These are all the places to search for header files which are
# to be part of the API.
SET
(
API_INCLUDE_DIRS
)
# start empty
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
# append
SET
(
API_INCLUDE_DIRS
${
API_INCLUDE_DIRS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include/internal
)
ENDFOREACH
(
subdir
)
# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
# and absolute pathnames.
SET
(
API_REL_INCLUDE_FILES
)
# start these out empty
SET
(
API_ABS_INCLUDE_FILES
)
FOREACH
(
dir
${
API_INCLUDE_DIRS
}
)
FILE
(
GLOB fullpaths
${
dir
}
/*.h
)
# returns full pathnames
SET
(
API_ABS_INCLUDE_FILES
${
API_ABS_INCLUDE_FILES
}
${
fullpaths
}
)
FOREACH
(
pathname
${
fullpaths
}
)
GET_FILENAME_COMPONENT
(
filename
${
pathname
}
NAME
)
SET
(
API_REL_INCLUDE_FILES
${
API_REL_INCLUDE_FILES
}
${
dir
}
/
${
filename
}
)
ENDFOREACH
(
pathname
)
ENDFOREACH
(
dir
)
# collect up source files
SET
(
SOURCE_FILES
)
# empty
SET
(
SOURCE_INCLUDE_FILES
)
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
FILE
(
GLOB src_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.cpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*/*.cpp
)
FILE
(
GLOB incl_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.h
)
SET
(
SOURCE_FILES
${
SOURCE_FILES
}
${
src_files
}
)
#append
SET
(
SOURCE_INCLUDE_FILES
${
SOURCE_INCLUDE_FILES
}
${
incl_files
}
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
ENDFOREACH
(
subdir
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
SET
(
FINDCUDA_DIR
${
CMAKE_CURRENT_SOURCE_DIR
}
/cuda-cmake
)
IF
(
APPLE
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/mac
)
ELSE
(
APPLE
)
IF
(
WIN32
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/win
)
INSTALL_FILES
(
/lib FILES
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/win/cudpp32.dll
)
ELSE
(
WIN32
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/linux
)
ENDIF
(
WIN32
)
ENDIF
(
APPLE
)
SUBDIRS
(
sharedTarget staticTarget
)
platforms/cuda/cudpp/mac/libcudpp.a
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/mac/libcutil.a
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cudpp.lib
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cudpp32.dll
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cutil.lib
0 → 100644
View file @
1010df33
File added
platforms/cuda/src/CudaStreamFactory.cpp
View file @
1010df33
...
...
@@ -39,42 +39,40 @@
using
namespace
OpenMM
;
StreamImpl
*
CudaStreamFactory
::
createStreamImpl
(
std
::
string
name
,
int
size
,
Stream
::
DataType
type
,
const
Platform
&
platform
,
OpenMMContextImpl
&
context
)
const
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
if
(
name
==
"particlePositions"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
float
padding
[]
=
{
100000.0
f
,
100000.0
f
,
100000.0
f
,
0.2
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psPosq4
,
4
,
padding
,
data
.
gpu
);
}
if
(
name
==
"particleVelocities"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psVelm4
,
4
,
padding
,
data
.
gpu
);
}
if
(
name
==
"particleForces"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psForce4
,
4
,
padding
,
data
.
gpu
);
}
switch
(
type
)
{
case
Stream
::
Float
:
case
Stream
::
Double
:
return
new
CudaStreamImpl
<
float1
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float1
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float2
:
case
Stream
::
Double2
:
return
new
CudaStreamImpl
<
float2
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float2
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float3
:
case
Stream
::
Double3
:
return
new
CudaStreamImpl
<
float3
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float3
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float4
:
case
Stream
::
Double4
:
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer
:
return
new
CudaStreamImpl
<
int1
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int1
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer2
:
return
new
CudaStreamImpl
<
int2
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int2
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer3
:
return
new
CudaStreamImpl
<
int3
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int3
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer4
:
return
new
CudaStreamImpl
<
int4
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int4
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
}
throw
OpenMMException
(
"Tried to create a Stream with an illegal DataType."
);
}
platforms/cuda/src/CudaStreamImpl.h
View file @
1010df33
...
...
@@ -55,7 +55,7 @@ public:
private:
void
initType
();
CUDAStream
<
T
>*
stream
;
_gpuContext
*
gpu
;
_gpuContext
*
gpu
;
bool
ownStream
;
int
width
,
rowOffset
;
float
paddingValues
[
4
];
...
...
@@ -132,58 +132,60 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
template
<
class
T
>
void
CudaStreamImpl
<
T
>::
loadFromArray
(
const
void
*
array
)
{
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
int
*
order
=
gpu
->
psAtomIndex
->
_pSysData
;
if
(
baseType
==
Stream
::
Float
)
{
float
*
arrayData
=
(
float
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
arrayData
[
order
[
i
]
*
width
+
j
];
}
else
if
(
baseType
==
Stream
::
Double
)
{
double
*
arrayData
=
(
double
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
order
[
i
]
*
width
+
j
];
}
else
{
int
*
arrayData
=
(
int
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
order
[
i
]
*
width
+
j
];
}
for
(
int
i
=
getSize
();
i
<
(
int
)
stream
->
_length
;
++
i
)
for
(
int
j
=
0
;
j
<
rowOffset
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
paddingValues
[
j
];
stream
->
Upload
();
// VisualStudio compiler did not like stream == gpu->psPosq4
//if( gpu && stream == gpu->psPosq4 ){
// VisualStudio compiler did not like stream == gpu->psPosq4
//if( gpu && stream == gpu->psPosq4 ){
if
(
gpu
&&
getName
()
==
"particlePositions"
){
gpu
->
bRecalculateBornRadii
=
true
;
}
if
(
gpu
&&
getName
()
==
"particlePositions"
){
gpu
->
bRecalculateBornRadii
=
true
;
}
}
template
<
class
T
>
void
CudaStreamImpl
<
T
>::
saveToArray
(
void
*
array
)
{
stream
->
Download
();
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
int
*
order
=
gpu
->
psAtomIndex
->
_pSysData
;
if
(
baseType
==
Stream
::
Float
)
{
float
*
arrayData
=
(
float
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
}
else
if
(
baseType
==
Stream
::
Double
)
{
double
*
arrayData
=
(
double
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
}
else
{
int
*
arrayData
=
(
int
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
(
int
)
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
(
int
)
data
[
i
*
rowOffset
+
j
];
}
}
...
...
platforms/cuda/src/kernels/cudaKernels.h
View file @
1010df33
...
...
@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
extern
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateCDLJObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces_12
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kReduceObcGbsaBornForces
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2_12
(
gpuContext
gpu
);
extern
void
kCalculateLocalForces
(
gpuContext
gpu
);
extern
void
kCalculateAndersenThermostat
(
gpuContext
gpu
);
extern
void
kReduceBornSumAndForces
(
gpuContext
gpu
);
extern
void
kUpdatePart1
(
gpuContext
gpu
);
extern
void
kApplyFirstShake
(
gpuContext
gpu
);
extern
void
kApplyFirstSettle
(
gpuContext
gpu
);
extern
void
kUpdatePart2
(
gpuContext
gpu
);
extern
void
kApplySecondShake
(
gpuContext
gpu
);
extern
void
kApplySecondSettle
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart1
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart2
(
gpuContext
gpu
);
extern
void
kBrownianUpdatePart1
(
gpuContext
gpu
);
...
...
@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
// Initializers
extern
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForces_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForces_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
);
...
...
@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern
void
GetCalculateObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
GetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
SetForcesSim
(
gpuContext
gpu
);
extern
void
GetForcesSim
(
gpuContext
gpu
);
extern
void
SetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
GetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
SetSettleSim
(
gpuContext
gpu
);
extern
void
GetSettleSim
(
gpuContext
gpu
);
extern
void
SetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
GetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
SetBrownianUpdateSim
(
gpuContext
gpu
);
...
...
platforms/cuda/src/kernels/cudatypes.h
View file @
1010df33
...
...
@@ -36,11 +36,12 @@
#include <limits>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <vector_functions.h>
using
namespace
std
;
#define RTERROR(status, s) \
if
(
status
!=
cudaSuccess
)
{
\
...
...
@@ -122,7 +123,7 @@ CUDAStream<T>::~CUDAStream()
template
<
typename
T
>
void
CUDAStream
<
T
>::
Allocate
()
{
cudaError_t
status
;
cudaError_t
status
;
_pSysStream
=
new
T
*
[
_subStreams
];
_pDevStream
=
new
T
*
[
_subStreams
];
_pSysData
=
new
T
[
_subStreams
*
_stride
];
...
...
@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static
const
int
G8X_NONBOND_WORKUNITS_PER_SM
=
220
;
static
const
int
GT2XX_NONBOND_WORKUNITS_PER_SM
=
256
;
enum
CudaNonbondedMethod
{
NO_CUTOFF
,
CUTOFF
,
PERIODIC
};
struct
cudaGmxSimulation
{
// Constants
...
...
@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
unsigned
int
blocks
;
// Number of blocks to launch across linear kernels
unsigned
int
nonbond_blocks
;
// Number of blocks to launch across CDLJ and Born Force Part1
unsigned
int
bornForce2_blocks
;
// Number of blocks to launch across Born Force 2
unsigned
int
interaction_blocks
;
// Number of blocks to launch when identifying interacting tiles
unsigned
int
threads_per_block
;
// Threads per block to launch
unsigned
int
nonbond_threads_per_block
;
// Threads per block in nonbond kernel calls
unsigned
int
bornForce2_threads_per_block
;
// Threads per block in nonbond kernel calls
...
...
@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
unsigned
int
bsf_reduce_threads_per_block
;
// Threads per block in Born Sum And Forces reduction calls
unsigned
int
max_shake_threads_per_block
;
// Maximum threads per block in shake kernel calls
unsigned
int
shake_threads_per_block
;
// Threads per block in shake kernel calls
unsigned
int
settle_threads_per_block
;
// Threads per block in SETTLE kernel calls
unsigned
int
nonshake_threads_per_block
;
// Threads per block in nonshaking kernel call
unsigned
int
max_localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
random_threads_per_block
;
// Threads per block in RNG kernel calls
unsigned
int
interaction_threads_per_block
;
// Threads per block when identifying interacting tiles
unsigned
int
workUnits
;
// Number of work units
unsigned
int
*
pWorkUnit
;
// Pointer to work units
unsigned
int
*
pInteractingWorkUnit
;
// Pointer to work units that have interactions
unsigned
int
*
pInteractionFlag
;
// Flags for which work units have interactions
size_t
*
pInteractionCount
;
// A count of the number of work units which have interactions
unsigned
int
nonbond_workBlock
;
// Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned
int
bornForce2_workBlock
;
// Number of work units running second half of Born Forces calculation
unsigned
int
workUnitsPerSM
;
// Number of workblocks per SM
...
...
@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
unsigned
int
outputBuffers
;
// Number of output buffers
float
bigFloat
;
// Floating point value used as a flag for Shaken atoms
float
epsfac
;
// Epsilon factor for CDLJ calculations
CudaNonbondedMethod
nonbondedMethod
;
// How to handle nonbonded interactions
float
nonbondedCutoffSqr
;
// Cutoff distance for CDLJ calculations
float
periodicBoxSizeX
;
// The X dimension of the periodic box
float
periodicBoxSizeY
;
// The Y dimension of the periodic box
float
periodicBoxSizeZ
;
// The Z dimension of the periodic box
float
reactionFieldK
;
// Constant for reaction field correction
float
probeRadius
;
// SASA probe radius
float
surfaceAreaFactor
;
// ACE approximation surface area factor
float
electricConstant
;
// ACE approximation electric constant
...
...
@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
float4
*
pLJ14Parameter
;
// Lennard Jones 1-4 parameters
float
inverseTotalMass
;
// Used in linear momentum removal
unsigned
int
ShakeConstraints
;
// Total number of Shake constraints
unsigned
int
settleConstraints
;
// Total number of Settle constraints
unsigned
int
NonShakeConstraints
;
// Total number of NonShake atoms
unsigned
int
maxShakeIterations
;
// Maximum shake iterations
unsigned
int
degreesOfFreedom
;
// Number of degrees of freedom in system
...
...
@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
int
*
pNonShakeID
;
// Not Shaking atoms
int4
*
pShakeID
;
// Shake atoms and phase
float4
*
pShakeParameter
;
// Shake parameters
int4
*
pSettleID
;
// Settle atoms
float2
*
pSettleParameter
;
// Settle parameters
unsigned
int
*
pExclusion
;
// Nonbond exclusion data
unsigned
int
bond_offset
;
// Offset to end of bonds
unsigned
int
bond_angle_offset
;
// Offset to end of bond angles
unsigned
int
dihedral_offset
;
// Offset to end of dihedrals
unsigned
int
rb_dihedral_offset
;
// Offset to end of Ryckaert Bellemans dihedrals
unsigned
int
LJ14_offset
;
// Offset to end of Lennard Jones 1-4 parameters
int
*
pAtomIndex
;
// The original index of each atom
float4
*
pGridBoundingBox
;
// The size of each grid cell
float4
*
pGridCenter
;
// The center of each grid cell
// Mutable stuff
float4
*
pPosq
;
// Pointer to atom positions and charges
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
1010df33
...
...
@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
...
...
@@ -40,6 +41,7 @@
#include <ctime>
#include <cmath>
#include <map>
#include <algorithm>
#ifdef WIN32
#include <windows.h>
#else
...
...
@@ -148,6 +150,23 @@ struct ShakeCluster {
}
};
struct
Constraint
{
Constraint
(
int
atom1
,
int
atom2
,
float
distance2
)
:
atom1
(
atom1
),
atom2
(
atom2
),
distance2
(
distance2
)
{
}
int
atom1
,
atom2
;
float
distance2
;
};
struct
Molecule
{
vector
<
int
>
atoms
;
vector
<
int
>
bonds
;
vector
<
int
>
angles
;
vector
<
int
>
periodicTorsions
;
vector
<
int
>
rbTorsions
;
vector
<
int
>
constraints
;
};
static
const
float
dielectricOffset
=
0.009
f
;
static
const
float
PI
=
3.1415926535
f
;
static
const
float
probeRadius
=
0.14
f
;
...
...
@@ -793,7 +812,7 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
}
}
cout
<<
total_exclusions
<<
" total exclusions.
\n
"
;
gpuSetCoulombParameters
(
gpu
,
epsfac
,
atom
,
c6
,
c12
,
q
,
symbol
,
exclusions
);
gpuSetCoulombParameters
(
gpu
,
epsfac
,
atom
,
c6
,
c12
,
q
,
symbol
,
exclusions
,
NO_CUTOFF
);
gpuSetObcParameters
(
gpu
,
defaultInnerDielectric
,
defaultSolventDielectric
,
atom
,
radius
,
scale
);
return
coulombs
;
}
...
...
@@ -807,11 +826,12 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
extern
"C"
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
vector
<
int
>&
atom
,
const
vector
<
float
>&
c6
,
const
vector
<
float
>&
c12
,
const
vector
<
float
>&
q
,
const
vector
<
char
>&
symbol
,
const
vector
<
vector
<
int
>
>&
exclusions
)
const
vector
<
char
>&
symbol
,
const
vector
<
vector
<
int
>
>&
exclusions
,
CudaNonbondedMethod
method
)
{
unsigned
int
coulombs
=
atom
.
size
();
gpu
->
sim
.
epsfac
=
epsfac
;
unsigned
int
total_exclusions
=
0
;
gpu
->
sim
.
nonbondedMethod
=
method
;
gpu
->
exclusions
=
exclusions
;
for
(
unsigned
int
i
=
0
;
i
<
coulombs
;
i
++
)
{
...
...
@@ -827,34 +847,6 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu
->
psPosq4
->
_pSysStream
[
0
][
i
].
w
=
p0
;
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
x
=
p1
;
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
y
=
p2
;
#if (DUMP_PARAMETERS == 1)
cout
<<
i
<<
" "
<<
gpu
->
psPosq4
->
_pSysStream
[
0
][
i
].
w
<<
" "
<<
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
x
<<
" "
<<
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
y
<<
" "
<<
p0
<<
" "
<<
p1
<<
" "
<<
p2
<<
" "
<<
exclusions
;
#endif
for
(
int
j
=
0
;
j
<
(
int
)
exclusions
[
i
].
size
();
j
++
)
{
#if (DUMP_PARAMETERS == 1)
cout
<<
" "
<<
exclusions
[
i
][
j
];
#endif
gpu
->
pExclusion
[
i
*
gpu
->
sim
.
paddedNumberOfAtoms
+
exclusions
[
i
][
j
]]
=
0
;
if
(
i
>=
(
int
)
exclusions
[
i
][
j
])
{
total_exclusions
++
;
}
}
#if (DUMP_PARAMETERS == 1)
cout
<<
endl
;
#endif
}
// Dummy out extra atom data
...
...
@@ -868,36 +860,31 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
y
=
0.0
f
;
}
// Add in remaining exclusions
for
(
unsigned
int
i
=
coulombs
;
i
<
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
gpu
->
sim
.
paddedNumberOfAtoms
;
j
++
)
{
gpu
->
pExclusion
[
i
*
gpu
->
sim
.
paddedNumberOfAtoms
+
j
]
=
0
;
gpu
->
pExclusion
[
j
*
gpu
->
sim
.
paddedNumberOfAtoms
+
i
]
=
0
;
}
}
gpu
->
psPosq4
->
Upload
();
gpu
->
psSigEps2
->
Upload
();
}
// Check for exclusion consistency
for
(
unsigned
int
i
=
0
;
i
<
coulombs
;
i
++
)
{
for
(
unsigned
int
j
=
i
;
j
<
coulombs
;
j
++
)
{
extern
"C"
void
gpuSetNonbondedCutoff
(
gpuContext
gpu
,
float
cutoffDistance
,
float
solventDielectric
)
{
gpu
->
sim
.
nonbondedCutoffSqr
=
cutoffDistance
*
cutoffDistance
;
gpu
->
sim
.
reactionFieldK
=
pow
(
cutoffDistance
,
-
3.0
f
)
*
(
solventDielectric
-
1.0
f
)
/
(
2.0
f
*
solventDielectric
+
1.0
f
);
}
if
(
gpu
->
pExclusion
[
i
*
gpu
->
sim
.
paddedNumberOfAtoms
+
j
]
!=
gpu
->
pExclusion
[
j
*
gpu
->
sim
.
paddedNumberOfAtoms
+
i
])
cout
<<
"Warning: inconsistent exclusion betweens atoms "
<<
i
<<
" and "
<<
j
<<
endl
;
}
}
extern
"C"
void
gpuSetPeriodicBoxSize
(
gpuContext
gpu
,
float
xsize
,
float
ysize
,
float
zsize
)
{
gpu
->
sim
.
periodicBoxSizeX
=
xsize
;
gpu
->
sim
.
periodicBoxSizeY
=
ysize
;
gpu
->
sim
.
periodicBoxSizeZ
=
zsize
;
}
extern
"C"
void
gpuSetObcParameters
(
gpuContext
gpu
,
float
innerDielectric
,
float
solventDielectric
,
const
vector
<
int
>&
atom
,
const
vector
<
float
>&
radius
,
const
vector
<
float
>&
scale
)
{
unsigned
int
atoms
=
atom
.
size
();
gpu
->
bIncludeGBSA
=
true
;
for
(
unsigned
int
i
=
0
;
i
<
atoms
;
i
++
)
{
gpu
->
psObcData
->
_pSysStream
[
0
][
i
].
x
=
radius
[
i
]
-
dielectricOffset
;
...
...
@@ -973,11 +960,91 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
constraintCount
[
atom1
[
i
]]
++
;
constraintCount
[
atom2
[
i
]]
++
;
}
// Identify clusters of three atoms that can be treated with SETTLE. First, for every
// atom that might be part of such a cluster, make a list of the two other atoms it is
// connected to.
vector
<
map
<
int
,
float
>
>
settleConstraints
(
gpu
->
natoms
);
for
(
int
i
=
0
;
i
<
atom1
.
size
();
i
++
)
{
if
(
constraintCount
[
atom1
[
i
]]
==
2
&&
constraintCount
[
atom2
[
i
]]
==
2
)
{
settleConstraints
[
atom1
[
i
]][
atom2
[
i
]]
=
distance
[
i
];
settleConstraints
[
atom2
[
i
]][
atom1
[
i
]]
=
distance
[
i
];
}
}
// Now remove the ones that don't actually form closed loops of three atoms.
vector
<
int
>
settleClusters
;
for
(
int
i
=
0
;
i
<
settleConstraints
.
size
();
i
++
)
{
if
(
settleConstraints
[
i
].
size
()
==
2
)
{
int
partner1
=
settleConstraints
[
i
].
begin
()
->
first
;
int
partner2
=
(
++
settleConstraints
[
i
].
begin
())
->
first
;
if
(
settleConstraints
[
partner1
].
size
()
!=
2
||
settleConstraints
[
partner2
].
size
()
!=
2
||
settleConstraints
[
partner1
].
find
(
partner2
)
==
settleConstraints
[
partner1
].
end
())
settleConstraints
[
i
].
clear
();
else
if
(
i
<
partner1
&&
i
<
partner2
)
settleClusters
.
push_back
(
i
);
}
else
settleConstraints
[
i
].
clear
();
}
// Record the actual SETTLE clusters.
CUDAStream
<
int4
>*
psSettleID
=
new
CUDAStream
<
int4
>
((
int
)
settleClusters
.
size
(),
1
);
gpu
->
psSettleID
=
psSettleID
;
gpu
->
sim
.
pSettleID
=
psSettleID
->
_pDevStream
[
0
];
CUDAStream
<
float2
>*
psSettleParameter
=
new
CUDAStream
<
float2
>
((
int
)
settleClusters
.
size
(),
1
);
gpu
->
psSettleParameter
=
psSettleParameter
;
gpu
->
sim
.
pSettleParameter
=
psSettleParameter
->
_pDevStream
[
0
];
gpu
->
sim
.
settleConstraints
=
settleClusters
.
size
();
for
(
int
i
=
0
;
i
<
settleClusters
.
size
();
i
++
)
{
int
atom1
=
settleClusters
[
i
];
int
atom2
=
settleConstraints
[
atom1
].
begin
()
->
first
;
int
atom3
=
(
++
settleConstraints
[
atom1
].
begin
())
->
first
;
float
dist12
=
settleConstraints
[
atom1
].
find
(
atom2
)
->
second
;
float
dist13
=
settleConstraints
[
atom1
].
find
(
atom3
)
->
second
;
float
dist23
=
settleConstraints
[
atom2
].
find
(
atom3
)
->
second
;
if
(
dist12
==
dist13
)
{
// atom1 is the central atom
psSettleID
->
_pSysData
[
i
].
x
=
atom1
;
psSettleID
->
_pSysData
[
i
].
y
=
atom2
;
psSettleID
->
_pSysData
[
i
].
z
=
atom3
;
psSettleParameter
->
_pSysData
[
i
].
x
=
dist12
;
psSettleParameter
->
_pSysData
[
i
].
y
=
dist23
;
}
else
if
(
dist12
==
dist23
)
{
// atom2 is the central atom
psSettleID
->
_pSysData
[
i
].
x
=
atom2
;
psSettleID
->
_pSysData
[
i
].
y
=
atom1
;
psSettleID
->
_pSysData
[
i
].
z
=
atom3
;
psSettleParameter
->
_pSysData
[
i
].
x
=
dist12
;
psSettleParameter
->
_pSysData
[
i
].
y
=
dist13
;
}
else
if
(
dist13
==
dist23
)
{
// atom3 is the central atom
psSettleID
->
_pSysData
[
i
].
x
=
atom3
;
psSettleID
->
_pSysData
[
i
].
y
=
atom1
;
psSettleID
->
_pSysData
[
i
].
z
=
atom2
;
psSettleParameter
->
_pSysData
[
i
].
x
=
dist13
;
psSettleParameter
->
_pSysData
[
i
].
y
=
dist12
;
}
else
throw
OpenMMException
(
"Two of the three distances constrained with SETTLE must be the same."
);
}
psSettleID
->
Upload
();
psSettleParameter
->
Upload
();
gpu
->
sim
.
settle_threads_per_block
=
(
gpu
->
sim
.
settleConstraints
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
if
(
gpu
->
sim
.
settle_threads_per_block
>
gpu
->
sim
.
max_shake_threads_per_block
)
gpu
->
sim
.
settle_threads_per_block
=
gpu
->
sim
.
max_shake_threads_per_block
;
if
(
gpu
->
sim
.
settle_threads_per_block
<
1
)
gpu
->
sim
.
settle_threads_per_block
=
1
;
// Find clusters consisting of a central atom with up to three peripheral atoms.
map
<
int
,
ShakeCluster
>
clusters
;
for
(
int
i
=
0
;
i
<
atom1
.
size
();
i
++
)
{
if
(
settleConstraints
[
atom1
[
i
]].
size
()
==
2
)
continue
;
// This is being taken care of with SETTLE.
// Determine which is the central atom.
bool
firstIsCentral
;
...
...
@@ -1096,9 +1163,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu
->
sim
.
degreesOfFreedom
=
3
*
gpu
->
sim
.
atoms
-
6
;
gpu
->
gpAtomTable
=
NULL
;
gpu
->
gAtomTypes
=
0
;
gpu
->
sim
.
nonbondOutputBuffers
=
gpu
->
sim
.
paddedNumberOfAtoms
/
GRID
;
gpu
->
sim
.
totalNonbondOutputBuffers
=
2
*
gpu
->
sim
.
nonbondOutputBuffers
;
gpu
->
sim
.
outputBuffers
=
gpu
->
sim
.
totalNonbondOutputBuffers
;
gpu
->
psPosq4
=
new
CUDAStream
<
float4
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
sim
.
stride
=
gpu
->
psPosq4
->
_stride
;
gpu
->
sim
.
stride2
=
gpu
->
sim
.
stride
*
2
;
...
...
@@ -1129,10 +1193,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu
->
psObcData
=
new
CUDAStream
<
float2
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
sim
.
pObcData
=
gpu
->
psObcData
->
_pDevStream
[
0
];
gpu
->
pAtomSymbol
=
new
unsigned
char
[
gpu
->
natoms
];
gpu
->
psAtomIndex
=
new
CUDAStream
<
int
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
sim
.
pAtomIndex
=
gpu
->
psAtomIndex
->
_pDevStream
[
0
];
for
(
int
i
=
0
;
i
<
(
int
)
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
gpu
->
psAtomIndex
->
_pSysStream
[
0
][
i
]
=
i
;
gpu
->
psAtomIndex
->
Upload
();
// Determine randoms
gpu
->
seed
=
(
unsigned
long
)
time
(
NULL
)
&
0x000fffff
;
gpu
->
sim
.
randomFrames
=
9
95
;
gpu
->
sim
.
randomFrames
=
95
;
gpu
->
sim
.
randomIterations
=
gpu
->
sim
.
randomFrames
;
gpu
->
sim
.
randoms
=
gpu
->
sim
.
randomFrames
*
gpu
->
sim
.
paddedNumberOfAtoms
-
5
*
GRID
;
gpu
->
sim
.
totalRandoms
=
gpu
->
sim
.
randoms
+
gpu
->
sim
.
paddedNumberOfAtoms
;
...
...
@@ -1351,8 +1419,8 @@ void* gpuInit(int numAtoms)
{
sscanf
(
pAdapter
,
"%d"
,
&
device
);
}
cudaError_t
status
=
cudaSetDevice
(
device
);
RTERROR
(
status
,
"Error setting CUDA device"
)
//
cudaError_t status = cudaSetDevice(device);
//
RTERROR(status, "Error setting CUDA device")
// Determine which core to run on
#if 0
...
...
@@ -1445,6 +1513,8 @@ void* gpuInit(int numAtoms)
gpu
->
sim
.
probeRadius
=
probeRadius
;
gpu
->
sim
.
surfaceAreaFactor
=
surfaceAreaFactor
;
gpu
->
sim
.
electricConstant
=
electricConstant
;
gpu
->
sim
.
nonbondedMethod
=
NO_CUTOFF
;
gpu
->
sim
.
nonbondedCutoffSqr
=
0.0
f
;
gpu
->
sim
.
bigFloat
=
99999999.0
f
;
gpu
->
sim
.
forceConversionFactor
=
forceConversionFactor
;
...
...
@@ -1461,6 +1531,7 @@ void* gpuInit(int numAtoms)
gpu
->
bCalculateCM
=
false
;
gpu
->
bRemoveCM
=
false
;
gpu
->
bRecalculateBornRadii
=
true
;
gpu
->
bIncludeGBSA
=
false
;
gpuInitializeRandoms
(
gpu
);
// To be determined later
...
...
@@ -1489,18 +1560,20 @@ void* gpuInit(int numAtoms)
gpu
->
psLJ14Parameter
=
NULL
;
gpu
->
psShakeID
=
NULL
;
gpu
->
psShakeParameter
=
NULL
;
gpu
->
psSettleID
=
NULL
;
gpu
->
psSettleParameter
=
NULL
;
gpu
->
psExclusion
=
NULL
;
gpu
->
psWorkUnit
=
NULL
;
gpu
->
psInteractingWorkUnit
=
NULL
;
gpu
->
psInteractionFlag
=
NULL
;
gpu
->
psInteractionCount
=
NULL
;
gpu
->
psGridBoundingBox
=
NULL
;
gpu
->
psGridCenter
=
NULL
;
// Initialize output buffer before reading parameters
gpu
->
pOutputBufferCounter
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
];
memset
(
gpu
->
pOutputBufferCounter
,
0
,
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
unsigned
int
));
// Initialize exclusion array
gpu
->
pExclusion
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
paddedNumberOfAtoms
];
for
(
unsigned
int
i
=
0
;
i
<
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
gpu
->
pExclusion
[
i
]
=
1
;
return
(
void
*
)
gpu
;
}
...
...
@@ -1586,7 +1659,6 @@ void gpuShutDown(gpuContext gpu)
{
// Delete sysmem pointers
delete
[]
gpu
->
pOutputBufferCounter
;
delete
[]
gpu
->
pExclusion
;
delete
[]
gpu
->
gpAtomTable
;
delete
[]
gpu
->
pAtomSymbol
;
...
...
@@ -1620,13 +1692,23 @@ void gpuShutDown(gpuContext gpu)
delete
gpu
->
psLJ14Parameter
;
delete
gpu
->
psShakeID
;
delete
gpu
->
psShakeParameter
;
delete
gpu
->
psSettleID
;
delete
gpu
->
psSettleParameter
;
delete
gpu
->
psExclusion
;
delete
gpu
->
psWorkUnit
;
delete
gpu
->
psInteractingWorkUnit
;
delete
gpu
->
psInteractionFlag
;
delete
gpu
->
psInteractionCount
;
delete
gpu
->
psRandom4
;
delete
gpu
->
psRandom2
;
delete
gpu
->
psRandomPosition
;
delete
gpu
->
psRandomSeed
;
delete
gpu
->
psLinearMomentum
;
delete
gpu
->
psAtomIndex
;
delete
gpu
->
psGridBoundingBox
;
delete
gpu
->
psGridCenter
;
if
(
gpu
->
cudpp
!=
0
)
cudppDestroyPlan
(
gpu
->
cudpp
);
// Wrap up
delete
gpu
;
...
...
@@ -1636,6 +1718,19 @@ void gpuShutDown(gpuContext gpu)
extern
"C"
int
gpuBuildOutputBuffers
(
gpuContext
gpu
)
{
// Select the number of output buffer to use.
gpu
->
bOutputBufferPerWarp
=
true
;
gpu
->
sim
.
nonbondOutputBuffers
=
gpu
->
sim
.
nonbond_blocks
*
gpu
->
sim
.
nonbond_threads_per_block
/
GRID
;
if
(
gpu
->
sim
.
nonbondOutputBuffers
>=
gpu
->
sim
.
paddedNumberOfAtoms
/
GRID
)
{
// For small systems, it is more efficient to have one output buffer per block of 32 atoms instead of one per warp.
gpu
->
bOutputBufferPerWarp
=
false
;
gpu
->
sim
.
nonbondOutputBuffers
=
gpu
->
sim
.
paddedNumberOfAtoms
/
GRID
;
}
gpu
->
sim
.
totalNonbondOutputBuffers
=
(
gpu
->
bIncludeGBSA
?
2
*
gpu
->
sim
.
nonbondOutputBuffers
:
gpu
->
sim
.
nonbondOutputBuffers
);
gpu
->
sim
.
outputBuffers
=
gpu
->
sim
.
totalNonbondOutputBuffers
;
unsigned
int
outputBuffers
=
gpu
->
sim
.
totalNonbondOutputBuffers
;
for
(
unsigned
int
i
=
0
;
i
<
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
{
...
...
@@ -1715,30 +1810,59 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
const
unsigned
int
grid
=
gpu
->
grid
;
const
unsigned
int
dim
=
(
atoms
+
(
grid
-
1
))
/
grid
;
const
unsigned
int
cells
=
dim
*
(
dim
+
1
)
/
2
;
const
unsigned
int
*
pExclusion
=
gpu
->
pExclusion
;
CUDAStream
<
unsigned
int
>*
psWorkUnit
=
new
CUDAStream
<
unsigned
int
>
(
cells
,
1u
);
unsigned
int
*
pWorkList
=
psWorkUnit
->
_pSysStream
[
0
];
gpu
->
psWorkUnit
=
psWorkUnit
;
gpu
->
sim
.
pWorkUnit
=
psWorkUnit
->
_pDevStream
[
0
];
CUDAStream
<
unsigned
int
>*
psInteractingWorkUnit
=
new
CUDAStream
<
unsigned
int
>
(
cells
,
1u
);
gpu
->
psInteractingWorkUnit
=
psInteractingWorkUnit
;
gpu
->
sim
.
pInteractingWorkUnit
=
psInteractingWorkUnit
->
_pDevStream
[
0
];
CUDAStream
<
unsigned
int
>*
psInteractionFlag
=
new
CUDAStream
<
unsigned
int
>
(
cells
,
1u
);
gpu
->
psInteractionFlag
=
psInteractionFlag
;
gpu
->
sim
.
pInteractionFlag
=
psInteractionFlag
->
_pDevStream
[
0
];
CUDAStream
<
size_t
>*
psInteractionCount
=
new
CUDAStream
<
size_t
>
(
1
,
1u
);
gpu
->
psInteractionCount
=
psInteractionCount
;
gpu
->
sim
.
pInteractionCount
=
psInteractionCount
->
_pDevStream
[
0
];
CUDAStream
<
float4
>*
psGridBoundingBox
=
new
CUDAStream
<
float4
>
(
dim
,
1u
);
gpu
->
psGridBoundingBox
=
psGridBoundingBox
;
gpu
->
sim
.
pGridBoundingBox
=
psGridBoundingBox
->
_pDevStream
[
0
];
CUDAStream
<
float4
>*
psGridCenter
=
new
CUDAStream
<
float4
>
(
dim
,
1u
);
gpu
->
psGridCenter
=
psGridCenter
;
gpu
->
sim
.
pGridCenter
=
psGridCenter
->
_pDevStream
[
0
];
gpu
->
sim
.
nonbond_workBlock
=
gpu
->
sim
.
nonbond_threads_per_block
/
GRID
;
gpu
->
sim
.
bornForce2_workBlock
=
gpu
->
sim
.
bornForce2_threads_per_block
/
GRID
;
gpu
->
sim
.
workUnits
=
cells
;
// Increase block count if necessary for extra large molecules that would
// otherwise overflow the SM workunit buffers
int
minimumBlocks
=
(
cells
+
gpu
->
sim
.
workUnitsPerSM
-
1
)
/
gpu
->
sim
.
workUnitsPerSM
;
if
((
int
)
gpu
->
sim
.
nonbond_blocks
<
minimumBlocks
)
// Initialize the CUDPP workspace.
gpu
->
cudpp
=
0
;
CUDPPConfiguration
config
;
config
.
datatype
=
CUDPP_UINT
;
config
.
algorithm
=
CUDPP_COMPACT
;
config
.
options
=
CUDPP_OPTION_FORWARD
;
CUDPPResult
result
=
cudppPlan
(
&
gpu
->
cudpp
,
config
,
cells
,
1
,
0
);
if
(
CUDPP_SUCCESS
!=
result
)
{
gpu
->
sim
.
nonbond_blocks
=
gpu
->
sim
.
nonbond_blocks
*
((
minimumBlocks
+
gpu
->
sim
.
nonbond_blocks
-
1
)
/
gpu
->
sim
.
nonbond_blocks
);
}
if
((
int
)
gpu
->
sim
.
bornForce2_blocks
<
minimumBlocks
)
{
gpu
->
sim
.
bornForce2_blocks
=
gpu
->
sim
.
bornForce2_blocks
*
((
minimumBlocks
+
gpu
->
sim
.
bornForce2_blocks
-
1
)
/
gpu
->
sim
.
bornForce2_blocks
);
printf
(
"Error initializing CUDPP: %d
\n
"
,
result
);
exit
(
-
1
);
}
// Increase block count if necessary for extra large molecules that would
// otherwise overflow the SM workunit buffers
// int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
// if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
// {
// gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
// }
// if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
// {
// gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
// }
gpu
->
sim
.
nbWorkUnitsPerBlock
=
cells
/
gpu
->
sim
.
nonbond_blocks
;
gpu
->
sim
.
nbWorkUnitsPerBlockRemainder
=
cells
-
gpu
->
sim
.
nonbond_blocks
*
gpu
->
sim
.
nbWorkUnitsPerBlock
;
gpu
->
sim
.
bf2WorkUnitsPerBlock
=
cells
/
gpu
->
sim
.
bornForce2_blocks
;
gpu
->
sim
.
bf2WorkUnitsPerBlockRemainder
=
cells
-
gpu
->
sim
.
bornForce2_blocks
*
gpu
->
sim
.
bf2WorkUnitsPerBlock
;
gpu
->
sim
.
interaction_threads_per_block
=
64
;
gpu
->
sim
.
interaction_blocks
=
(
gpu
->
sim
.
workUnits
+
gpu
->
sim
.
interaction_threads_per_block
-
1
)
/
gpu
->
sim
.
interaction_threads_per_block
;
// Decrease thread count for extra small molecules to spread computation
// across entire chip
...
...
@@ -1763,23 +1887,6 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
for
(
unsigned
int
x
=
y
;
x
<
dim
;
x
++
)
{
pWorkList
[
count
]
=
(
x
<<
17
)
|
(
y
<<
2
);
// Check for exclusions
int
exclusions
=
0
;
for
(
unsigned
int
i
=
y
*
grid
;
i
<
y
*
grid
+
grid
;
i
++
)
{
for
(
unsigned
int
j
=
x
*
grid
;
j
<
x
*
grid
+
grid
;
j
++
)
{
if
(
!
pExclusion
[
i
*
atoms
+
j
])
{
exclusions
++
;
}
}
}
// Signal exclusions if they exist
if
(
exclusions
>
0
)
pWorkList
[
count
]
|=
0x1
;
count
++
;
}
}
...
...
@@ -1790,42 +1897,58 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
}
extern
"C"
int
gpuBuildExclusionList
(
gpuContext
gpu
)
void
gpuBuildExclusionList
(
gpuContext
gpu
)
{
unsigned
int
atoms
=
gpu
->
sim
.
paddedNumberOfAtoms
;
CUDAStream
<
unsigned
int
>*
psExclusion
=
new
CUDAStream
<
unsigned
int
>
(
atoms
*
atoms
,
1u
);
const
unsigned
int
atoms
=
gpu
->
sim
.
paddedNumberOfAtoms
;
const
unsigned
int
grid
=
gpu
->
grid
;
const
unsigned
int
dim
=
(
atoms
+
(
grid
-
1
))
/
grid
;
CUDAStream
<
unsigned
int
>*
psExclusion
=
new
CUDAStream
<
unsigned
int
>
((
atoms
*
atoms
+
grid
-
1
)
/
grid
,
1u
);
gpu
->
psExclusion
=
psExclusion
;
gpu
->
sim
.
pExclusion
=
psExclusion
->
_pDevStream
[
0
];
unsigned
int
*
pExList
=
psExclusion
->
_pSysStream
[
0
];
int
exclusions
=
0
;
unsigned
int
pos
=
0
;
unsigned
int
*
pWorkList
=
gpu
->
psWorkUnit
->
_pSysStream
[
0
];
for
(
int
i
=
0
;
i
<
psExclusion
->
_length
;
++
i
)
pExList
[
i
]
=
0xFFFFFFFF
;
// Fill in the exclusions.
for
(
unsigned
int
x
=
0
;
x
<
atoms
;
x
+=
gpu
->
grid
)
for
(
int
atom1
=
0
;
atom1
<
gpu
->
exclusions
.
size
();
++
atom1
)
{
for
(
unsigned
int
y
=
0
;
y
<
atoms
;
y
+=
gpu
->
grid
)
{
for
(
unsigned
x1
=
x
;
x1
<
x
+
gpu
->
grid
;
x1
++
)
{
unsigned
int
mask
=
0
;
for
(
unsigned
int
y1
=
y
;
y1
<
y
+
gpu
->
grid
;
y1
++
)
{
mask
>>=
1
;
if
(
gpu
->
pExclusion
[
x1
*
atoms
+
y1
]
==
0
)
{
if
(
x1
>=
y1
)
exclusions
++
;
}
else
mask
|=
0x80000000
;
}
pExList
[
pos
++
]
=
mask
;
}
int
x
=
atom1
/
grid
;
int
offset
=
atom1
-
x
*
grid
;
for
(
int
j
=
0
;
j
<
gpu
->
exclusions
[
atom1
].
size
();
++
j
)
{
int
atom2
=
gpu
->
exclusions
[
atom1
][
j
];
int
y
=
atom2
/
grid
;
int
index
=
x
*
atoms
+
y
*
grid
+
offset
;
pExList
[
index
]
&=
0xFFFFFFFF
-
(
1
<<
(
atom2
-
y
*
grid
));
int
cell
=
(
x
>
y
?
x
+
y
*
dim
-
y
*
(
y
+
1
)
/
2
:
y
+
x
*
dim
-
x
*
(
x
+
1
)
/
2
);
pWorkList
[
cell
]
|=
1
;
}
}
// Mark all interactions that involve a padding atom as being excluded.
for
(
int
atom1
=
gpu
->
natoms
;
atom1
<
atoms
;
++
atom1
)
{
int
x
=
atom1
/
grid
;
int
offset1
=
atom1
-
x
*
grid
;
for
(
int
atom2
=
0
;
atom2
<
atoms
;
++
atom2
)
{
int
y
=
atom2
/
grid
;
int
index
=
x
*
atoms
+
y
*
grid
+
offset1
;
pExList
[
index
]
&=
0xFFFFFFFF
-
(
1
<<
(
atom2
-
y
*
grid
));
int
offset2
=
atom2
-
y
*
grid
;
index
=
y
*
atoms
+
x
*
grid
+
offset2
;
pExList
[
index
]
&=
0xFFFFFFFF
-
(
1
<<
(
atom1
-
x
*
grid
));
int
cell
=
(
x
>
y
?
x
+
y
*
dim
-
y
*
(
y
+
1
)
/
2
:
y
+
x
*
dim
-
x
*
(
x
+
1
)
/
2
);
pWorkList
[
cell
]
|=
1
;
}
}
psExclusion
->
Upload
();
gpu
->
psWorkUnit
->
Upload
();
gpuSetConstants
(
gpu
);
return
exclusions
;
}
extern
"C"
...
...
@@ -1842,14 +1965,12 @@ int gpuSetConstants(gpuContext gpu)
SetUpdateShakeHSim
(
gpu
);
SetVerletUpdateSim
(
gpu
);
SetBrownianUpdateSim
(
gpu
);
SetSettleSim
(
gpu
);
SetRandomSim
(
gpu
);
if
(
gpu
->
sm_version
>=
SM_12
)
{
SetCalculateCDLJForces_12Sim
(
gpu
);
SetCalculateCDLJObcGbsaForces1_12Sim
(
gpu
);
SetCalculateObcGbsaForces1_12Sim
(
gpu
);
SetCalculateObcGbsaForces2_12Sim
(
gpu
);
}
return
1
;
...
...
@@ -2705,3 +2826,329 @@ void gpuDumpObcLoop1(gpuContext gpu)
);
}
}
static
void
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
vector
<
int
>&
atomMolecule
,
vector
<
vector
<
int
>
>&
atomBonds
)
{
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule
[
atom
]
=
molecule
;
for
(
int
i
=
0
;
i
<
atomBonds
[
atom
].
size
();
i
++
)
if
(
atomMolecule
[
atomBonds
[
atom
][
i
]]
==
-
1
)
tagAtomsInMolecule
(
atomBonds
[
atom
][
i
],
molecule
,
atomMolecule
,
atomBonds
);
}
static
void
findMoleculeGroups
(
gpuContext
gpu
)
{
// First make a list of constraints for future use.
vector
<
Constraint
>
constraints
;
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
ShakeConstraints
;
i
++
)
{
int
atom1
=
gpu
->
psShakeID
->
_pSysData
[
i
].
x
;
int
atom2
=
gpu
->
psShakeID
->
_pSysData
[
i
].
y
;
int
atom3
=
gpu
->
psShakeID
->
_pSysData
[
i
].
z
;
int
atom4
=
gpu
->
psShakeID
->
_pSysData
[
i
].
w
;
float
distance2
=
gpu
->
psShakeParameter
->
_pSysData
[
i
].
z
;
constraints
.
push_back
(
Constraint
(
atom1
,
atom2
,
distance2
));
if
(
atom3
!=
-
1
)
constraints
.
push_back
(
Constraint
(
atom1
,
atom3
,
distance2
));
if
(
atom4
!=
-
1
)
constraints
.
push_back
(
Constraint
(
atom1
,
atom3
,
distance2
));
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
settleConstraints
;
i
++
)
{
int
atom1
=
gpu
->
psSettleID
->
_pSysData
[
i
].
x
;
int
atom2
=
gpu
->
psSettleID
->
_pSysData
[
i
].
y
;
int
atom3
=
gpu
->
psSettleID
->
_pSysData
[
i
].
z
;
float
distance12
=
gpu
->
psSettleParameter
->
_pSysData
[
i
].
x
;
float
distance23
=
gpu
->
psSettleParameter
->
_pSysData
[
i
].
y
;
constraints
.
push_back
(
Constraint
(
atom1
,
atom2
,
distance12
*
distance12
));
constraints
.
push_back
(
Constraint
(
atom1
,
atom3
,
distance12
*
distance12
));
constraints
.
push_back
(
Constraint
(
atom2
,
atom3
,
distance23
*
distance23
));
}
// First make a list of every other atom to which each atom is connect by a bond or constraint.
int
numAtoms
=
gpu
->
natoms
;
vector
<
vector
<
int
>
>
atomBonds
(
numAtoms
);
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
bonds
;
i
++
)
{
int
atom1
=
gpu
->
psBondID
->
_pSysData
[
i
].
x
;
int
atom2
=
gpu
->
psBondID
->
_pSysData
[
i
].
y
;
atomBonds
[
atom1
].
push_back
(
atom2
);
atomBonds
[
atom2
].
push_back
(
atom1
);
}
for
(
int
i
=
0
;
i
<
constraints
.
size
();
i
++
)
{
int
atom1
=
constraints
[
i
].
atom1
;
int
atom2
=
constraints
[
i
].
atom2
;
atomBonds
[
atom1
].
push_back
(
atom2
);
atomBonds
[
atom2
].
push_back
(
atom1
);
}
// Now tag atoms by which molecule they belong to.
vector
<
int
>
atomMolecule
(
numAtoms
,
-
1
);
int
numMolecules
=
0
;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
if
(
atomMolecule
[
i
]
==
-
1
)
tagAtomsInMolecule
(
i
,
numMolecules
++
,
atomMolecule
,
atomBonds
);
vector
<
vector
<
int
>
>
atomIndices
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
atomIndices
[
atomMolecule
[
i
]].
push_back
(
i
);
// Construct a description of each molecule.
vector
<
Molecule
>
molecules
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
molecules
[
i
].
atoms
=
atomIndices
[
i
];
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
bonds
;
i
++
)
{
int
atom1
=
gpu
->
psBondID
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
bonds
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
bond_angles
;
i
++
)
{
int
atom1
=
gpu
->
psBondAngleID1
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
angles
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
dihedrals
;
i
++
)
{
int
atom1
=
gpu
->
psDihedralID1
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
periodicTorsions
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
rb_dihedrals
;
i
++
)
{
int
atom1
=
gpu
->
psRbDihedralID1
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
rbTorsions
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
constraints
.
size
();
i
++
)
{
molecules
[
atomMolecule
[
constraints
[
i
].
atom1
]].
constraints
.
push_back
(
i
);
}
// Sort them into groups of identical molecules.
vector
<
Molecule
>
uniqueMolecules
;
vector
<
vector
<
int
>
>
moleculeInstances
;
for
(
int
molIndex
=
0
;
molIndex
<
molecules
.
size
();
molIndex
++
)
{
Molecule
&
mol
=
molecules
[
molIndex
];
// See if it is identical to another molecule.
bool
isNew
=
true
;
for
(
int
j
=
0
;
j
<
uniqueMolecules
.
size
()
&&
isNew
;
j
++
)
{
Molecule
&
mol2
=
uniqueMolecules
[
j
];
bool
identical
=
true
;
if
(
mol
.
atoms
.
size
()
!=
mol2
.
atoms
.
size
()
||
mol
.
bonds
.
size
()
!=
mol2
.
bonds
.
size
()
||
mol
.
angles
.
size
()
!=
mol2
.
angles
.
size
()
||
mol
.
periodicTorsions
.
size
()
!=
mol2
.
periodicTorsions
.
size
()
||
mol
.
rbTorsions
.
size
()
!=
mol2
.
rbTorsions
.
size
()
||
mol
.
constraints
.
size
()
!=
mol2
.
constraints
.
size
())
identical
=
false
;
int
atomOffset
=
mol2
.
atoms
[
0
]
-
mol
.
atoms
[
0
];
float4
*
posq
=
gpu
->
psPosq4
->
_pSysData
;
float4
*
velm
=
gpu
->
psVelm4
->
_pSysData
;
float2
*
sigeps
=
gpu
->
psSigEps2
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
atoms
.
size
()
&&
identical
;
i
++
)
if
(
mol
.
atoms
[
i
]
!=
mol2
.
atoms
[
i
]
-
atomOffset
||
posq
[
mol
.
atoms
[
i
]].
w
!=
posq
[
mol2
.
atoms
[
i
]].
w
||
velm
[
mol
.
atoms
[
i
]].
w
!=
velm
[
mol2
.
atoms
[
i
]].
w
||
sigeps
[
mol
.
atoms
[
i
]].
x
!=
sigeps
[
mol2
.
atoms
[
i
]].
x
||
sigeps
[
mol
.
atoms
[
i
]].
y
!=
sigeps
[
mol2
.
atoms
[
i
]].
y
)
identical
=
false
;
int4
*
bondID
=
gpu
->
psBondID
->
_pSysData
;
float2
*
bondParam
=
gpu
->
psBondParameter
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
bonds
.
size
()
&&
identical
;
i
++
)
if
(
bondID
[
mol
.
bonds
[
i
]].
x
!=
bondID
[
mol2
.
bonds
[
i
]].
x
-
atomOffset
||
bondID
[
mol
.
bonds
[
i
]].
y
!=
bondID
[
mol2
.
bonds
[
i
]].
y
-
atomOffset
||
bondParam
[
mol
.
bonds
[
i
]].
x
!=
bondParam
[
mol2
.
bonds
[
i
]].
x
||
bondParam
[
mol
.
bonds
[
i
]].
y
!=
bondParam
[
mol2
.
bonds
[
i
]].
y
)
identical
=
false
;
int4
*
angleID
=
gpu
->
psBondAngleID1
->
_pSysData
;
float2
*
angleParam
=
gpu
->
psBondAngleParameter
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
angles
.
size
()
&&
identical
;
i
++
)
if
(
angleID
[
mol
.
angles
[
i
]].
x
!=
angleID
[
mol2
.
angles
[
i
]].
x
-
atomOffset
||
angleID
[
mol
.
angles
[
i
]].
y
!=
angleID
[
mol2
.
angles
[
i
]].
y
-
atomOffset
||
angleID
[
mol
.
angles
[
i
]].
z
!=
angleID
[
mol2
.
angles
[
i
]].
z
-
atomOffset
||
angleParam
[
mol
.
angles
[
i
]].
x
!=
angleParam
[
mol2
.
angles
[
i
]].
x
||
angleParam
[
mol
.
angles
[
i
]].
y
!=
angleParam
[
mol2
.
angles
[
i
]].
y
)
identical
=
false
;
int4
*
periodicID
=
gpu
->
psDihedralID1
->
_pSysData
;
float4
*
periodicParam
=
gpu
->
psDihedralParameter
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
periodicTorsions
.
size
()
&&
identical
;
i
++
)
if
(
periodicID
[
mol
.
periodicTorsions
[
i
]].
x
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
x
-
atomOffset
||
periodicID
[
mol
.
periodicTorsions
[
i
]].
y
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
y
-
atomOffset
||
periodicID
[
mol
.
periodicTorsions
[
i
]].
z
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
z
-
atomOffset
||
periodicID
[
mol
.
periodicTorsions
[
i
]].
w
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
w
-
atomOffset
||
periodicParam
[
mol
.
periodicTorsions
[
i
]].
x
!=
periodicParam
[
mol2
.
periodicTorsions
[
i
]].
x
||
periodicParam
[
mol
.
periodicTorsions
[
i
]].
y
!=
periodicParam
[
mol2
.
periodicTorsions
[
i
]].
y
||
periodicParam
[
mol
.
periodicTorsions
[
i
]].
z
!=
periodicParam
[
mol2
.
periodicTorsions
[
i
]].
z
)
identical
=
false
;
int4
*
rbID
=
gpu
->
psRbDihedralID1
->
_pSysData
;
float4
*
rbParam1
=
gpu
->
psRbDihedralParameter1
->
_pSysData
;
float2
*
rbParam2
=
gpu
->
psRbDihedralParameter2
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
rbTorsions
.
size
()
&&
identical
;
i
++
)
if
(
rbID
[
mol
.
rbTorsions
[
i
]].
x
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
x
-
atomOffset
||
rbID
[
mol
.
rbTorsions
[
i
]].
y
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
y
-
atomOffset
||
rbID
[
mol
.
rbTorsions
[
i
]].
z
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
z
-
atomOffset
||
rbID
[
mol
.
rbTorsions
[
i
]].
w
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
w
-
atomOffset
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
x
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
x
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
y
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
y
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
z
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
z
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
w
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
w
||
rbParam2
[
mol
.
rbTorsions
[
i
]].
x
!=
rbParam2
[
mol2
.
rbTorsions
[
i
]].
x
||
rbParam2
[
mol
.
rbTorsions
[
i
]].
y
!=
rbParam2
[
mol2
.
rbTorsions
[
i
]].
y
)
identical
=
false
;
for
(
int
i
=
0
;
i
<
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
if
(
constraints
[
mol
.
constraints
[
i
]].
atom1
!=
constraints
[
mol2
.
constraints
[
i
]].
atom1
-
atomOffset
||
constraints
[
mol
.
constraints
[
i
]].
atom2
!=
constraints
[
mol2
.
constraints
[
i
]].
atom2
-
atomOffset
||
constraints
[
mol
.
constraints
[
i
]].
distance2
!=
constraints
[
mol2
.
constraints
[
i
]].
distance2
)
identical
=
false
;
if
(
identical
)
{
moleculeInstances
[
j
].
push_back
(
mol
.
atoms
[
0
]);
isNew
=
false
;
}
}
if
(
isNew
)
{
uniqueMolecules
.
push_back
(
mol
);
moleculeInstances
.
push_back
(
vector
<
int
>
());
moleculeInstances
[
moleculeInstances
.
size
()
-
1
].
push_back
(
mol
.
atoms
[
0
]);
}
}
gpu
->
moleculeGroups
.
resize
(
moleculeInstances
.
size
());
for
(
int
i
=
0
;
i
<
moleculeInstances
.
size
();
i
++
)
{
gpu
->
moleculeGroups
[
i
].
instances
=
moleculeInstances
[
i
];
vector
<
int
>&
atoms
=
uniqueMolecules
[
i
].
atoms
;
gpu
->
moleculeGroups
[
i
].
atoms
.
resize
(
atoms
.
size
());
for
(
int
j
=
0
;
j
<
atoms
.
size
();
j
++
)
gpu
->
moleculeGroups
[
i
].
atoms
[
j
]
=
atoms
[
j
]
-
atoms
[
0
];
}
}
extern
"C"
void
gpuReorderAtoms
(
gpuContext
gpu
)
{
if
(
gpu
->
natoms
==
0
||
gpu
->
sim
.
nonbondedCutoffSqr
==
0.0
)
return
;
if
(
gpu
->
moleculeGroups
.
size
()
==
0
)
findMoleculeGroups
(
gpu
);
// Find the range of positions and the number of bins along each axis.
int
numAtoms
=
gpu
->
natoms
;
gpu
->
psPosq4
->
Download
();
gpu
->
psVelm4
->
Download
();
float4
*
posq
=
gpu
->
psPosq4
->
_pSysData
;
float4
*
velm
=
gpu
->
psVelm4
->
_pSysData
;
float
minx
=
posq
[
0
].
x
,
maxx
=
posq
[
0
].
x
;
float
miny
=
posq
[
0
].
y
,
maxy
=
posq
[
0
].
y
;
float
minz
=
posq
[
0
].
z
,
maxz
=
posq
[
0
].
z
;
if
(
gpu
->
sim
.
nonbondedMethod
==
PERIODIC
)
{
minx
=
miny
=
minz
=
0.0
;
maxx
=
gpu
->
sim
.
periodicBoxSizeX
;
maxy
=
gpu
->
sim
.
periodicBoxSizeY
;
maxz
=
gpu
->
sim
.
periodicBoxSizeZ
;
}
else
{
for
(
int
i
=
1
;
i
<
numAtoms
;
i
++
)
{
minx
=
min
(
minx
,
posq
[
i
].
x
);
maxx
=
max
(
maxx
,
posq
[
i
].
x
);
miny
=
min
(
miny
,
posq
[
i
].
y
);
maxy
=
max
(
maxy
,
posq
[
i
].
y
);
minz
=
min
(
minz
,
posq
[
i
].
z
);
maxz
=
max
(
maxz
,
posq
[
i
].
z
);
}
}
float
binWidth
=
0.2
*
sqrt
(
gpu
->
sim
.
nonbondedCutoffSqr
);
int
xbins
=
1
+
(
int
)
((
maxx
-
minx
)
/
binWidth
);
int
ybins
=
1
+
(
int
)
((
maxy
-
miny
)
/
binWidth
);
int
zbins
=
1
+
(
int
)
((
maxz
-
minz
)
/
binWidth
);
// Loop over each group of identical molecules and reorder them.
vector
<
int
>
originalIndex
(
numAtoms
);
vector
<
float4
>
newPosq
(
numAtoms
);
vector
<
float4
>
newVelm
(
numAtoms
);
for
(
int
group
=
0
;
group
<
gpu
->
moleculeGroups
.
size
();
group
++
)
{
// Find the center of each molecule.
gpuMoleculeGroup
&
mol
=
gpu
->
moleculeGroups
[
group
];
int
numMolecules
=
mol
.
instances
.
size
();
vector
<
int
>&
atoms
=
mol
.
atoms
;
vector
<
float3
>
molPos
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
molPos
[
i
].
x
=
0.0
f
;
molPos
[
i
].
y
=
0.0
f
;
molPos
[
i
].
z
=
0.0
f
;
for
(
int
j
=
0
;
j
<
atoms
.
size
();
j
++
)
{
int
atom
=
atoms
[
j
]
+
mol
.
instances
[
i
];
molPos
[
i
].
x
+=
posq
[
atom
].
x
;
molPos
[
i
].
y
+=
posq
[
atom
].
y
;
molPos
[
i
].
z
+=
posq
[
atom
].
z
;
}
molPos
[
i
].
x
/=
atoms
.
size
();
molPos
[
i
].
y
/=
atoms
.
size
();
molPos
[
i
].
z
/=
atoms
.
size
();
}
if
(
gpu
->
sim
.
nonbondedMethod
==
PERIODIC
)
{
// Move each molecule position into the same box.
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
molPos
[
i
].
x
-=
floor
(
molPos
[
i
].
x
/
gpu
->
sim
.
periodicBoxSizeX
)
*
gpu
->
sim
.
periodicBoxSizeX
;
molPos
[
i
].
y
-=
floor
(
molPos
[
i
].
y
/
gpu
->
sim
.
periodicBoxSizeY
)
*
gpu
->
sim
.
periodicBoxSizeY
;
molPos
[
i
].
z
-=
floor
(
molPos
[
i
].
z
/
gpu
->
sim
.
periodicBoxSizeZ
)
*
gpu
->
sim
.
periodicBoxSizeZ
;
}
}
// Select a bin for each molecule, then sort them by bin.
vector
<
pair
<
int
,
int
>
>
molBins
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
int
x
=
(
int
)
((
molPos
[
i
].
x
-
minx
)
/
binWidth
);
int
y
=
(
int
)
((
molPos
[
i
].
y
-
miny
)
/
binWidth
);
int
z
=
(
int
)
((
molPos
[
i
].
z
-
minz
)
/
binWidth
);
int
yodd
=
y
&
1
;
int
zodd
=
z
&
1
;
int
bin
=
z
*
xbins
*
ybins
;
bin
+=
(
zodd
?
ybins
-
y
:
y
)
*
xbins
;
bin
+=
(
yodd
?
xbins
-
x
:
x
);
molBins
[
i
]
=
pair
<
int
,
int
>
(
bin
,
i
);
}
sort
(
molBins
.
begin
(),
molBins
.
end
());
// Reorder the atoms.
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
for
(
int
j
=
0
;
j
<
atoms
.
size
();
j
++
)
{
int
oldIndex
=
mol
.
instances
[
molBins
[
i
].
second
]
+
atoms
[
j
];
int
newIndex
=
mol
.
instances
[
i
]
+
atoms
[
j
];
originalIndex
[
newIndex
]
=
gpu
->
psAtomIndex
->
_pSysStream
[
0
][
oldIndex
];
newPosq
[
newIndex
]
=
posq
[
oldIndex
];
newVelm
[
newIndex
]
=
velm
[
oldIndex
];
}
}
}
// Update the streams.
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
posq
[
i
]
=
newPosq
[
i
];
gpu
->
psPosq4
->
Upload
();
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
velm
[
i
]
=
newVelm
[
i
];
gpu
->
psVelm4
->
Upload
();
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
gpu
->
psAtomIndex
->
_pSysData
[
i
]
=
originalIndex
[
i
];
gpu
->
psAtomIndex
->
Upload
();
}
platforms/cuda/src/kernels/gputypes.h
View file @
1010df33
...
...
@@ -33,14 +33,20 @@
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include "cudpp.h"
#include <vector>
struct
gpuAtomType
{
string
name
;
std
::
string
name
;
char
symbol
;
float
r
;
};
struct
gpuMoleculeGroup
{
std
::
vector
<
int
>
atoms
;
std
::
vector
<
int
>
instances
;
};
enum
SM_VERSION
{
SM_10
,
...
...
@@ -61,8 +67,9 @@ struct _gpuContext {
int
gAtomTypes
;
cudaGmxSimulation
sim
;
unsigned
int
*
pOutputBufferCounter
;
unsigned
int
*
pE
xclusion
;
std
::
vector
<
std
::
vector
<
int
>
>
e
xclusion
s
;
unsigned
char
*
pAtomSymbol
;
std
::
vector
<
gpuMoleculeGroup
>
moleculeGroups
;
float
iterations
;
float
epsfac
;
float
solventDielectric
;
...
...
@@ -70,9 +77,12 @@ struct _gpuContext {
int
grid
;
bool
bCalculateCM
;
bool
bRemoveCM
;
bool
bRecalculateBornRadii
;
bool
bRecalculateBornRadii
;
bool
bOutputBufferPerWarp
;
bool
bIncludeGBSA
;
unsigned
long
seed
;
SM_VERSION
sm_version
;
CUDPPHandle
cudpp
;
CUDAStream
<
float4
>*
psPosq4
;
CUDAStream
<
float4
>*
psPosqP4
;
CUDAStream
<
float4
>*
psOldPosq4
;
...
...
@@ -103,15 +113,21 @@ struct _gpuContext {
CUDAStream
<
int
>*
psNonShakeID
;
CUDAStream
<
int4
>*
psShakeID
;
CUDAStream
<
float4
>*
psShakeParameter
;
CUDAStream
<
int4
>*
psSettleID
;
CUDAStream
<
float2
>*
psSettleParameter
;
CUDAStream
<
unsigned
int
>*
psExclusion
;
CUDAStream
<
unsigned
int
>*
psWorkUnit
;
CUDAStream
<
unsigned
int
>*
psInteractingWorkUnit
;
CUDAStream
<
unsigned
int
>*
psInteractionFlag
;
CUDAStream
<
size_t
>*
psInteractionCount
;
CUDAStream
<
float4
>*
psRandom4
;
// Pointer to sets of 4 random numbers for MD integration
CUDAStream
<
float2
>*
psRandom2
;
// Pointer to sets of 2 random numbers for MD integration
CUDAStream
<
uint4
>*
psRandomSeed
;
// Pointer to each random seed
CUDAStream
<
int
>*
psRandomPosition
;
// Pointer to random number positions
CUDAStream
<
float4
>*
psLinearMomentum
;
// Pointer to total linear momentum per CTA
CUDAStream
<
int
>*
psAtomIndex
;
// The original index of each atom
CUDAStream
<
float4
>*
psGridBoundingBox
;
// The size of each grid cell
CUDAStream
<
float4
>*
psGridCenter
;
// The center and radius for each grid cell
};
typedef
struct
_gpuContext
*
gpuContext
;
...
...
@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q1
,
const
std
::
vector
<
float
>&
q2
);
extern
"C"
float
gpuGetAtomicRadius
(
gpuContext
gpu
,
string
s
);
float
gpuGetAtomicRadius
(
gpuContext
gpu
,
std
::
string
s
);
extern
"C"
unsigned
char
gpuGetAtomicSymbol
(
gpuContext
gpu
,
string
s
);
unsigned
char
gpuGetAtomicSymbol
(
gpuContext
gpu
,
std
::
string
s
);
extern
"C"
int
gpuReadAtomicParameters
(
gpuContext
gpu
,
char
*
fname
);
...
...
@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern
"C"
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q
,
const
std
::
vector
<
char
>&
symbol
,
const
std
::
vector
<
vector
<
int
>
>&
exclusions
);
const
std
::
vector
<
char
>&
symbol
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusions
,
CudaNonbondedMethod
method
);
extern
"C"
void
gpuSetNonbondedCutoff
(
gpuContext
gpu
,
float
cutoffDistance
,
float
solventDielectric
);
extern
"C"
void
gpuSetPeriodicBoxSize
(
gpuContext
gpu
,
float
xsize
,
float
ysize
,
float
zsize
);
extern
"C"
void
gpuSetObcParameters
(
gpuContext
gpu
,
float
innerDielectric
,
float
solventDielectric
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
radius
,
const
std
::
vector
<
float
>&
scale
);
...
...
@@ -227,7 +249,7 @@ extern "C"
int
gpuBuildThreadBlockWorkList
(
gpuContext
gpu
);
extern
"C"
int
gpuBuildExclusionList
(
gpuContext
gpu
);
void
gpuBuildExclusionList
(
gpuContext
gpu
);
extern
"C"
int
gpuSetConstants
(
gpuContext
gpu
);
...
...
@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
extern
"C"
void
gpuDumpObcLoop1
(
gpuContext
gpu
);
extern
"C"
void
gpuReorderAtoms
(
gpuContext
gpu
);
#endif //__GPUTYPES_H__
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
View file @
1010df33
...
...
@@ -54,335 +54,120 @@ struct Atom {
float
fx
;
float
fy
;
float
fz
;
float
eps2
;
float
sig2
;
};
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCDLJForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
void
kCalculateCDLJForces_kernel
()
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJForces.h"
__global__
extern
void
kCalculateCDLJCutoffForces_12_kernel
();
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
// printf("kCalculateCDLJCutoffForces\n");
CUDPPResult
result
;
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float4
apos
;
// Local atom x, y, z, q
float3
af
;
// Local atom fx, fy, fz
float
dx
;
float
dy
;
float
dz
;
float
r2
;
float
invR
;
float
sig
;
float
sig2
;
float
sig6
;
float
eps
;
float
dEdR
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
a
.
x
+
psA
[
j
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
j
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
else
// 100% utilization
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
kCalculateCDLJN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
LAUNCHERROR
(
"kCalculateCDLJN2Forces"
);
break
;
case
CUTOFF
:
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
a
.
x
+
psA
[
tj
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
psA
[
tgx
].
sig2
+
psA
[
j
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
psA
[
tgx
].
eps2
*
psA
[
j
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
else
// 100% utilization
gpu
->
psInteractionCount
->
Download
();
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJCutoffByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateCDLJCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
LAUNCHERROR
(
"kCalculateCDLJCutoffForces"
);
break
;
case
PERIODIC
:
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
psA
[
tgx
].
sig2
+
psA
[
tj
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
psA
[
tgx
].
eps2
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
}
pos
-=
cSim
.
nonbond_workBlock
;
gpu
->
psInteractionCount
->
Download
();
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJPeriodicByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateCDLJPeriodicForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
LAUNCHERROR
(
"kCalculateCDLJPeriodicForces"
);
}
}
__global__
extern
void
kCalculateCDLJForces_12_kernel
();
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
// printf("kCalculateCDLJForces\n");
if
(
gpu
->
sm_version
<
SM_12
)
kCalculateCDLJForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
else
kCalculateCDLJForces_12_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
LAUNCHERROR
(
"kCalculateCDLJForces"
);
}
\ No newline at end of file
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
View file @
1010df33
...
...
@@ -33,9 +33,6 @@
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
...
...
@@ -54,15 +51,8 @@ struct Atom {
float
fy
;
float
fz
;
float
fb
;
float
q2
;
float
junk
;
};
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
)
...
...
@@ -79,376 +69,122 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
void
kCalculateCDLJObcGbsaForces1_kernel
()
// Include versions of the kernel for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
extern
__global__
void
kFindBlockBoundsCutoff_kernel
();
extern
__global__
void
kFindBlockBoundsPeriodic_kernel
();
extern
__global__
void
kFindBlocksWithInteractionsCutoff_kernel
();
extern
__global__
void
kFindBlocksWithInteractionsPeriodic_kernel
();
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
)
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// printf("kCalculateCDLJObcGbsaForces1\n");
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
float
br
=
cSim
.
pBornRadii
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// check if Born radii need to be calculated
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
kClearBornForces
(
gpu
);
CUDPPResult
result
;
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bRecalculateBornRadii
)
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
kCalculateCDLJObcGbsaN2Forces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
LAUNCHERROR
(
"kCalculateCDLJObcGbsaN2Forces1"
);
break
;
case
CUTOFF
:
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
}
pos
-=
cSim
.
nonbond_workBlock
;
gpu
->
psInteractionCount
->
Download
();
if
(
gpu
->
bRecalculateBornRadii
)
{
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateCDLJObcGbsaCutoffForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
LAUNCHERROR
(
"kCalculateCDLJCutoffForces"
);
break
;
case
PERIODIC
:
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
gpu
->
psInteractionCount
->
Download
();
if
(
gpu
->
bRecalculateBornRadii
)
{
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateCDLJObcGbsaPeriodicForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
LAUNCHERROR
(
"kCalculateCDLJPeriodicForces"
);
break
;
}
}
__global__
extern
void
kCalculateCDLJObcGbsaForces1_12_kernel
();
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
)
{
//printf("In kCalculateCDLJObcGbsaForces1 QQQ\n");
// check if Born radii need to be calculated
if
(
gpu
->
bRecalculateBornRadii
){
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
if
(
gpu
->
sm_version
<
SM_12
)
kCalculateCDLJObcGbsaForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
else
kCalculateCDLJObcGbsaForces1_12_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
if
(
0
){
static
int
step
=
0
;
// int numPrint = -1;
step
++
;
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
//gpuDumpCoordinates( gpu );
kReduceBornSumAndForces
(
gpu
);
gpuDumpObcLoop1
(
gpu
);
}
LAUNCHERROR
(
"kCalculateCDLJObcGbsaForces1"
);
}
platforms/cuda/src/kernels/kCalculateLocalForces.cu
View file @
1010df33
...
...
@@ -440,48 +440,154 @@ __global__ void kCalculateLocalForces_kernel()
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
if
(
cSim
.
nonbondedMethod
==
NO_CUTOFF
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
else
if
(
cSim
.
nonbondedMethod
==
CUTOFF
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
(
inverseR
-
2.0
f
*
cSim
.
reactionFieldK
*
r2
);
dEdR
*=
inverseR
*
inverseR
;
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0.0
f
;
}
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
else
if
(
cSim
.
nonbondedMethod
==
PERIODIC
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
d
.
x
-=
floor
(
d
.
x
/
cSim
.
periodicBoxSizeX
+
0.5
f
)
*
cSim
.
periodicBoxSizeX
;
d
.
y
-=
floor
(
d
.
x
/
cSim
.
periodicBoxSizeY
+
0.5
f
)
*
cSim
.
periodicBoxSizeY
;
d
.
z
-=
floor
(
d
.
x
/
cSim
.
periodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
(
inverseR
-
2.0
f
*
cSim
.
reactionFieldK
*
r2
);
dEdR
*=
inverseR
*
inverseR
;
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0.0
f
;
}
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
}
...
...
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
View file @
1010df33
...
...
@@ -53,10 +53,6 @@ struct Atom {
float
junk
;
};
__shared__
Atom
sA
[
GT2XX_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
GT2XX_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
)
...
...
@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
__global__
void
kClearObcGbsaBornSum_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
stride
*
cSim
.
nonbondOutputBuffers
)
{
((
float
*
)
cSim
.
pBornSum
)[
pos
]
=
0.0
f
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
__global__
void
kReduceObcGbsaBornSum_kernel
()
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
...
...
@@ -127,175 +167,40 @@ if( 0 ){
LAUNCHERROR
(
"kReduceObcGbsaBornSum"
);
}
__global__
void
kCalculateObcGbsaBornSum_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
(
blockIdx
.
x
*
cSim
.
workUnits
)
/
gridDim
.
x
;
int
end
=
((
blockIdx
.
x
+
1
)
*
cSim
.
workUnits
)
/
gridDim
.
x
;
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
-
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float
dx
;
float
dy
;
float
dz
;
float
r2
;
float
r
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
r
=
ar
.
x
;
sA
[
threadIdx
.
x
].
sr
=
ar
.
y
;
apos
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r
=
sqrt
(
r2
);
float
rInverse
=
1.0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
if
((
j
!=
tgx
)
&&
(
ar
.
x
<
rScaledRadiusJ
))
{
float
l_ij
=
1.0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
apos
.
w
+=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
psA
[
j
].
sr
*
psA
[
j
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
ar
.
x
<
(
psA
[
j
].
r
-
r
))
{
apos
.
w
+=
2.0
f
*
((
1.0
f
/
ar
.
x
)
-
l_ij
);
}
}
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sum
=
apos
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r
=
sqrt
(
r2
);
float
rInverse
=
1.0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
if
(
ar
.
x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
psA
[
tj
].
sr
*
psA
[
tj
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
ar
.
x
<
(
psA
[
tj
].
sr
-
r
))
{
term
+=
2.0
f
*
((
1.0
f
/
ar
.
x
)
-
l_ij
);
}
apos
.
w
+=
term
;
}
float
rScaledRadiusI
=
r
+
ar
.
y
;
if
(
psA
[
tj
].
r
<
rScaledRadiusI
)
{
float
l_ij
=
1.0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
ar
.
y
));
float
u_ij
=
1.0
f
/
rScaledRadiusI
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
ar
.
y
*
ar
.
y
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
psA
[
tj
].
r
<
(
ar
.
y
-
r
))
{
term
+=
2.0
f
*
((
1.0
f
/
psA
[
tj
].
r
)
-
l_ij
);
}
psA
[
tj
].
sum
+=
term
;
}
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
sA
[
threadIdx
.
x
].
sum
;
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
void
kCalculateObcGbsaBornSum
(
gpuContext
gpu
)
{
// printf("kCalculateObcgbsaBornSum\n");
kCalculateObcGbsaBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
kClearObcGbsaBornSum_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
LAUNCHERROR
(
"kClearBornSum"
);
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
kCalculateObcGbsaN2BornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
break
;
case
CUTOFF
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaCutoffByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaCutoffBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
case
PERIODIC
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaPeriodicBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
}
LAUNCHERROR
(
"kCalculateBornSum"
);
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
View file @
1010df33
...
...
@@ -52,18 +52,9 @@ struct Atom {
float
fy
;
float
fz
;
float
fb
;
// float sum;
// float oneOverR;
int
pos
;
int
wx
;
int
wy
;
};
__shared__
Atom
sA
[
G8X_BORNFORCE2_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
)
...
...
@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
void
kCalculateObcGbsaForces2_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
bf2WorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
bf2WorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
bf2WorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
bf2WorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
sA
[
threadIdx
.
x
].
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
sA
[
threadIdx
.
x
].
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
sA
[
threadIdx
.
x
].
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pObcData
[
i
];
float
fb
=
cSim
.
pBornForce
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
sA
[
threadIdx
.
x
].
wx
=
x
;
sA
[
threadIdx
.
x
].
wy
=
y
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
// float sum = 0.0f;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
// float oneOverR = 1.0f / a.x;
sA
[
threadIdx
.
x
].
r
=
a
.
x
;
sA
[
threadIdx
.
x
].
sr
=
a
.
y
;
sA
[
threadIdx
.
x
].
sr2
=
a
.
y
*
a
.
y
;
sA
[
threadIdx
.
x
].
fb
=
fb
;
// Include versions of the kernels for N^2 calculations.
for
(
unsigned
int
j
=
sNext
[
tgx
];
j
!=
tgx
;
j
=
sNext
[
j
])
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
// Atom I Born forces and sum
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
float
l_ij
=
1.0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
rInverse
=
1.0
f
/
r
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
r2Inverse
=
rInverse
*
rInverse
;
float
t1
=
log
(
u_ij
/
l_ij
);
float
t2
=
(
l_ij2
-
u_ij2
);
float
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
float
term
=
0.125
f
*
(
1.000
f
+
psA
[
j
].
sr2
*
r2Inverse
)
*
t3
+
0.250
f
*
t1
*
r2Inverse
;
float
dE
=
fb
*
term
;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if
(
a
.
x
>=
rScaledRadiusJ
)
{
dE
=
/*term =*/
0.0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
j
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
j
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
j
].
fz
+=
d
;
// sum += term;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
float4
of
;
of
.
x
=
af
.
x
+
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
af
.
y
+
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
af
.
z
+
sA
[
threadIdx
.
x
].
fz
;
of
.
w
=
0.0
f
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
sA
[
threadIdx
.
x
].
fb
=
cSim
.
pBornForce
[
j
];
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float
sr2
=
a
.
y
*
a
.
y
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sr2
=
temp1
.
y
*
temp1
.
y
;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaForces2.h"
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
// Atom I Born Forces and sum
float
r2Inverse
=
1.0
f
/
r2
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
float
rInverse
=
1.0
f
/
r
;
float
l_ij
=
1.0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
t1
=
log
(
u_ij
/
l_ij
);
float
t2
=
(
l_ij2
-
u_ij2
);
float
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
float
term
=
0.125
f
*
(
1.000
f
+
psA
[
tj
].
sr2
*
r2Inverse
)
*
t3
+
0.250
f
*
t1
*
r2Inverse
;
float
dE
=
fb
*
term
;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
if
(
a
.
x
>=
rScaledRadiusJ
)
{
dE
=
/*term =*/
0.0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
tj
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
tj
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
tj
].
fz
+=
d
;
// sum += term;
// Atom J Born Forces and sum
float
rScaledRadiusI
=
r
+
a
.
y
;
l_ij
=
1.0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
a
.
y
));
u_ij
=
1.0
f
/
rScaledRadiusI
;
l_ij2
=
l_ij
*
l_ij
;
u_ij2
=
u_ij
*
u_ij
;
t1
=
log
(
u_ij
/
l_ij
);
t2
=
(
l_ij2
-
u_ij2
);
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
term
=
0.125
f
*
(
1.000
f
+
sr2
*
r2Inverse
)
*
t3
+
0.250
f
*
t1
*
r2Inverse
;
dE
=
psA
[
tj
].
fb
*
term
;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
if
(
psA
[
tj
].
r
>=
rScaledRadiusI
)
{
dE
=
/*term =*/
0.0
f
;
}
dx
*=
dE
;
dy
*=
dE
;
dz
*=
dE
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
// psA[tj].sum += term;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
sA
[
threadIdx
.
x
].
wx
+
tgx
+
(
sA
[
threadIdx
.
x
].
wy
>>
GRIDBITS
)
*
cSim
.
stride
;
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sum;
offset
=
sA
[
threadIdx
.
x
].
wy
+
tgx
+
(
sA
[
threadIdx
.
x
].
wx
>>
GRIDBITS
)
*
cSim
.
stride
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
sA
[
threadIdx
.
x
].
pos
-=
cSim
.
bornForce2_workBlock
;
}
}
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaForces2.h"
__global__
extern
void
kCalculateObcGbsaForces2_12_kernel
();
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaForces2.h"
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
)
{
//printf("kCalculateObcGbsaForces2\n");
if
(
gpu
->
sm_version
<
SM_12
)
kCalculateObcGbsaForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
>>>
();
else
kCalculateObcGbsaForces2_12_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
>>>
();
if
(
0
){
static
int
step
=
0
;
//int numPrint = -1;
step
++
;
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
//gpuDumpCoordinates( gpu );
kReduceBornSumAndForces
(
gpu
);
gpuDumpObcLoop1
(
gpu
);
}
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
kCalculateObcGbsaN2Forces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
break
;
case
CUTOFF
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaCutoffByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaCutoffForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
case
PERIODIC
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaPeriodicForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
}
LAUNCHERROR
(
"kCalculateObcGbsaForces2"
);
}
platforms/cuda/src/kernels/kForces.cu
View file @
1010df33
...
...
@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
__global__
void
kClearForces_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
stride
4
*
cSim
.
outputBuffers
)
while
(
pos
<
cSim
.
stride
*
cSim
.
outputBuffers
)
{
((
float
*
)
cSim
.
pForce4
)
[
pos
]
=
0.0
f
;
cSim
.
pForce4
[
pos
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
)
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
...
...
platforms/cuda/src/kernels/kVerletUpdate.cu
View file @
1010df33
...
...
@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
__global__
void
kVerletUpdatePart1_kernel
()
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
{
...
...
@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
__global__
void
kVerletUpdatePart2_kernel
()
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
{
...
...
@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
extern
__shared__
float3
sCM
[];
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
float3
CM
=
{
0.0
f
,
0.0
f
,
0.0
f
};
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
{
...
...
platforms/reference/src/ReferenceKernels.cpp
View file @
1010df33
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors.
*
* Portions copyright (c) 2008
-2009
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
obcParameters
->
setScaledRadiusFactors
(
scaleFactors
);
obcParameters
->
setSolventDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSolventDielectric
())
);
obcParameters
->
setSoluteDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSoluteDielectric
())
);
// If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
for
(
int
i
=
0
;
i
<
system
.
getNumForces
();
i
++
)
{
const
NonbondedForce
*
nonbonded
=
dynamic_cast
<
const
NonbondedForce
*>
(
&
system
.
getForce
(
i
));
if
(
nonbonded
!=
NULL
)
{
if
(
nonbonded
->
getNonbondedMethod
()
!=
NonbondedForce
::
NoCutoff
)
obcParameters
->
setUseCutoff
(
nonbonded
->
getCutoffDistance
());
if
(
nonbonded
->
getNonbondedMethod
()
==
NonbondedForce
::
CutoffPeriodic
)
{
Vec3
boxVectors
[
3
];
nonbonded
->
getPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
RealOpenMM
periodicBoxSize
[
3
];
periodicBoxSize
[
0
]
=
(
RealOpenMM
)
boxVectors
[
0
][
0
];
periodicBoxSize
[
1
]
=
(
RealOpenMM
)
boxVectors
[
1
][
1
];
periodicBoxSize
[
2
]
=
(
RealOpenMM
)
boxVectors
[
2
][
2
];
obcParameters
->
setPeriodic
(
periodicBoxSize
);
}
break
;
}
}
obc
=
new
CpuObc
(
obcParameters
);
obc
->
setIncludeAceApproximation
(
true
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment