Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
61d5cc0f
Commit
61d5cc0f
authored
Apr 16, 2015
by
Peter
Browse files
Merge branch 'master' into applecl
parents
e2999354
afae4bc8
Changes
248
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
772 additions
and
607 deletions
+772
-607
platforms/cuda/src/CudaArray.cpp
platforms/cuda/src/CudaArray.cpp
+3
-3
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+167
-27
platforms/cuda/src/CudaExpressionUtilities.cpp
platforms/cuda/src/CudaExpressionUtilities.cpp
+12
-6
platforms/cuda/src/CudaIntegrationUtilities.cpp
platforms/cuda/src/CudaIntegrationUtilities.cpp
+42
-142
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+213
-42
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+22
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+79
-46
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+7
-16
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+2
-3
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+7
-8
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+24
-31
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+25
-32
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+8
-8
platforms/cuda/src/kernels/customManyParticle.cu
platforms/cuda/src/kernels/customManyParticle.cu
+11
-19
platforms/cuda/src/kernels/customNonbondedGroups.cu
platforms/cuda/src/kernels/customNonbondedGroups.cu
+2
-4
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+18
-29
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+16
-35
platforms/cuda/src/kernels/langevin.cu
platforms/cuda/src/kernels/langevin.cu
+1
-1
platforms/cuda/src/kernels/monteCarloBarostat.cu
platforms/cuda/src/kernels/monteCarloBarostat.cu
+5
-8
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+108
-124
No files found.
Too many changes to show.
To preserve performance only
248 of 248+
files are displayed.
Plain diff
Email patch
platforms/cuda/src/CudaArray.cpp
View file @
61d5cc0f
...
...
@@ -58,7 +58,7 @@ void CudaArray::upload(const void* data, bool blocking) {
if
(
blocking
)
result
=
cuMemcpyHtoD
(
pointer
,
data
,
size
*
elementSize
);
else
result
=
cuMemcpyHtoDAsync
(
pointer
,
data
,
size
*
elementSize
,
0
);
result
=
cuMemcpyHtoDAsync
(
pointer
,
data
,
size
*
elementSize
,
context
.
getCurrentStream
()
);
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error uploading array "
<<
name
<<
": "
<<
CudaContext
::
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
...
...
@@ -71,7 +71,7 @@ void CudaArray::download(void* data, bool blocking) const {
if
(
blocking
)
result
=
cuMemcpyDtoH
(
data
,
pointer
,
size
*
elementSize
);
else
result
=
cuMemcpyDtoHAsync
(
data
,
pointer
,
size
*
elementSize
,
0
);
result
=
cuMemcpyDtoHAsync
(
data
,
pointer
,
size
*
elementSize
,
context
.
getCurrentStream
()
);
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error downloading array "
<<
name
<<
": "
<<
CudaContext
::
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
...
...
@@ -82,7 +82,7 @@ void CudaArray::download(void* data, bool blocking) const {
void
CudaArray
::
copyTo
(
CudaArray
&
dest
)
const
{
if
(
dest
.
getSize
()
!=
size
||
dest
.
getElementSize
()
!=
elementSize
)
throw
OpenMMException
(
"Error copying array "
+
name
+
" to "
+
dest
.
getName
()
+
": The destination array does not match the size of the array"
);
CUresult
result
=
cuMemcpyDtoDAsync
(
dest
.
getDevicePointer
(),
pointer
,
size
*
elementSize
,
0
);
CUresult
result
=
cuMemcpyDtoDAsync
(
dest
.
getDevicePointer
(),
pointer
,
size
*
elementSize
,
context
.
getCurrentStream
()
);
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error copying array "
<<
name
<<
" to "
<<
dest
.
getName
()
<<
": "
<<
CudaContext
::
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
...
...
platforms/cuda/src/CudaContext.cpp
View file @
61d5cc0f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -33,6 +33,7 @@
#include "CudaBondedUtilities.h"
#include "CudaForceInfo.h"
#include "CudaIntegrationUtilities.h"
#include "CudaKernels.h"
#include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h"
#include "SHA1.h"
...
...
@@ -72,10 +73,19 @@ const int CudaContext::TileSize = sizeof(tileflags)*8;
bool
CudaContext
::
hasInitializedCuda
=
false
;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
const
string
&
tempDir
,
const
std
::
string
&
hostCompiler
,
CudaPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
stepsSinceReorder
(
99999
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
pinnedBuffer
(
NULL
),
posq
(
NULL
),
posqCorrection
(
NULL
),
velm
(
NULL
),
force
(
NULL
),
energyBuffer
(
NULL
),
integration
(
NULL
),
expression
(
NULL
),
bonded
(
NULL
),
nonbonded
(
NULL
),
thread
(
NULL
)
{
const
string
&
tempDir
,
const
std
::
string
&
hostCompiler
,
CudaPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
currentStream
(
0
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
stepsSinceReorder
(
99999
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
hasCompilerKernel
(
false
),
pinnedBuffer
(
NULL
),
posq
(
NULL
),
posqCorrection
(
NULL
),
velm
(
NULL
),
force
(
NULL
),
energyBuffer
(
NULL
),
integration
(
NULL
),
expression
(
NULL
),
bonded
(
NULL
),
nonbonded
(
NULL
),
thread
(
NULL
)
{
this
->
compiler
=
"
\"
"
+
compiler
+
"
\"
"
;
if
(
platformData
.
context
!=
NULL
)
{
try
{
compilerKernel
=
platformData
.
context
->
getPlatform
().
createKernel
(
CudaCompilerKernel
::
Name
(),
*
platformData
.
context
);
hasCompilerKernel
=
true
;
}
catch
(...)
{
// The runtime compiler plugin isn't available.
}
}
if
(
hostCompiler
.
size
()
>
0
)
this
->
compiler
=
compiler
+
" --compiler-bindir "
+
hostCompiler
;
if
(
!
hasInitializedCuda
)
{
...
...
@@ -136,6 +146,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
this
->
deviceIndex
=
deviceIndex
;
int
major
,
minor
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// CUDA 7.0 is released.
if
(
major
==
5
)
minor
=
0
;
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
computeCapability
=
major
+
0.1
*
minor
;
if
((
useDoublePrecision
||
useMixedPrecision
)
&&
computeCapability
<
1.3
)
...
...
@@ -149,6 +164,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
if
(
contextIndex
>
0
)
{
int
canAccess
;
cuDeviceCanAccessPeer
(
&
canAccess
,
getDevice
(),
platformData
.
contexts
[
0
]
->
getDevice
());
if
(
canAccess
)
{
platformData
.
contexts
[
0
]
->
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
getContext
(),
0
));
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
platformData
.
contexts
[
0
]
->
getContext
(),
0
));
}
}
numAtoms
=
system
.
getNumParticles
();
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
...
...
@@ -218,6 +243,66 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines
[
"ERF"
]
=
useDoublePrecision
?
"erf"
:
"erff"
;
compilationDefines
[
"ERFC"
]
=
useDoublePrecision
?
"erfc"
:
"erfcf"
;
// Set defines for applying periodic boundary conditions.
Vec3
boxVectors
[
3
];
system
.
getDefaultPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
boxIsTriclinic
=
(
boxVectors
[
0
][
1
]
!=
0.0
||
boxVectors
[
0
][
2
]
!=
0.0
||
boxVectors
[
1
][
0
]
!=
0.0
||
boxVectors
[
1
][
2
]
!=
0.0
||
boxVectors
[
2
][
0
]
!=
0.0
||
boxVectors
[
2
][
1
]
!=
0.0
);
if
(
boxIsTriclinic
)
{
compilationDefines
[
"APPLY_PERIODIC_TO_DELTA(delta)"
]
=
"{"
"real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f);
\\\n
"
"delta.x -= scale3*periodicBoxVecZ.x;
\\\n
"
"delta.y -= scale3*periodicBoxVecZ.y;
\\\n
"
"delta.z -= scale3*periodicBoxVecZ.z;
\\\n
"
"real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f);
\\\n
"
"delta.x -= scale2*periodicBoxVecY.x;
\\\n
"
"delta.y -= scale2*periodicBoxVecY.y;
\\\n
"
"real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f);
\\\n
"
"delta.x -= scale1*periodicBoxVecX.x;}"
;
compilationDefines
[
"APPLY_PERIODIC_TO_POS(pos)"
]
=
"{"
"real scale3 = floor(pos.z*invPeriodicBoxSize.z);
\\\n
"
"pos.x -= scale3*periodicBoxVecZ.x;
\\\n
"
"pos.y -= scale3*periodicBoxVecZ.y;
\\\n
"
"pos.z -= scale3*periodicBoxVecZ.z;
\\\n
"
"real scale2 = floor(pos.y*invPeriodicBoxSize.y);
\\\n
"
"pos.x -= scale2*periodicBoxVecY.x;
\\\n
"
"pos.y -= scale2*periodicBoxVecY.y;
\\\n
"
"real scale1 = floor(pos.x*invPeriodicBoxSize.x);
\\\n
"
"pos.x -= scale1*periodicBoxVecX.x;}"
;
compilationDefines
[
"APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"
]
=
"{"
"real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f);
\\\n
"
"pos.x -= scale3*periodicBoxVecZ.x;
\\\n
"
"pos.y -= scale3*periodicBoxVecZ.y;
\\\n
"
"pos.z -= scale3*periodicBoxVecZ.z;
\\\n
"
"real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f);
\\\n
"
"pos.x -= scale2*periodicBoxVecY.x;
\\\n
"
"pos.y -= scale2*periodicBoxVecY.y;
\\\n
"
"real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f);
\\\n
"
"pos.x -= scale1*periodicBoxVecX.x;}"
;
}
else
{
compilationDefines
[
"APPLY_PERIODIC_TO_DELTA(delta)"
]
=
"{"
"delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
\\\n
"
"delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
\\\n
"
"delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}"
;
compilationDefines
[
"APPLY_PERIODIC_TO_POS(pos)"
]
=
"{"
"pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
\\\n
"
"pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
\\\n
"
"pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}"
;
compilationDefines
[
"APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"
]
=
"{"
"pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
\\\n
"
"pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
\\\n
"
"pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}"
;
}
// Create the work thread used for parallelization when running on multiple devices.
thread
=
new
WorkThread
();
...
...
@@ -433,7 +518,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if
(
cuModuleLoad
(
&
module
,
cacheFile
.
str
().
c_str
())
==
CUDA_SUCCESS
)
return
module
;
//
Write out the source to a
temporary file.
//
Select names for the various
temporary file
s
.
stringstream
tempFileName
;
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
...
...
@@ -445,6 +530,36 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
string
inputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".cu"
);
string
outputFile
=
(
tempDir
+
tempFileName
.
str
()
+
".ptx"
);
string
logFile
=
(
tempDir
+
tempFileName
.
str
()
+
".log"
);
int
res
=
0
;
// If the runtime compiler plugin is available, use it.
if
(
hasCompilerKernel
)
{
string
ptx
=
compilerKernel
.
getAs
<
CudaCompilerKernel
>
().
createModule
(
src
.
str
(),
"-arch=compute_"
+
gpuArchitecture
+
" "
+
options
,
*
this
);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool
wroteCache
=
false
;
try
{
ofstream
out
(
outputFile
.
c_str
());
out
<<
ptx
;
out
.
close
();
if
(
!
out
.
fail
())
wroteCache
=
true
;
}
catch
(...)
{
// Ignore.
}
if
(
!
wroteCache
)
{
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2
(
cuModuleLoadDataEx
(
&
module
,
&
ptx
[
0
],
0
,
NULL
,
NULL
),
"Error loading CUDA module"
);
return
module
;
}
}
else
{
// Write out the source to a temporary file.
ofstream
out
(
inputFile
.
c_str
());
out
<<
src
.
str
();
out
.
close
();
...
...
@@ -457,8 +572,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int
res
=
compileInWindows
(
command
);
#else
string
command
=
compiler
+
" --ptx --machine "
+
bits
+
" -arch=sm_"
+
gpuArchitecture
+
" -o
\"
"
+
outputFile
+
"
\"
"
+
options
+
"
\"
"
+
inputFile
+
"
\"
2>
\"
"
+
logFile
+
"
\"
"
;
int
res
=
std
::
system
(
command
.
c_str
());
res
=
std
::
system
(
command
.
c_str
());
#endif
}
try
{
if
(
res
!=
0
)
{
// Load the error log.
...
...
@@ -507,7 +623,19 @@ CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
return
function
;
}
string
CudaContext
::
doubleToString
(
double
value
)
{
CUstream
CudaContext
::
getCurrentStream
()
{
return
currentStream
;
}
void
CudaContext
::
setCurrentStream
(
CUstream
stream
)
{
currentStream
=
stream
;
}
void
CudaContext
::
restoreDefaultStream
()
{
setCurrentStream
(
0
);
}
string
CudaContext
::
doubleToString
(
double
value
)
const
{
stringstream
s
;
s
.
precision
(
useDoublePrecision
?
16
:
8
);
s
<<
scientific
<<
value
;
...
...
@@ -516,7 +644,7 @@ string CudaContext::doubleToString(double value) {
return
s
.
str
();
}
string
CudaContext
::
intToString
(
int
value
)
{
string
CudaContext
::
intToString
(
int
value
)
const
{
stringstream
s
;
s
<<
value
;
return
s
.
str
();
...
...
@@ -550,6 +678,7 @@ std::string CudaContext::getErrorString(CUresult result) {
case
CUDA_ERROR_ECC_UNCORRECTABLE
:
return
"CUDA_ERROR_ECC_UNCORRECTABLE"
;
case
CUDA_ERROR_UNSUPPORTED_LIMIT
:
return
"CUDA_ERROR_UNSUPPORTED_LIMIT"
;
case
CUDA_ERROR_CONTEXT_ALREADY_IN_USE
:
return
"CUDA_ERROR_CONTEXT_ALREADY_IN_USE"
;
case
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED
:
return
"CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"
;
case
CUDA_ERROR_INVALID_SOURCE
:
return
"CUDA_ERROR_INVALID_SOURCE"
;
case
CUDA_ERROR_FILE_NOT_FOUND
:
return
"CUDA_ERROR_FILE_NOT_FOUND"
;
case
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
:
return
"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"
;
...
...
@@ -566,16 +695,22 @@ std::string CudaContext::getErrorString(CUresult result) {
case
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
:
return
"CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"
;
case
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
:
return
"CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"
;
case
CUDA_ERROR_CONTEXT_IS_DESTROYED
:
return
"CUDA_ERROR_CONTEXT_IS_DESTROYED"
;
case
CUDA_ERROR_ASSERT
:
return
"CUDA_ERROR_ASSERT"
;
case
CUDA_ERROR_TOO_MANY_PEERS
:
return
"CUDA_ERROR_TOO_MANY_PEERS"
;
case
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
:
return
"CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"
;
case
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
:
return
"CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"
;
case
CUDA_ERROR_NOT_PERMITTED
:
return
"CUDA_ERROR_NOT_PERMITTED"
;
case
CUDA_ERROR_NOT_SUPPORTED
:
return
"CUDA_ERROR_NOT_SUPPORTED"
;
case
CUDA_ERROR_UNKNOWN
:
return
"CUDA_ERROR_UNKNOWN"
;
}
return
"
Invalid
error
code
"
;
return
"
CUDA
error"
;
}
void
CudaContext
::
executeKernel
(
CUfunction
kernel
,
void
**
arguments
,
int
threads
,
int
blockSize
,
unsigned
int
sharedSize
)
{
if
(
blockSize
==
-
1
)
blockSize
=
ThreadBlockSize
;
int
gridSize
=
std
::
min
((
threads
+
blockSize
-
1
)
/
blockSize
,
numThreadBlocks
);
CUresult
result
=
cuLaunchKernel
(
kernel
,
gridSize
,
1
,
1
,
blockSize
,
1
,
1
,
sharedSize
,
0
,
arguments
,
NULL
);
CUresult
result
=
cuLaunchKernel
(
kernel
,
gridSize
,
1
,
1
,
blockSize
,
1
,
1
,
sharedSize
,
currentStream
,
arguments
,
NULL
);
if
(
result
!=
CUDA_SUCCESS
)
{
stringstream
str
;
str
<<
"Error invoking kernel: "
<<
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
...
...
@@ -1044,16 +1179,21 @@ void CudaContext::reorderAtomsImpl() {
// Move each molecule position into the same box.
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
int
xcell
=
(
int
)
floor
(
molPos
[
i
].
x
*
invPeriodicBoxSize
.
x
);
int
ycell
=
(
int
)
floor
(
molPos
[
i
].
y
*
invPeriodicBoxSize
.
y
);
int
zcell
=
(
int
)
floor
(
molPos
[
i
].
z
*
invPeriodicBoxSize
.
z
);
Real
dx
=
xcell
*
periodicBoxSize
.
x
;
Real
dy
=
ycell
*
periodicBoxSize
.
y
;
Real
dz
=
zcell
*
periodicBoxSize
.
z
;
if
(
dx
!=
0.0
f
||
dy
!=
0.0
f
||
dz
!=
0.0
f
)
{
molPos
[
i
].
x
-=
dx
;
molPos
[
i
].
y
-=
dy
;
molPos
[
i
].
z
-=
dz
;
Real4
center
=
molPos
[
i
];
int
zcell
=
(
int
)
floor
(
center
.
z
*
invPeriodicBoxSize
.
z
);
center
.
x
-=
zcell
*
periodicBoxVecZ
.
x
;
center
.
y
-=
zcell
*
periodicBoxVecZ
.
y
;
center
.
z
-=
zcell
*
periodicBoxVecZ
.
z
;
int
ycell
=
(
int
)
floor
(
center
.
y
*
invPeriodicBoxSize
.
y
);
center
.
x
-=
ycell
*
periodicBoxVecY
.
x
;
center
.
y
-=
ycell
*
periodicBoxVecY
.
y
;
int
xcell
=
(
int
)
floor
(
center
.
x
*
invPeriodicBoxSize
.
x
);
center
.
x
-=
xcell
*
periodicBoxVecX
.
x
;
if
(
xcell
!=
0
||
ycell
!=
0
||
zcell
!=
0
)
{
Real
dx
=
molPos
[
i
].
x
-
center
.
x
;
Real
dy
=
molPos
[
i
].
y
-
center
.
y
;
Real
dz
=
molPos
[
i
].
z
-
center
.
z
;
molPos
[
i
]
=
center
;
for
(
int
j
=
0
;
j
<
(
int
)
atoms
.
size
();
j
++
)
{
int
atom
=
atoms
[
j
]
+
mol
.
offsets
[
i
];
Real4
p
=
oldPosq
[
atom
];
...
...
platforms/cuda/src/CudaExpressionUtilities.cpp
View file @
61d5cc0f
...
...
@@ -239,7 +239,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
if
(
derivOrder
[
0
]
==
0
)
{
out
<<
"real x = "
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
";
\n
"
;
out
<<
"if (x >= 0 && x < "
<<
paramsInt
[
0
]
<<
") {
\n
"
;
out
<<
"int index = (int)
round(x
);
\n
"
;
out
<<
"int index = (int)
floor(x+0.5f
);
\n
"
;
out
<<
nodeNames
[
j
]
<<
" = "
<<
functionNames
[
i
].
second
<<
"[index];
\n
"
;
out
<<
"}
\n
"
;
}
...
...
@@ -249,8 +249,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
for
(
int
j
=
0
;
j
<
nodes
.
size
();
j
++
)
{
const
vector
<
int
>&
derivOrder
=
dynamic_cast
<
const
Operation
::
Custom
*>
(
&
nodes
[
j
]
->
getOperation
())
->
getDerivOrder
();
if
(
derivOrder
[
0
]
==
0
&&
derivOrder
[
1
]
==
0
)
{
out
<<
"int x = (int)
round
("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
");
\n
"
;
out
<<
"int y = (int)
round
("
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
");
\n
"
;
out
<<
"int x = (int)
floor
("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
"
+0.5f
);
\n
"
;
out
<<
"int y = (int)
floor
("
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
"
+0.5f
);
\n
"
;
out
<<
"int xsize = (int) "
<<
paramsInt
[
0
]
<<
";
\n
"
;
out
<<
"int ysize = (int) "
<<
paramsInt
[
1
]
<<
";
\n
"
;
out
<<
"int index = x+y*xsize;
\n
"
;
...
...
@@ -263,9 +263,9 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
for
(
int
j
=
0
;
j
<
nodes
.
size
();
j
++
)
{
const
vector
<
int
>&
derivOrder
=
dynamic_cast
<
const
Operation
::
Custom
*>
(
&
nodes
[
j
]
->
getOperation
())
->
getDerivOrder
();
if
(
derivOrder
[
0
]
==
0
&&
derivOrder
[
1
]
==
0
&&
derivOrder
[
2
]
==
0
)
{
out
<<
"int x = (int)
round
("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
");
\n
"
;
out
<<
"int y = (int)
round
("
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
");
\n
"
;
out
<<
"int z = (int)
round
("
<<
getTempName
(
node
.
getChildren
()[
2
],
temps
)
<<
");
\n
"
;
out
<<
"int x = (int)
floor
("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
"
+0.5f
);
\n
"
;
out
<<
"int y = (int)
floor
("
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
"
+0.5f
);
\n
"
;
out
<<
"int z = (int)
floor
("
<<
getTempName
(
node
.
getChildren
()[
2
],
temps
)
<<
"
+0.5f
);
\n
"
;
out
<<
"int xsize = (int) "
<<
paramsInt
[
0
]
<<
";
\n
"
;
out
<<
"int ysize = (int) "
<<
paramsInt
[
1
]
<<
";
\n
"
;
out
<<
"int zsize = (int) "
<<
paramsInt
[
2
]
<<
";
\n
"
;
...
...
@@ -457,6 +457,12 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
case
Operation
::
ABS
:
out
<<
"fabs("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
FLOOR
:
out
<<
"floor("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
CEIL
:
out
<<
"ceil("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
default:
throw
OpenMMException
(
"Internal error: Unknown operation in user-defined expression: "
+
node
.
getOperation
().
getName
());
}
...
...
platforms/cuda/src/CudaIntegrationUtilities.cpp
View file @
61d5cc0f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
4
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -27,10 +27,12 @@
#include "CudaIntegrationUtilities.h"
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "openmm/internal/OSRngSeed.h"
#include "openmm/HarmonicAngleForce.h"
#include "openmm/VirtualSite.h"
#include "quern.h"
#include "CudaExpressionUtilities.h"
#include "ReferenceCCMAAlgorithm.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
...
...
@@ -303,157 +305,54 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
int
numCCMA
=
(
int
)
ccmaConstraints
.
size
();
if
(
numCCMA
>
0
)
{
vector
<
vector
<
int
>
>
atomConstraints
(
context
.
getNumAtoms
());
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
atomConstraints
[
atom1
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
atomConstraints
[
atom2
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
}
vector
<
vector
<
int
>
>
linkedConstraints
(
numCCMA
);
for
(
unsigned
atom
=
0
;
atom
<
atomConstraints
.
size
();
atom
++
)
{
for
(
unsigned
i
=
0
;
i
<
atomConstraints
[
atom
].
size
();
i
++
)
for
(
unsigned
j
=
0
;
j
<
i
;
j
++
)
{
int
c1
=
atomConstraints
[
atom
][
i
];
int
c2
=
atomConstraints
[
atom
][
j
];
linkedConstraints
[
c1
].
push_back
(
c2
);
linkedConstraints
[
c2
].
push_back
(
c1
);
}
}
int
maxLinks
=
0
;
for
(
unsigned
i
=
0
;
i
<
linkedConstraints
.
size
();
i
++
)
maxLinks
=
max
(
maxLinks
,
(
int
)
linkedConstraints
[
i
].
size
());
int
maxAtomConstraints
=
0
;
for
(
unsigned
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
maxAtomConstraints
=
max
(
maxAtomConstraints
,
(
int
)
atomConstraints
[
i
].
size
());
// Record information needed by ReferenceCCMAAlgorithm.
// Compute the constraint coupling matrix
vector
<
vector
<
int
>
>
atomAngles
(
numAtoms
);
HarmonicAngleForce
const
*
angleForce
=
NULL
;
for
(
int
i
=
0
;
i
<
system
.
getNumForces
()
&&
angleForce
==
NULL
;
i
++
)
angleForce
=
dynamic_cast
<
HarmonicAngleForce
const
*>
(
&
system
.
getForce
(
i
));
if
(
angleForce
!=
NULL
)
for
(
int
i
=
0
;
i
<
angleForce
->
getNumAngles
();
i
++
)
{
int
particle1
,
particle2
,
particle3
;
vector
<
pair
<
int
,
int
>
>
refIndices
(
numCCMA
);
vector
<
RealOpenMM
>
refDistance
(
numCCMA
);
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
int
index
=
ccmaConstraints
[
i
];
refIndices
[
i
]
=
make_pair
(
atom1
[
index
],
atom2
[
index
]);
refDistance
[
i
]
=
distance
[
index
];
}
vector
<
RealOpenMM
>
refMasses
(
numAtoms
);
for
(
int
i
=
0
;
i
<
numAtoms
;
++
i
)
refMasses
[
i
]
=
(
RealOpenMM
)
system
.
getParticleMass
(
i
);
// Look up angles for CCMA.
vector
<
ReferenceCCMAAlgorithm
::
AngleInfo
>
angles
;
for
(
int
i
=
0
;
i
<
system
.
getNumForces
();
i
++
)
{
const
HarmonicAngleForce
*
force
=
dynamic_cast
<
const
HarmonicAngleForce
*>
(
&
system
.
getForce
(
i
));
if
(
force
!=
NULL
)
{
for
(
int
j
=
0
;
j
<
force
->
getNumAngles
();
j
++
)
{
int
atom1
,
atom2
,
atom3
;
double
angle
,
k
;
angleForce
->
getAngleParameters
(
i
,
particle1
,
particle2
,
particle3
,
angle
,
k
);
atomAngles
[
particle2
].
push_back
(
i
);
}
vector
<
vector
<
pair
<
int
,
double
>
>
>
matrix
(
numCCMA
);
for
(
int
j
=
0
;
j
<
numCCMA
;
j
++
)
{
for
(
int
k
=
0
;
k
<
numCCMA
;
k
++
)
{
if
(
j
==
k
)
{
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
j
,
1.0
));
continue
;
}
double
scale
;
int
cj
=
ccmaConstraints
[
j
];
int
ck
=
ccmaConstraints
[
k
];
int
atomj0
=
atom1
[
cj
];
int
atomj1
=
atom2
[
cj
];
int
atomk0
=
atom1
[
ck
];
int
atomk1
=
atom2
[
ck
];
int
atoma
,
atomb
,
atomc
;
double
imj0
=
1.0
/
system
.
getParticleMass
(
atomj0
);
double
imj1
=
1.0
/
system
.
getParticleMass
(
atomj1
);
if
(
atomj0
==
atomk0
)
{
atoma
=
atomj1
;
atomb
=
atomj0
;
atomc
=
atomk1
;
scale
=
imj0
/
(
imj0
+
imj1
);
}
else
if
(
atomj1
==
atomk1
)
{
atoma
=
atomj0
;
atomb
=
atomj1
;
atomc
=
atomk0
;
scale
=
imj1
/
(
imj0
+
imj1
);
}
else
if
(
atomj0
==
atomk1
)
{
atoma
=
atomj1
;
atomb
=
atomj0
;
atomc
=
atomk0
;
scale
=
imj0
/
(
imj0
+
imj1
);
}
else
if
(
atomj1
==
atomk0
)
{
atoma
=
atomj0
;
atomb
=
atomj1
;
atomc
=
atomk1
;
scale
=
imj1
/
(
imj0
+
imj1
);
}
else
continue
;
// These constraints are not connected.
// Look for a third constraint forming a triangle with these two.
bool
foundConstraint
=
false
;
for
(
int
m
=
0
;
m
<
numCCMA
;
m
++
)
{
int
other
=
ccmaConstraints
[
m
];
if
((
atom1
[
other
]
==
atoma
&&
atom2
[
other
]
==
atomc
)
||
(
atom1
[
other
]
==
atomc
&&
atom2
[
other
]
==
atoma
))
{
double
d1
=
distance
[
cj
];
double
d2
=
distance
[
ck
];
double
d3
=
distance
[
other
];
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
k
,
scale
*
(
d1
*
d1
+
d2
*
d2
-
d3
*
d3
)
/
(
2.0
*
d1
*
d2
)));
foundConstraint
=
true
;
break
;
}
}
if
(
!
foundConstraint
&&
angleForce
!=
NULL
)
{
// We didn't find one, so look for an angle force field term.
const
vector
<
int
>&
angleCandidates
=
atomAngles
[
atomb
];
for
(
vector
<
int
>::
const_iterator
iter
=
angleCandidates
.
begin
();
iter
!=
angleCandidates
.
end
();
iter
++
)
{
int
particle1
,
particle2
,
particle3
;
double
angle
,
ka
;
angleForce
->
getAngleParameters
(
*
iter
,
particle1
,
particle2
,
particle3
,
angle
,
ka
);
if
((
particle1
==
atoma
&&
particle3
==
atomc
)
||
(
particle3
==
atoma
&&
particle1
==
atomc
))
{
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
k
,
scale
*
cos
(
angle
)));
break
;
}
force
->
getAngleParameters
(
j
,
atom1
,
atom2
,
atom3
,
angle
,
k
);
angles
.
push_back
(
ReferenceCCMAAlgorithm
::
AngleInfo
(
atom1
,
atom2
,
atom3
,
(
RealOpenMM
)
angle
));
}
}
}
}
// Invert it using QR.
vector
<
int
>
matrixRowStart
;
vector
<
int
>
matrixColIndex
;
vector
<
double
>
matrixValue
;
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
matrixRowStart
.
push_back
(
matrixValue
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
matrix
[
i
].
size
();
j
++
)
{
pair
<
int
,
double
>
element
=
matrix
[
i
][
j
];
matrixColIndex
.
push_back
(
element
.
first
);
matrixValue
.
push_back
(
element
.
second
);
}
}
matrixRowStart
.
push_back
(
matrixValue
.
size
());
int
*
qRowStart
,
*
qColIndex
,
*
rRowStart
,
*
rColIndex
;
double
*
qValue
,
*
rValue
;
int
result
=
QUERN_compute_qr
(
numCCMA
,
numCCMA
,
&
matrixRowStart
[
0
],
&
matrixColIndex
[
0
],
&
matrixValue
[
0
],
NULL
,
&
qRowStart
,
&
qColIndex
,
&
qValue
,
&
rRowStart
,
&
rColIndex
,
&
rValue
);
vector
<
double
>
rhs
(
numCCMA
);
matrix
.
clear
();
matrix
.
resize
(
numCCMA
);
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
// Extract column i of the inverse matrix.
// Create a ReferenceCCMAAlgorithm. It will build and invert the constraint matrix for us.
for
(
int
j
=
0
;
j
<
numCCMA
;
j
++
)
rhs
[
j
]
=
(
i
==
j
?
1.0
:
0.0
);
result
=
QUERN_multiply_with_q_transpose
(
numCCMA
,
qRowStart
,
qColIndex
,
qValue
,
&
rhs
[
0
]);
result
=
QUERN_solve_with_r
(
numCCMA
,
rRowStart
,
rColIndex
,
rValue
,
&
rhs
[
0
],
&
rhs
[
0
]);
for
(
int
j
=
0
;
j
<
numCCMA
;
j
++
)
{
double
value
=
rhs
[
j
]
*
distance
[
ccmaConstraints
[
i
]]
/
distance
[
ccmaConstraints
[
j
]];
if
(
abs
(
value
)
>
0.1
)
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
i
,
value
));
}
}
QUERN_free_result
(
qRowStart
,
qColIndex
,
qValue
);
QUERN_free_result
(
rRowStart
,
rColIndex
,
rValue
);
ReferenceCCMAAlgorithm
ccma
(
numAtoms
,
numCCMA
,
refIndices
,
refDistance
,
refMasses
,
angles
,
0.1
);
vector
<
vector
<
pair
<
int
,
double
>
>
>
matrix
=
ccma
.
getMatrix
();
int
maxRowElements
=
0
;
for
(
unsigned
i
=
0
;
i
<
matrix
.
size
();
i
++
)
maxRowElements
=
max
(
maxRowElements
,
(
int
)
matrix
[
i
].
size
());
maxRowElements
++
;
// Build the list of constraints for each atom.
vector
<
vector
<
int
>
>
atomConstraints
(
context
.
getNumAtoms
());
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
atomConstraints
[
atom1
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
atomConstraints
[
atom2
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
}
int
maxAtomConstraints
=
0
;
for
(
unsigned
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
maxAtomConstraints
=
max
(
maxAtomConstraints
,
(
int
)
atomConstraints
[
i
].
size
());
// Sort the constraints.
vector
<
int
>
constraintOrder
(
numCCMA
);
...
...
@@ -858,6 +757,7 @@ void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumb
vector
<
int4
>
seed
(
randomSeed
->
getSize
());
unsigned
int
r
=
randomNumberSeed
;
if
(
r
==
0
)
r
=
(
unsigned
int
)
osrngseed
();
for
(
int
i
=
0
;
i
<
randomSeed
->
getSize
();
i
++
)
{
seed
[
i
].
x
=
r
=
(
1664525
*
r
+
1013904223
)
&
0xFFFFFFFF
;
seed
[
i
].
y
=
r
=
(
1664525
*
r
+
1013904223
)
&
0xFFFFFFFF
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
61d5cc0f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-201
4
Stanford University and the Authors. *
* Portions copyright (c) 2008-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -36,6 +36,7 @@
#include "openmm/internal/CustomManyParticleForceImpl.h"
#include "openmm/internal/CustomNonbondedForceImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "openmm/internal/OSRngSeed.h"
#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaIntegrationUtilities.h"
...
...
@@ -103,7 +104,7 @@ void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool
nb
.
prepareInteractions
();
}
double
CudaCalcForcesAndEnergyKernel
::
finishComputation
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
double
CudaCalcForcesAndEnergyKernel
::
finishComputation
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
int
groups
,
bool
&
valid
)
{
cu
.
getBondedUtilities
().
computeInteractions
(
groups
);
if
((
groups
&
(
1
<<
cu
.
getNonbondedUtilities
().
getForceGroup
()))
!=
0
)
cu
.
getNonbondedUtilities
().
computeInteractions
();
...
...
@@ -147,14 +148,15 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
const
vector
<
int
>&
order
=
cu
.
getAtomIndex
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
positions
.
resize
(
numParticles
);
double4
periodicBoxSize
=
cu
.
getPeriodicBoxSize
();
Vec3
boxVectors
[
3
];
cu
.
getPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
if
(
cu
.
getUseDoublePrecision
())
{
double4
*
posq
=
(
double4
*
)
cu
.
getPinnedBuffer
();
cu
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
double4
pos
=
posq
[
i
];
int4
offset
=
cu
.
getPosCellOffsets
()[
i
];
positions
[
order
[
i
]]
=
Vec3
(
pos
.
x
-
offset
.
x
*
periodicBoxSize
.
x
,
pos
.
y
-
offset
.
y
*
periodicBoxSize
.
y
,
pos
.
z
-
offset
.
z
*
periodicBoxSize
.
z
)
;
positions
[
order
[
i
]]
=
Vec3
(
pos
.
x
,
pos
.
y
,
pos
.
z
)
-
boxVectors
[
0
]
*
offset
.
x
-
boxVectors
[
1
]
*
offset
.
y
-
boxVectors
[
2
]
*
offset
.
z
;
}
}
else
if
(
cu
.
getUseMixedPrecision
())
{
...
...
@@ -166,7 +168,7 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
float4
pos1
=
posq
[
i
];
float4
pos2
=
posCorrection
[
i
];
int4
offset
=
cu
.
getPosCellOffsets
()[
i
];
positions
[
order
[
i
]]
=
Vec3
((
double
)
pos1
.
x
+
(
double
)
pos2
.
x
-
offset
.
x
*
periodicBoxSize
.
x
,
(
double
)
pos1
.
y
+
(
double
)
pos2
.
y
-
offset
.
y
*
periodicBoxSize
.
y
,
(
double
)
pos1
.
z
+
(
double
)
pos2
.
z
-
offset
.
z
*
periodicBoxSize
.
z
)
;
positions
[
order
[
i
]]
=
Vec3
((
double
)
pos1
.
x
+
(
double
)
pos2
.
x
,
(
double
)
pos1
.
y
+
(
double
)
pos2
.
y
,
(
double
)
pos1
.
z
+
(
double
)
pos2
.
z
)
-
boxVectors
[
0
]
*
offset
.
x
-
boxVectors
[
1
]
*
offset
.
y
-
boxVectors
[
2
]
*
offset
.
z
;
}
}
else
{
...
...
@@ -175,7 +177,7 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
float4
pos
=
posq
[
i
];
int4
offset
=
cu
.
getPosCellOffsets
()[
i
];
positions
[
order
[
i
]]
=
Vec3
(
pos
.
x
-
offset
.
x
*
periodicBoxSize
.
x
,
pos
.
y
-
offset
.
y
*
periodicBoxSize
.
y
,
pos
.
z
-
offset
.
z
*
periodicBoxSize
.
z
)
;
positions
[
order
[
i
]]
=
Vec3
(
pos
.
x
,
pos
.
y
,
pos
.
z
)
-
boxVectors
[
0
]
*
offset
.
x
-
boxVectors
[
1
]
*
offset
.
y
-
boxVectors
[
2
]
*
offset
.
z
;
}
}
}
...
...
@@ -304,21 +306,18 @@ void CudaUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& fo
}
void
CudaUpdateStateDataKernel
::
getPeriodicBoxVectors
(
ContextImpl
&
context
,
Vec3
&
a
,
Vec3
&
b
,
Vec3
&
c
)
const
{
double4
box
=
cu
.
getPeriodicBoxSize
();
a
=
Vec3
(
box
.
x
,
0
,
0
);
b
=
Vec3
(
0
,
box
.
y
,
0
);
c
=
Vec3
(
0
,
0
,
box
.
z
);
cu
.
getPeriodicBoxVectors
(
a
,
b
,
c
);
}
void
CudaUpdateStateDataKernel
::
setPeriodicBoxVectors
(
ContextImpl
&
context
,
const
Vec3
&
a
,
const
Vec3
&
b
,
const
Vec3
&
c
)
const
{
vector
<
CudaContext
*>&
contexts
=
cu
.
getPlatformData
().
contexts
;
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
contexts
[
i
]
->
setPeriodicBox
Size
(
a
[
0
],
b
[
1
],
c
[
2
]
);
contexts
[
i
]
->
setPeriodicBox
Vectors
(
a
,
b
,
c
);
}
void
CudaUpdateStateDataKernel
::
createCheckpoint
(
ContextImpl
&
context
,
ostream
&
stream
)
{
cu
.
setAsCurrent
();
int
version
=
1
;
int
version
=
2
;
stream
.
write
((
char
*
)
&
version
,
sizeof
(
int
));
int
precision
=
(
cu
.
getUseDoublePrecision
()
?
2
:
cu
.
getUseMixedPrecision
()
?
1
:
0
);
stream
.
write
((
char
*
)
&
precision
,
sizeof
(
int
));
...
...
@@ -339,8 +338,9 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream&
stream
.
write
(
buffer
,
cu
.
getVelm
().
getSize
()
*
cu
.
getVelm
().
getElementSize
());
stream
.
write
((
char
*
)
&
cu
.
getAtomIndex
()[
0
],
sizeof
(
int
)
*
cu
.
getAtomIndex
().
size
());
stream
.
write
((
char
*
)
&
cu
.
getPosCellOffsets
()[
0
],
sizeof
(
int4
)
*
cu
.
getPosCellOffsets
().
size
());
double4
box
=
cu
.
getPeriodicBoxSize
();
stream
.
write
((
char
*
)
&
box
,
sizeof
(
double4
));
Vec3
boxVectors
[
3
];
cu
.
getPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
stream
.
write
((
char
*
)
boxVectors
,
3
*
sizeof
(
Vec3
));
cu
.
getIntegrationUtilities
().
createCheckpoint
(
stream
);
SimTKOpenMMUtilities
::
createCheckpoint
(
stream
);
}
...
...
@@ -349,7 +349,7 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
cu
.
setAsCurrent
();
int
version
;
stream
.
read
((
char
*
)
&
version
,
sizeof
(
int
));
if
(
version
!=
1
)
if
(
version
!=
2
)
throw
OpenMMException
(
"Checkpoint was created with a different version of OpenMM"
);
int
precision
;
stream
.
read
((
char
*
)
&
precision
,
sizeof
(
int
));
...
...
@@ -379,10 +379,10 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
stream
.
read
((
char
*
)
&
cu
.
getAtomIndex
()[
0
],
sizeof
(
int
)
*
cu
.
getAtomIndex
().
size
());
cu
.
getAtomIndexArray
().
upload
(
cu
.
getAtomIndex
());
stream
.
read
((
char
*
)
&
cu
.
getPosCellOffsets
()[
0
],
sizeof
(
int4
)
*
cu
.
getPosCellOffsets
().
size
());
double4
box
;
stream
.
read
((
char
*
)
&
box
,
sizeof
(
double4
));
Vec3
boxVectors
[
3
]
;
stream
.
read
((
char
*
)
&
box
Vectors
,
3
*
sizeof
(
Vec3
));
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
contexts
[
i
]
->
setPeriodicBox
Size
(
box
.
x
,
box
.
y
,
box
.
z
);
contexts
[
i
]
->
setPeriodicBox
Vectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]
);
cu
.
getIntegrationUtilities
().
loadCheckpoint
(
stream
);
SimTKOpenMMUtilities
::
loadCheckpoint
(
stream
);
for
(
int
i
=
0
;
i
<
cu
.
getReorderListeners
().
size
();
i
++
)
...
...
@@ -1127,7 +1127,7 @@ void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAP
return
;
int
numMaps
=
force
.
getNumMaps
();
vector
<
float4
>
coeffVec
;
vector
<
int2
>
mapPositionsVec
(
numMaps
);
mapPositionsVec
.
resize
(
numMaps
);
vector
<
double
>
energy
;
vector
<
vector
<
double
>
>
c
;
int
currentPosition
=
0
;
...
...
@@ -1166,6 +1166,49 @@ double CudaCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includ
return
0.0
;
}
void
CudaCalcCMAPTorsionForceKernel
::
copyParametersToContext
(
ContextImpl
&
context
,
const
CMAPTorsionForce
&
force
)
{
int
numMaps
=
force
.
getNumMaps
();
int
numContexts
=
cu
.
getPlatformData
().
contexts
.
size
();
int
startIndex
=
cu
.
getContextIndex
()
*
force
.
getNumTorsions
()
/
numContexts
;
int
endIndex
=
(
cu
.
getContextIndex
()
+
1
)
*
force
.
getNumTorsions
()
/
numContexts
;
numTorsions
=
endIndex
-
startIndex
;
if
(
mapPositions
->
getSize
()
!=
numMaps
)
throw
OpenMMException
(
"updateParametersInContext: The number of maps has changed"
);
if
(
torsionMaps
->
getSize
()
!=
numTorsions
)
throw
OpenMMException
(
"updateParametersInContext: The number of CMAP torsions has changed"
);
// Update the maps.
vector
<
float4
>
coeffVec
;
vector
<
double
>
energy
;
vector
<
vector
<
double
>
>
c
;
int
currentPosition
=
0
;
for
(
int
i
=
0
;
i
<
numMaps
;
i
++
)
{
int
size
;
force
.
getMapParameters
(
i
,
size
,
energy
);
if
(
size
!=
mapPositionsVec
[
i
].
y
)
throw
OpenMMException
(
"updateParametersInContext: The size of a map has changed"
);
CMAPTorsionForceImpl
::
calcMapDerivatives
(
size
,
energy
,
c
);
currentPosition
+=
4
*
size
*
size
;
for
(
int
j
=
0
;
j
<
size
*
size
;
j
++
)
{
coeffVec
.
push_back
(
make_float4
((
float
)
c
[
j
][
0
],
(
float
)
c
[
j
][
1
],
(
float
)
c
[
j
][
2
],
(
float
)
c
[
j
][
3
]));
coeffVec
.
push_back
(
make_float4
((
float
)
c
[
j
][
4
],
(
float
)
c
[
j
][
5
],
(
float
)
c
[
j
][
6
],
(
float
)
c
[
j
][
7
]));
coeffVec
.
push_back
(
make_float4
((
float
)
c
[
j
][
8
],
(
float
)
c
[
j
][
9
],
(
float
)
c
[
j
][
10
],
(
float
)
c
[
j
][
11
]));
coeffVec
.
push_back
(
make_float4
((
float
)
c
[
j
][
12
],
(
float
)
c
[
j
][
13
],
(
float
)
c
[
j
][
14
],
(
float
)
c
[
j
][
15
]));
}
}
coefficients
->
upload
(
coeffVec
);
// Update the indices.
vector
<
int
>
torsionMapsVec
(
numTorsions
);
for
(
int
i
=
0
;
i
<
numTorsions
;
i
++
)
{
int
index
[
8
];
force
.
getTorsionParameters
(
i
,
torsionMapsVec
[
i
],
index
[
0
],
index
[
1
],
index
[
2
],
index
[
3
],
index
[
4
],
index
[
5
],
index
[
6
],
index
[
7
]);
}
torsionMaps
->
upload
(
torsionMapsVec
);
}
class
CudaCustomTorsionForceInfo
:
public
CudaForceInfo
{
public:
CudaCustomTorsionForceInfo
(
const
CustomTorsionForce
&
force
)
:
force
(
force
)
{
...
...
@@ -1377,8 +1420,8 @@ public:
PmePreComputation
(
CudaContext
&
cu
,
Kernel
&
pme
,
CalcPmeReciprocalForceKernel
::
IO
&
io
)
:
cu
(
cu
),
pme
(
pme
),
io
(
io
)
{
}
void
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
Vec3
box
Size
(
cu
.
getPeriodicBoxSize
().
x
,
cu
.
getPeriodicBoxSize
().
y
,
cu
.
getPeriodicBoxSize
().
z
);
pme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
beginComputation
(
io
,
box
Size
,
includeEnergy
);
Vec3
box
Vectors
[
3
]
=
{
Vec3
(
cu
.
getPeriodicBoxSize
().
x
,
0
,
0
),
Vec3
(
0
,
cu
.
getPeriodicBoxSize
().
y
,
0
),
Vec3
(
0
,
0
,
cu
.
getPeriodicBoxSize
().
z
)
}
;
pme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
beginComputation
(
io
,
box
Vectors
,
includeEnergy
);
}
private:
CudaContext
&
cu
;
...
...
@@ -1398,6 +1441,38 @@ private:
CalcPmeReciprocalForceKernel
::
IO
&
io
;
};
class
CudaCalcNonbondedForceKernel
::
SyncStreamPreComputation
:
public
CudaContext
::
ForcePreComputation
{
public:
SyncStreamPreComputation
(
CudaContext
&
cu
,
CUstream
stream
,
CUevent
event
,
int
forceGroup
)
:
cu
(
cu
),
stream
(
stream
),
event
(
event
),
forceGroup
(
forceGroup
)
{
}
void
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
if
((
groups
&
(
1
<<
forceGroup
))
!=
0
)
{
cuEventRecord
(
event
,
cu
.
getCurrentStream
());
cuStreamWaitEvent
(
stream
,
event
,
0
);
}
}
private:
CudaContext
&
cu
;
CUstream
stream
;
CUevent
event
;
int
forceGroup
;
};
class
CudaCalcNonbondedForceKernel
::
SyncStreamPostComputation
:
public
CudaContext
::
ForcePostComputation
{
public:
SyncStreamPostComputation
(
CudaContext
&
cu
,
CUevent
event
,
int
forceGroup
)
:
cu
(
cu
),
event
(
event
),
forceGroup
(
forceGroup
)
{
}
double
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
if
((
groups
&
(
1
<<
forceGroup
))
!=
0
)
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
return
0.0
;
}
private:
CudaContext
&
cu
;
CUevent
event
;
int
forceGroup
;
};
CudaCalcNonbondedForceKernel
::~
CudaCalcNonbondedForceKernel
()
{
cu
.
setAsCurrent
();
if
(
sigmaEpsilon
!=
NULL
)
...
...
@@ -1427,6 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if
(
hasInitializedFFT
)
{
cufftDestroy
(
fftForward
);
cufftDestroy
(
fftBackward
);
if
(
usePmeStream
)
{
cuStreamDestroy
(
pmeStream
);
cuEventDestroy
(
pmeSyncEvent
);
}
}
}
...
...
@@ -1636,6 +1715,22 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cufftSetCompatibilityMode
(
fftForward
,
CUFFT_COMPATIBILITY_NATIVE
);
cufftSetCompatibilityMode
(
fftBackward
,
CUFFT_COMPATIBILITY_NATIVE
);
// Prepare for doing PME on its own stream.
int
cufftVersion
;
cufftGetVersion
(
&
cufftVersion
);
usePmeStream
=
(
cu
.
getComputeCapability
()
<
5.0
&&
numParticles
<
130000
&&
cufftVersion
>=
6000
);
// Workarounds for various CUDA bugs
if
(
usePmeStream
)
{
cuStreamCreate
(
&
pmeStream
,
CU_STREAM_NON_BLOCKING
);
cufftSetStream
(
fftForward
,
pmeStream
);
cufftSetStream
(
fftBackward
,
pmeStream
);
CHECK_RESULT
(
cuEventCreate
(
&
pmeSyncEvent
,
CU_EVENT_DISABLE_TIMING
),
"Error creating event for NonbondedForce"
);
int
recipForceGroup
=
force
.
getReciprocalSpaceForceGroup
();
if
(
recipForceGroup
<
0
)
recipForceGroup
=
force
.
getForceGroup
();
cu
.
addPreComputation
(
new
SyncStreamPreComputation
(
cu
,
pmeStream
,
pmeSyncEvent
,
recipForceGroup
));
cu
.
addPostComputation
(
new
SyncStreamPostComputation
(
cu
,
pmeSyncEvent
,
recipForceGroup
));
}
hasInitializedFFT
=
true
;
// Initialize the b-spline moduli.
...
...
@@ -1752,13 +1847,45 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
}
if
(
directPmeGrid
!=
NULL
&&
includeReciprocal
)
{
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
if
(
usePmeStream
)
cu
.
setCurrentStream
(
pmeStream
);
// Invert the periodic box vectors.
Vec3
boxVectors
[
3
];
cu
.
getPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
double
determinant
=
boxVectors
[
0
][
0
]
*
boxVectors
[
1
][
1
]
*
boxVectors
[
2
][
2
];
double
scale
=
1.0
/
determinant
;
double3
recipBoxVectors
[
3
];
recipBoxVectors
[
0
]
=
make_double3
(
boxVectors
[
1
][
1
]
*
boxVectors
[
2
][
2
]
*
scale
,
0
,
0
);
recipBoxVectors
[
1
]
=
make_double3
(
-
boxVectors
[
1
][
0
]
*
boxVectors
[
2
][
2
]
*
scale
,
boxVectors
[
0
][
0
]
*
boxVectors
[
2
][
2
]
*
scale
,
0
);
recipBoxVectors
[
2
]
=
make_double3
((
boxVectors
[
1
][
0
]
*
boxVectors
[
2
][
1
]
-
boxVectors
[
1
][
1
]
*
boxVectors
[
2
][
0
])
*
scale
,
-
boxVectors
[
0
][
0
]
*
boxVectors
[
2
][
1
]
*
scale
,
boxVectors
[
0
][
0
]
*
boxVectors
[
1
][
1
]
*
scale
);
float3
recipBoxVectorsFloat
[
3
];
void
*
recipBoxVectorPointer
[
3
];
if
(
cu
.
getUseDoublePrecision
())
{
recipBoxVectorPointer
[
0
]
=
&
recipBoxVectors
[
0
];
recipBoxVectorPointer
[
1
]
=
&
recipBoxVectors
[
1
];
recipBoxVectorPointer
[
2
]
=
&
recipBoxVectors
[
2
];
}
else
{
recipBoxVectorsFloat
[
0
]
=
make_float3
((
float
)
recipBoxVectors
[
0
].
x
,
0
,
0
);
recipBoxVectorsFloat
[
1
]
=
make_float3
((
float
)
recipBoxVectors
[
1
].
x
,
(
float
)
recipBoxVectors
[
1
].
y
,
0
);
recipBoxVectorsFloat
[
2
]
=
make_float3
((
float
)
recipBoxVectors
[
2
].
x
,
(
float
)
recipBoxVectors
[
2
].
y
,
(
float
)
recipBoxVectors
[
2
].
z
);
recipBoxVectorPointer
[
0
]
=
&
recipBoxVectorsFloat
[
0
];
recipBoxVectorPointer
[
1
]
=
&
recipBoxVectorsFloat
[
1
];
recipBoxVectorPointer
[
2
]
=
&
recipBoxVectorsFloat
[
2
];
}
// Execute the reciprocal space kernels.
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
());
sort
->
sort
(
*
pmeAtomGridIndex
);
void
*
spreadArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSize
Pointer
()
,
&
pmeAtomGridIndex
->
getDevicePointer
()};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVector
Pointer
[
2
]
,
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
128
);
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getComputeCapability
()
<
2.0
)
{
...
...
@@ -1772,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cufftExecR2C
(
fftForward
,
(
float
*
)
directPmeGrid
->
getDevicePointer
(),
(
float2
*
)
reciprocalPmeGrid
->
getDevicePointer
());
if
(
includeEnergy
)
{
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeEvalEnergyKernel
,
computeEnergyArgs
,
cu
.
getNumAtoms
());
}
void
*
convolutionArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
void
*
convolutionArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeConvolutionKernel
,
convolutionArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
...
...
@@ -1786,9 +1917,12 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSize
Pointer
()
,
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVector
Pointer
[
2
]
,
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
(),
128
);
if
(
usePmeStream
)
{
cuEventRecord
(
pmeSyncEvent
,
pmeStream
);
cu
.
restoreDefaultStream
();
}
}
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
...
...
@@ -2327,6 +2461,9 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
interactionGroupArgs
.
push_back
(
&
interactionGroupData
->
getDevicePointer
());
interactionGroupArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
interactionGroupArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
interactionGroupArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
interactionGroupArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
interactionGroupArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
for
(
int
i
=
0
;
i
<
(
int
)
params
->
getBuffers
().
size
();
i
++
)
interactionGroupArgs
.
push_back
(
&
params
->
getBuffers
()[
i
].
getMemory
());
if
(
globals
!=
NULL
)
...
...
@@ -2435,6 +2572,7 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
posq
.
upload
(
&
temp
[
0
]);
params
->
upload
(
paramsVector
);
prefactor
=
-
ONE_4PI_EPS0
*
((
1.0
/
force
.
getSoluteDielectric
())
-
(
1.0
/
force
.
getSolventDielectric
()));
surfaceAreaFactor
=
-
6.0
*
4
*
M_PI
*
force
.
getSurfaceAreaEnergy
();
bool
useCutoff
=
(
force
.
getNonbondedMethod
()
!=
GBSAOBCForce
::
NoCutoff
);
bool
usePeriodic
=
(
force
.
getNonbondedMethod
()
!=
GBSAOBCForce
::
NoCutoff
&&
force
.
getNonbondedMethod
()
!=
GBSAOBCForce
::
CutoffNonPeriodic
);
string
source
=
CudaKernelSources
::
gbsaObc2
;
...
...
@@ -2461,6 +2599,7 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"CUTOFF"
]
=
cu
.
doubleToString
(
nb
.
getCutoffDistance
());
defines
[
"PREFACTOR"
]
=
cu
.
doubleToString
(
prefactor
);
defines
[
"SURFACE_AREA_FACTOR"
]
=
cu
.
doubleToString
(
surfaceAreaFactor
);
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
...
...
@@ -2484,6 +2623,9 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
computeSumArgs
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getBlockBoundingBoxes
().
getDevicePointer
());
...
...
@@ -2503,6 +2645,9 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
force1Args
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
force1Args
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
force1Args
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
force1Args
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
force1Args
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
force1Args
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getBlockBoundingBoxes
().
getDevicePointer
());
...
...
@@ -2519,8 +2664,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeSumArgs
[
3
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
force1Args
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
computeSumArgs
[
1
0
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
force1Args
[
1
2
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
computeSumArgs
[
1
3
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
force1Args
[
1
5
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
}
}
cu
.
executeKernel
(
computeBornSumKernel
,
&
computeSumArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
@@ -2979,7 +3124,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
pairEnergyDefines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
anyExclusions
)
pairEnergyDefines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
atomParamSize
%
2
=
=
0
&&
!
cu
.
getUseDoublePrecision
())
if
(
atomParamSize
%
2
!
=
0
&&
!
cu
.
getUseDoublePrecision
())
pairEnergyDefines
[
"NEED_PADDING"
]
=
"1"
;
pairEnergyDefines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
pairEnergyDefines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
...
...
@@ -3289,6 +3434,9 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairValueArgs
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
pairValueArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
pairValueArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
pairValueArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
pairValueArgs
.
push_back
(
&
maxTiles
);
pairValueArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
nb
.
getBlockBoundingBoxes
().
getDevicePointer
());
...
...
@@ -3324,6 +3472,9 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
pairEnergyArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
pairEnergyArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
pairEnergyArgs
.
push_back
(
&
maxTiles
);
pairEnergyArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getBlockBoundingBoxes
().
getDevicePointer
());
...
...
@@ -3389,8 +3540,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
pairValueArgs
[
4
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairEnergyArgs
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairValueArgs
[
1
1
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
pairEnergyArgs
[
1
2
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
pairValueArgs
[
1
4
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
pairEnergyArgs
[
1
5
]
=
&
nb
.
getInteractingAtoms
().
getDevicePointer
();
}
}
cu
.
executeKernel
(
pairValueKernel
,
&
pairValueArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
@@ -4006,6 +4157,9 @@ double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includ
donorArgs
.
push_back
(
&
acceptors
->
getDevicePointer
());
donorArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
donorArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
donorArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
donorArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
donorArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
if
(
globals
!=
NULL
)
donorArgs
.
push_back
(
&
globals
->
getDevicePointer
());
for
(
int
i
=
0
;
i
<
(
int
)
donorParams
->
getBuffers
().
size
();
i
++
)
{
...
...
@@ -4027,6 +4181,9 @@ double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includ
acceptorArgs
.
push_back
(
&
acceptors
->
getDevicePointer
());
acceptorArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
acceptorArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
acceptorArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
acceptorArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
acceptorArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
if
(
globals
!=
NULL
)
acceptorArgs
.
push_back
(
&
globals
->
getDevicePointer
());
for
(
int
i
=
0
;
i
<
(
int
)
donorParams
->
getBuffers
().
size
();
i
++
)
{
...
...
@@ -4629,7 +4786,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
const
vector
<
int
>&
atoms
=
iter
->
second
;
string
deltaName
=
atomNames
[
atoms
[
0
]]
+
atomNames
[
atoms
[
1
]];
if
(
computedDeltas
.
count
(
deltaName
)
==
0
)
{
compute
<<
"real4 delta"
<<
deltaName
<<
" = delta("
<<
posNames
[
atoms
[
0
]]
<<
", "
<<
posNames
[
atoms
[
1
]]
<<
", periodicBoxSize, invPeriodicBoxSize);
\n
"
;
compute
<<
"real4 delta"
<<
deltaName
<<
" = delta("
<<
posNames
[
atoms
[
0
]]
<<
", "
<<
posNames
[
atoms
[
1
]]
<<
", periodicBoxSize, invPeriodicBoxSize
, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ
);
\n
"
;
computedDeltas
.
insert
(
deltaName
);
}
compute
<<
"real r_"
<<
deltaName
<<
" = sqrt(delta"
<<
deltaName
<<
".w);
\n
"
;
...
...
@@ -4643,11 +4800,11 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
string
deltaName2
=
atomNames
[
atoms
[
1
]]
+
atomNames
[
atoms
[
2
]];
string
angleName
=
"angle_"
+
atomNames
[
atoms
[
0
]]
+
atomNames
[
atoms
[
1
]]
+
atomNames
[
atoms
[
2
]];
if
(
computedDeltas
.
count
(
deltaName1
)
==
0
)
{
compute
<<
"real4 delta"
<<
deltaName1
<<
" = delta("
<<
posNames
[
atoms
[
1
]]
<<
", "
<<
posNames
[
atoms
[
0
]]
<<
", periodicBoxSize, invPeriodicBoxSize);
\n
"
;
compute
<<
"real4 delta"
<<
deltaName1
<<
" = delta("
<<
posNames
[
atoms
[
1
]]
<<
", "
<<
posNames
[
atoms
[
0
]]
<<
", periodicBoxSize, invPeriodicBoxSize
, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ
);
\n
"
;
computedDeltas
.
insert
(
deltaName1
);
}
if
(
computedDeltas
.
count
(
deltaName2
)
==
0
)
{
compute
<<
"real4 delta"
<<
deltaName2
<<
" = delta("
<<
posNames
[
atoms
[
1
]]
<<
", "
<<
posNames
[
atoms
[
2
]]
<<
", periodicBoxSize, invPeriodicBoxSize);
\n
"
;
compute
<<
"real4 delta"
<<
deltaName2
<<
" = delta("
<<
posNames
[
atoms
[
1
]]
<<
", "
<<
posNames
[
atoms
[
2
]]
<<
", periodicBoxSize, invPeriodicBoxSize
, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ
);
\n
"
;
computedDeltas
.
insert
(
deltaName2
);
}
compute
<<
"real "
<<
angleName
<<
" = computeAngle(delta"
<<
deltaName1
<<
", delta"
<<
deltaName2
<<
");
\n
"
;
...
...
@@ -4664,15 +4821,15 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
string
crossName2
=
"cross_"
+
deltaName2
+
"_"
+
deltaName3
;
string
dihedralName
=
"dihedral_"
+
atomNames
[
atoms
[
0
]]
+
atomNames
[
atoms
[
1
]]
+
atomNames
[
atoms
[
2
]]
+
atomNames
[
atoms
[
3
]];
if
(
computedDeltas
.
count
(
deltaName1
)
==
0
)
{
compute
<<
"real4 delta"
<<
deltaName1
<<
" = delta("
<<
posNames
[
atoms
[
0
]]
<<
", "
<<
posNames
[
atoms
[
1
]]
<<
", periodicBoxSize, invPeriodicBoxSize);
\n
"
;
compute
<<
"real4 delta"
<<
deltaName1
<<
" = delta("
<<
posNames
[
atoms
[
0
]]
<<
", "
<<
posNames
[
atoms
[
1
]]
<<
", periodicBoxSize, invPeriodicBoxSize
, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ
);
\n
"
;
computedDeltas
.
insert
(
deltaName1
);
}
if
(
computedDeltas
.
count
(
deltaName2
)
==
0
)
{
compute
<<
"real4 delta"
<<
deltaName2
<<
" = delta("
<<
posNames
[
atoms
[
2
]]
<<
", "
<<
posNames
[
atoms
[
1
]]
<<
", periodicBoxSize, invPeriodicBoxSize);
\n
"
;
compute
<<
"real4 delta"
<<
deltaName2
<<
" = delta("
<<
posNames
[
atoms
[
2
]]
<<
", "
<<
posNames
[
atoms
[
1
]]
<<
", periodicBoxSize, invPeriodicBoxSize
, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ
);
\n
"
;
computedDeltas
.
insert
(
deltaName2
);
}
if
(
computedDeltas
.
count
(
deltaName3
)
==
0
)
{
compute
<<
"real4 delta"
<<
deltaName3
<<
" = delta("
<<
posNames
[
atoms
[
2
]]
<<
", "
<<
posNames
[
atoms
[
3
]]
<<
", periodicBoxSize, invPeriodicBoxSize);
\n
"
;
compute
<<
"real4 delta"
<<
deltaName3
<<
" = delta("
<<
posNames
[
atoms
[
2
]]
<<
", "
<<
posNames
[
atoms
[
3
]]
<<
", periodicBoxSize, invPeriodicBoxSize
, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ
);
\n
"
;
computedDeltas
.
insert
(
deltaName3
);
}
compute
<<
"real4 "
<<
crossName1
<<
" = computeCross(delta"
<<
deltaName1
<<
", delta"
<<
deltaName2
<<
");
\n
"
;
...
...
@@ -4842,7 +4999,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
if
(
!
centralParticleMode
)
{
for
(
int
i
=
1
;
i
<
particlesPerSet
;
i
++
)
{
for
(
int
j
=
i
+
1
;
j
<
particlesPerSet
;
j
++
)
verifyCutoff
<<
"includeInteraction &= (delta(pos"
<<
(
i
+
1
)
<<
", pos"
<<
(
j
+
1
)
<<
", periodicBoxSize, invPeriodicBoxSize).w < CUTOFF_SQUARED);
\n
"
;
verifyCutoff
<<
"includeInteraction &= (delta(pos"
<<
(
i
+
1
)
<<
", pos"
<<
(
j
+
1
)
<<
", periodicBoxSize, invPeriodicBoxSize
, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ
).w < CUTOFF_SQUARED);
\n
"
;
}
}
}
...
...
@@ -4920,6 +5077,9 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
forceArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
forceArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
forceArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
forceArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
if
(
nonbondedMethod
!=
NoCutoff
)
{
forceArgs
.
push_back
(
&
neighbors
->
getDevicePointer
());
forceArgs
.
push_back
(
&
neighborStartIndex
->
getDevicePointer
());
...
...
@@ -4945,6 +5105,9 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
blockBoundsArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
blockBoundsArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
blockBoundsArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
blockBoundsArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
blockBoundsArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
blockBoundsArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
...
...
@@ -4954,6 +5117,9 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
neighborsArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
neighborsArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
neighborsArgs
.
push_back
(
cu
.
getPeriodicBoxVecXPointer
());
neighborsArgs
.
push_back
(
cu
.
getPeriodicBoxVecYPointer
());
neighborsArgs
.
push_back
(
cu
.
getPeriodicBoxVecZPointer
());
neighborsArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
neighborsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
...
...
@@ -5908,7 +6074,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
uniformRandoms
=
CudaArray
::
create
<
float4
>
(
cu
,
maxUniformRandoms
,
"uniformRandoms"
);
randomSeed
=
CudaArray
::
create
<
int4
>
(
cu
,
cu
.
getNumThreadBlocks
()
*
CudaContext
::
ThreadBlockSize
,
"randomSeed"
);
vector
<
int4
>
seed
(
randomSeed
->
getSize
());
unsigned
int
r
=
integrator
.
getRandomNumberSeed
()
+
1
;
int
rseed
=
integrator
.
getRandomNumberSeed
();
// A random seed of 0 means use a unique one
if
(
rseed
==
0
)
rseed
=
osrngseed
();
unsigned
int
r
=
(
unsigned
int
)
(
rseed
+
1
);
for
(
int
i
=
0
;
i
<
randomSeed
->
getSize
();
i
++
)
{
seed
[
i
].
x
=
r
=
(
1664525
*
r
+
1013904223
)
&
0xFFFFFFFF
;
seed
[
i
].
y
=
r
=
(
1664525
*
r
+
1013904223
)
&
0xFFFFFFFF
;
...
...
@@ -6358,6 +6528,7 @@ void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, d
float
scalefY
=
(
float
)
scaleY
;
float
scalefZ
=
(
float
)
scaleZ
;
void
*
args
[]
=
{
&
scalefX
,
&
scalefY
,
&
scalefZ
,
&
numMolecules
,
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
moleculeAtoms
->
getDevicePointer
(),
&
moleculeStartIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
kernel
,
args
,
cu
.
getNumAtoms
());
for
(
int
i
=
0
;
i
<
(
int
)
cu
.
getPosCellOffsets
().
size
();
i
++
)
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
61d5cc0f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -62,10 +62,10 @@ private:
bool
useDouble
;
};
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
nonbondedForceGroup
(
0
)
{
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
nonbondedForceGroup
(
0
)
,
forceRebuildNeighborList
(
true
)
{
// Decide how many thread blocks to use.
string
errorMessage
=
"Error initializing nonbonded utilities"
;
...
...
@@ -264,14 +264,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortedBlockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
+
1
,
4
*
elementSize
,
"sortedBlockCenter"
);
sortedBlockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
+
1
,
4
*
elementSize
,
"sortedBlockBoundingBox"
);
oldPositions
=
new
CudaArray
(
context
,
numAtoms
,
4
*
elementSize
,
"oldPositions"
);
if
(
context
.
getUseDoublePrecision
())
{
vector
<
double4
>
oldPositionsVec
(
numAtoms
,
make_double4
(
1e30
,
1e30
,
1e30
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
else
{
vector
<
float4
>
oldPositionsVec
(
numAtoms
,
make_float4
(
1e30
f
,
1e30
f
,
1e30
f
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
rebuildNeighborList
=
CudaArray
::
create
<
int
>
(
context
,
1
,
"rebuildNeighborList"
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
...
...
@@ -304,6 +296,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs
.
push_back
(
&
numAtoms
);
findBlockBoundsArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findBlockBoundsArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
findBlockBoundsArgs
.
push_back
(
context
.
getPeriodicBoxVecXPointer
());
findBlockBoundsArgs
.
push_back
(
context
.
getPeriodicBoxVecYPointer
());
findBlockBoundsArgs
.
push_back
(
context
.
getPeriodicBoxVecZPointer
());
findBlockBoundsArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
...
...
@@ -319,9 +314,13 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
forceRebuildNeighborList
);
findInteractingBlocksKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlocksWithInteractions"
);
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxVecXPointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxVecYPointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxVecZPointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
...
...
@@ -342,6 +341,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
void
CudaNonbondedUtilities
::
prepareInteractions
()
{
if
(
!
useCutoff
)
return
;
if
(
numTiles
==
0
)
return
;
if
(
usePeriodic
)
{
double4
box
=
context
.
getPeriodicBoxSize
();
double
minAllowedSize
=
1.999999
*
cutoff
;
...
...
@@ -355,6 +356,7 @@ void CudaNonbondedUtilities::prepareInteractions() {
blockSorter
->
sort
(
*
sortedBlocks
);
context
.
executeKernel
(
sortBoxDataKernel
,
&
sortBoxDataArgs
[
0
],
context
.
getNumAtoms
());
context
.
executeKernel
(
findInteractingBlocksKernel
,
&
findInteractingBlocksArgs
[
0
],
context
.
getNumAtoms
(),
256
);
forceRebuildNeighborList
=
false
;
}
void
CudaNonbondedUtilities
::
computeInteractions
()
{
...
...
@@ -388,18 +390,11 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractingBlocksArgs
[
3
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractingBlocksArgs
[
6
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
14
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
4
]
=
&
interactingAtoms
->
getDevicePointer
();
if
(
context
.
getUseDoublePrecision
())
{
vector
<
double4
>
oldPositionsVec
(
numAtoms
,
make_double4
(
1e30
,
1e30
,
1e30
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
else
{
vector
<
float4
>
oldPositionsVec
(
numAtoms
,
make_float4
(
1e30
f
,
1e30
f
,
1e30
f
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
forceRebuildNeighborList
=
true
;
}
void
CudaNonbondedUtilities
::
setUsePadding
(
bool
padding
)
{
...
...
@@ -411,8 +406,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
startBlockIndex
=
(
int
)
(
startFraction
*
numAtomBlocks
);
numBlocks
=
(
int
)
(
endFraction
*
numAtomBlocks
)
-
startBlockIndex
;
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
startTileIndex
=
(
int
)
(
startFraction
*
totalTiles
);
;
startTileIndex
=
(
int
)
(
startFraction
*
totalTiles
);
numTiles
=
(
int
)
(
endFraction
*
totalTiles
)
-
startTileIndex
;
forceRebuildNeighborList
=
true
;
}
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
...
...
@@ -625,6 +621,9 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
forceArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
context
.
getPeriodicBoxVecXPointer
());
forceArgs
.
push_back
(
context
.
getPeriodicBoxVecYPointer
());
forceArgs
.
push_back
(
context
.
getPeriodicBoxVecZPointer
());
forceArgs
.
push_back
(
&
maxTiles
);
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
61d5cc0f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -63,23 +63,21 @@ if (result != CUDA_SUCCESS) { \
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
public:
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
)
{
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
,
int
&
numTiles
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
)
,
event
(
event
),
numTiles
(
numTiles
)
{
}
void
execute
()
{
// Copy coordinates over to this device and execute the kernel.
cu
.
setAsCurrent
();
if
(
cu
.
getContextIndex
()
>
0
)
{
if
(
cu
.
getPlatformData
().
peerAccessSupported
&&
cu
.
getPlatformData
().
contexts
.
size
()
<
3
)
{
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
int
numBytes
=
cu
.
getPosq
().
getSize
()
*
cu
.
getPosq
().
getElementSize
();
CHECK_RESULT
(
cuMemcpyPeerAsync
(
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getContext
(),
context0
.
getPosq
().
getDevicePointer
(),
context0
.
getContext
(),
numBytes
,
0
),
"Error copying positions"
);
}
else
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
if
(
!
cu
.
getPlatformData
().
peerAccessSupported
)
cu
.
getPosq
().
upload
(
pinnedMemory
,
false
);
}
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
())
cu
.
getNonbondedUtilities
().
getInteractionCount
().
download
(
&
numTiles
,
false
);
}
private:
ContextImpl
&
context
;
...
...
@@ -88,19 +86,27 @@ private:
bool
includeForce
,
includeEnergy
;
int
groups
;
void
*
pinnedMemory
;
CUevent
event
;
int
&
numTiles
;
};
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
public:
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
)
:
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int
&
numTiles
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
)
{
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
)
,
valid
(
valid
),
numTiles
(
numTiles
)
{
}
void
execute
()
{
// Execute the kernel, then download forces.
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
,
groups
,
valid
);
if
(
cu
.
getComputeForceCount
()
<
200
)
{
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT
(
cuCtxSynchronize
(),
"Error synchronizing CUDA context"
);
completionTime
=
getTime
();
}
if
(
includeForce
)
{
if
(
cu
.
getContextIndex
()
>
0
)
{
int
numAtoms
=
cu
.
getPaddedNumAtoms
();
...
...
@@ -108,16 +114,16 @@ public:
int
numBytes
=
numAtoms
*
3
*
sizeof
(
long
long
);
int
offset
=
(
cu
.
getContextIndex
()
-
1
)
*
numBytes
;
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
CHECK_RESULT
(
cuMemcpy
Peer
(
contextForces
.
getDevicePointer
()
+
offset
,
context0
.
getContext
(),
cu
.
getForce
().
getDevicePointer
(),
cu
.
getContext
(),
numBytes
),
"Error copying forces"
);
CHECK_RESULT
(
cuMemcpy
(
contextForces
.
getDevicePointer
()
+
offset
,
cu
.
getForce
().
getDevicePointer
(),
numBytes
),
"Error copying forces"
);
}
else
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
}
else
{
CHECK_RESULT
(
cuCtxSynchronize
(),
"Error synchronizing CUDA context"
);
}
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
()
&&
numTiles
>
cu
.
getNonbondedUtilities
().
getInteractingTiles
().
getSize
())
{
valid
=
false
;
cu
.
getNonbondedUtilities
().
updateNeighborListSize
();
}
completionTime
=
getTime
();
}
private:
ContextImpl
&
context
;
...
...
@@ -129,11 +135,13 @@ private:
long
long
&
completionTime
;
long
long
*
pinnedMemory
;
CudaArray
&
contextForces
;
bool
&
valid
;
int
&
numTiles
;
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
contextNonbondedFractions
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
contextNonbondedFractions
(
data
.
contexts
.
size
()),
tileCounts
(
NULL
),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
}
...
...
@@ -146,6 +154,10 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost
(
pinnedPositionBuffer
);
if
(
pinnedForceBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedForceBuffer
);
cuEventDestroy
(
event
);
cuStreamDestroy
(
peerCopyStream
);
if
(
tileCounts
!=
NULL
)
cuMemFreeHost
(
tileCounts
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
...
...
@@ -153,10 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
cu
.
setAsCurrent
();
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
parallel
);
sumKernel
=
cu
.
getKernel
(
module
,
"sumForces"
);
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
int
numContexts
=
data
.
contexts
.
size
();
for
(
int
i
=
0
;
i
<
numContexts
;
i
++
)
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
for
(
int
i
=
0
;
i
<
numContexts
;
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
numContexts
;
CHECK_RESULT
(
cuEventCreate
(
&
event
,
0
),
"Error creating event"
);
CHECK_RESULT
(
cuStreamCreate
(
&
peerCopyStream
,
CU_STREAM_NON_BLOCKING
),
"Error creating stream"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
tileCounts
,
numContexts
*
sizeof
(
int
),
0
),
"Error creating tile count buffer"
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -170,27 +186,37 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
// Copy coordinates over to each device and execute the kernel.
if
(
!
(
cu
.
getPlatformData
().
peerAccessSupported
&&
cu
.
getPlatformData
().
contexts
.
size
()
<
3
))
cu
.
getPosq
().
download
(
pinnedPositionBuffer
);
if
(
!
cu
.
getPlatformData
().
peerAccessSupported
)
{
cu
.
getPosq
().
download
(
pinnedPositionBuffer
,
false
);
cuEventRecord
(
event
,
cu
.
getCurrentStream
());
}
else
{
int
numBytes
=
cu
.
getPosq
().
getSize
()
*
cu
.
getPosq
().
getElementSize
();
cuEventRecord
(
event
,
cu
.
getCurrentStream
());
cuStreamWaitEvent
(
peerCopyStream
,
event
,
0
);
for
(
int
i
=
1
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
CHECK_RESULT
(
cuMemcpyAsync
(
data
.
contexts
[
i
]
->
getPosq
().
getDevicePointer
(),
cu
.
getPosq
().
getDevicePointer
(),
numBytes
,
peerCopyStream
),
"Error copying positions"
);
cuEventRecord
(
event
,
peerCopyStream
);
}
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
data
.
contextEnergy
[
i
]
=
0.0
;
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
));
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
,
tileCounts
[
i
]
));
}
}
double
CudaParallelCalcForcesAndEnergyKernel
::
finishComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
double
CudaParallelCalcForcesAndEnergyKernel
::
finishComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
bool
&
valid
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
));
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
,
valid
,
tileCounts
[
i
]
));
}
data
.
syncContexts
();
double
energy
=
0.0
;
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contextEnergy
.
size
();
i
++
)
energy
+=
data
.
contextEnergy
[
i
];
if
(
includeForce
)
{
if
(
includeForce
&&
valid
)
{
// Sum the forces from all devices.
CudaContext
&
cu
=
*
data
.
contexts
[
0
];
...
...
@@ -204,6 +230,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
if
(
cu
.
getComputeForceCount
()
<
200
)
{
int
firstIndex
=
0
,
lastIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
...
...
@@ -211,7 +238,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
}
double
fractionToTransfer
=
min
(
0.
0
01
,
contextNonbondedFractions
[
lastIndex
]);
double
fractionToTransfer
=
min
(
0.01
,
contextNonbondedFractions
[
lastIndex
]);
contextNonbondedFractions
[
firstIndex
]
+=
fractionToTransfer
;
contextNonbondedFractions
[
lastIndex
]
-=
fractionToTransfer
;
double
startFraction
=
0.0
;
...
...
@@ -223,6 +250,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
startFraction
=
endFraction
;
}
}
}
return
energy
;
}
...
...
@@ -508,6 +536,11 @@ double CudaParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, boo
return
0.0
;
}
void
CudaParallelCalcCMAPTorsionForceKernel
::
copyParametersToContext
(
ContextImpl
&
context
,
const
CMAPTorsionForce
&
force
)
{
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
copyParametersToContext
(
context
,
force
);
}
class
CudaParallelCalcCustomTorsionForceKernel
::
Task
:
public
CudaContext
::
WorkTask
{
public:
Task
(
ContextImpl
&
context
,
CudaCalcCustomTorsionForceKernel
&
kernel
,
bool
includeForce
,
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
61d5cc0f
...
...
@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
// Determine whether peer-to-peer copying is supported, and enable it if so.
peerAccessSupported
=
false
;
// Disable until I figure out why it usually makes things slower
// peerAccessSupported = true;
// for (int i = 1; i < contexts.size(); i++) {
// int canAccess;
// cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
// if (!canAccess) {
// peerAccessSupported = false;
// break;
// }
// }
if
(
peerAccessSupported
)
{
peerAccessSupported
=
true
;
for
(
int
i
=
1
;
i
<
contexts
.
size
();
i
++
)
{
contexts
[
0
]
->
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
contexts
[
i
]
->
getContext
(),
0
),
"Error enabling peer access"
);
contexts
[
i
]
->
setAsCurrent
();
CHECK_RESULT
(
cuCtxEnablePeerAccess
(
contexts
[
0
]
->
getContext
(),
0
),
"Error enabling peer access"
);
int
canAccess
;
cuDeviceCanAccessPeer
(
&
canAccess
,
contexts
[
i
]
->
getDevice
(),
contexts
[
0
]
->
getDevice
());
if
(
!
canAccess
)
{
peerAccessSupported
=
false
;
break
;
}
}
}
...
...
platforms/cuda/src/CudaSort.cpp
View file @
61d5cc0f
...
...
@@ -112,13 +112,12 @@ void CudaSort::sort(CudaArray& data) {
else
{
// Compute the range of data values.
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
()};
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
61d5cc0f
{
#if USE_EWALD
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
if
((
!
isExcluded
&&
r2
<
CUTOFF_SQUARED
)
||
needCorrection
)
{
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
unsigned
int
includeInteraction
=
((
!
isExcluded
&&
r2
<
CUTOFF_SQUARED
)
||
needCorrection
)
;
const
real
alphaR
=
EWALD_ALPHA
*
r
;
const
real
expAlphaRSqr
=
EXP
(
-
alphaR
*
alphaR
);
const
real
prefactor
=
138.935456
f
*
posq1
.
w
*
posq2
.
w
*
invR
;
...
...
@@ -44,16 +45,14 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
}
#endif
tempForce
+=
prefactor
*
(
erfcAlphaR
+
alphaR
*
expAlphaRSqr
*
TWO_OVER_SQRT_PI
);
tempEnergy
+=
ljEnergy
+
prefactor
*
erfcAlphaR
;
tempEnergy
+=
includeInteraction
?
ljEnergy
+
prefactor
*
erfcAlphaR
:
0
;
#else
tempForce
=
prefactor
*
(
erfcAlphaR
+
alphaR
*
expAlphaRSqr
*
TWO_OVER_SQRT_PI
);
tempEnergy
+=
prefactor
*
erfcAlphaR
;
tempEnergy
+=
includeInteraction
?
prefactor
*
erfcAlphaR
:
0
;
#endif
}
dEdR
+=
tempForce
*
invR
*
invR
;
}
dEdR
+=
includeInteraction
?
tempForce
*
invR
*
invR
:
0
;
#else
{
#ifdef USE_CUTOFF
unsigned
int
includeInteraction
=
(
!
isExcluded
&&
r2
<
CUTOFF_SQUARED
);
#else
...
...
@@ -91,5 +90,5 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
#endif
#endif
dEdR
+=
includeInteraction
?
tempForce
*
invR
*
invR
:
0
;
}
#endif
}
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
61d5cc0f
...
...
@@ -2,7 +2,7 @@
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
typedef
struct
{
real
4
pos
q
;
real
3
pos
;
real3
force
;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
...
...
@@ -17,7 +17,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
#else
unsigned
int
numTiles
#endif
...
...
@@ -40,7 +41,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
pos
q
1
=
posq
[
atom1
];
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
...
...
@@ -49,16 +50,14 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
pos
q
=
posq1
;
localData
[
localAtomIndex
].
pos
=
make_real3
(
pos1
.
x
,
pos1
.
y
,
pos1
.
z
)
;
LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -95,7 +94,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
...
...
@@ -105,12 +105,10 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -231,7 +229,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// Load atom data for this tile.
real4
pos
q
1
=
posq
[
atom1
];
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
...
...
@@ -241,7 +239,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
...
...
@@ -252,17 +251,13 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
].
pos
,
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
...
...
@@ -301,12 +296,10 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
platforms/cuda/src/kernels/customGBValueN2.cu
View file @
61d5cc0f
typedef
struct
{
real
4
pos
q
;
real
value
,
temp
;
real
3
pos
;
real
value
;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
float
padding
;
...
...
@@ -14,7 +14,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
const
ushort2
*
__restrict__
exclusionTiles
,
unsigned
long
long
*
__restrict__
global_value
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
#else
unsigned
int
numTiles
#endif
...
...
@@ -35,7 +36,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
const
unsigned
int
y
=
tileIndices
.
y
;
real
value
=
0
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
pos
q
1
=
posq
[
atom1
];
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
...
...
@@ -44,16 +45,14 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
pos
q
=
posq1
;
localData
[
localAtomIndex
].
pos
=
make_real3
(
pos1
.
x
,
pos1
.
y
,
pos1
.
z
)
;
LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -87,7 +86,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
#ifdef USE_EXCLUSIONS
...
...
@@ -96,12 +96,10 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -207,7 +205,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// Load atom data for this tile.
real4
pos
q
1
=
posq
[
atom1
];
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
...
...
@@ -217,7 +215,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
}
...
...
@@ -227,17 +226,13 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
].
pos
,
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
...
...
@@ -263,12 +258,10 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
localData
[
atom2
].
pos
q
;
real3
delta
=
make_real3
(
pos
q
2
.
x
-
pos
q
1
.
x
,
pos
q
2
.
y
-
pos
q
1
.
y
,
pos
q
2
.
z
-
pos
q
1
.
z
);
real
3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
platforms/cuda/src/kernels/customHbondForce.cu
View file @
61d5cc0f
...
...
@@ -25,12 +25,10 @@ inline __device__ real4 delta(real4 vec1, real4 vec2) {
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
deltaPeriodic
(
real4
vec1
,
real4
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
inline
__device__
real4
deltaPeriodic
(
real4
vec1
,
real4
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0.0
f
);
#ifdef USE_PERIODIC
result
.
x
-=
floor
(
result
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
result
.
y
-=
floor
(
result
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
result
.
z
-=
floor
(
result
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
result
)
#endif
result
.
w
=
result
.
x
*
result
.
x
+
result
.
y
*
result
.
y
+
result
.
z
*
result
.
z
;
return
result
;
...
...
@@ -69,7 +67,8 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
* Compute forces on donors.
*/
extern
"C"
__global__
void
computeDonorForces
(
unsigned
long
long
*
__restrict__
force
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
extern
__shared__
real4
posBuffer
[];
real
energy
=
0
;
...
...
@@ -116,7 +115,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
real4
a1
=
posBuffer
[
3
*
index
];
real4
a2
=
posBuffer
[
3
*
index
+
1
];
real4
a3
=
posBuffer
[
3
*
index
+
2
];
real4
deltaD1A1
=
deltaPeriodic
(
d1
,
a1
,
periodicBoxSize
,
invPeriodicBoxSize
);
real4
deltaD1A1
=
deltaPeriodic
(
d1
,
a1
,
periodicBoxSize
,
invPeriodicBoxSize
,
periodicBoxVecX
,
periodicBoxVecY
,
periodicBoxVecZ
);
#ifdef USE_CUTOFF
if
(
deltaD1A1
.
w
<
CUTOFF_SQUARED
)
{
#endif
...
...
@@ -157,7 +156,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
* Compute forces on acceptors.
*/
extern
"C"
__global__
void
computeAcceptorForces
(
unsigned
long
long
*
__restrict__
force
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
extern
__shared__
real4
posBuffer
[];
real3
f1
=
make_real3
(
0
);
...
...
@@ -203,7 +203,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
real4
d1
=
posBuffer
[
3
*
index
];
real4
d2
=
posBuffer
[
3
*
index
+
1
];
real4
d3
=
posBuffer
[
3
*
index
+
2
];
real4
deltaD1A1
=
deltaPeriodic
(
d1
,
a1
,
periodicBoxSize
,
invPeriodicBoxSize
);
real4
deltaD1A1
=
deltaPeriodic
(
d1
,
a1
,
periodicBoxSize
,
invPeriodicBoxSize
,
periodicBoxVecX
,
periodicBoxVecY
,
periodicBoxVecZ
);
#ifdef USE_CUTOFF
if
(
deltaD1A1
.
w
<
CUTOFF_SQUARED
)
{
#endif
...
...
platforms/cuda/src/kernels/customManyParticle.cu
View file @
61d5cc0f
...
...
@@ -18,12 +18,10 @@ inline __device__ real3 trim(real4 v) {
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
delta
(
real3
vec1
,
real3
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
inline
__device__
real4
delta
(
real3
vec1
,
real3
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0.0
f
);
#ifdef USE_PERIODIC
result
.
x
-=
floor
(
result
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
result
.
y
-=
floor
(
result
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
result
.
z
-=
floor
(
result
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
result
)
#endif
result
.
w
=
result
.
x
*
result
.
x
+
result
.
y
*
result
.
y
+
result
.
z
*
result
.
z
;
return
result
;
...
...
@@ -81,7 +79,7 @@ __constant__ float globals[NUM_GLOBALS];
*/
extern
"C"
__global__
void
computeInteraction
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
neighbors
,
const
int
*
__restrict__
neighborStartIndex
#endif
...
...
@@ -144,16 +142,14 @@ extern "C" __global__ void computeInteraction(
/**
* Find a bounding box for the atoms in each block.
*/
extern
"C"
__global__
void
findBlockBounds
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
numNeighborPairs
)
{
extern
"C"
__global__
void
findBlockBounds
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
numNeighborPairs
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
base
];
#ifdef USE_PERIODIC
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS
(
pos
)
#endif
real4
minPos
=
pos
;
real4
maxPos
=
pos
;
...
...
@@ -162,9 +158,7 @@ extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPerio
pos
=
posq
[
i
];
#ifdef USE_PERIODIC
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
pos
.
x
-=
floor
((
pos
.
x
-
center
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
center
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
center
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
,
center
)
#endif
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
...
...
@@ -182,8 +176,8 @@ extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPerio
/**
* Find a list of neighbors for each atom.
*/
extern
"C"
__global__
void
findNeighbors
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
int2
*
__restrict__
neighborPairs
,
extern
"C"
__global__
void
findNeighbors
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
int2
*
__restrict__
neighborPairs
,
int
*
__restrict__
numNeighborPairs
,
int
*
__restrict__
numNeighborsForAtom
,
int
maxNeighborPairs
#ifdef USE_EXCLUSIONS
,
int
*
__restrict__
exclusions
,
int
*
__restrict__
exclusionStartIndex
...
...
@@ -216,9 +210,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
real4
blockSize2
=
blockBoundingBox
[
block2
];
real4
blockDelta
=
blockCenter1
-
blockCenter2
;
#ifdef USE_PERIODIC
blockDelta
.
x
-=
floor
(
blockDelta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
blockDelta
.
y
-=
floor
(
blockDelta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
blockDelta
.
z
-=
floor
(
blockDelta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
#endif
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSize1
.
x
-
blockSize2
.
x
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSize1
.
y
-
blockSize2
.
y
);
...
...
@@ -247,7 +239,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
// Decide whether to include this atom pair in the neighbor list.
real4
atomDelta
=
delta
(
pos1
,
pos2
,
periodicBoxSize
,
invPeriodicBoxSize
);
real4
atomDelta
=
delta
(
pos1
,
pos2
,
periodicBoxSize
,
invPeriodicBoxSize
,
periodicBoxVecX
,
periodicBoxVecY
,
periodicBoxVecZ
);
#ifdef USE_CENTRAL_PARTICLE
bool
includeAtom
=
(
atom2
!=
atom1
&&
atom2
<
NUM_ATOMS
&&
atomDelta
.
w
<
CUTOFF_SQUARED
);
#else
...
...
platforms/cuda/src/kernels/customNonbondedGroups.cu
View file @
61d5cc0f
...
...
@@ -10,7 +10,7 @@ typedef struct {
extern
"C"
__global__
void
computeInteractionGroups
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
// global warpIndex
...
...
@@ -47,9 +47,7 @@ extern "C" __global__ void computeInteractionGroups(
posq2
=
make_real4
(
localData
[
localIndex
].
x
,
localData
[
localIndex
].
y
,
localData
[
localIndex
].
z
,
localData
[
localIndex
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
61d5cc0f
...
...
@@ -4,16 +4,15 @@
/**
* Find a bounding box for the atoms in each block.
*/
extern
"C"
__global__
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
rebuildNeighborList
,
real2
*
__restrict__
sortedBlocks
)
{
extern
"C"
__global__
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
rebuildNeighborList
,
real2
*
__restrict__
sortedBlocks
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
numAtoms
)
{
real4
pos
=
posq
[
base
];
#ifdef USE_PERIODIC
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS
(
pos
)
#endif
real4
minPos
=
pos
;
real4
maxPos
=
pos
;
...
...
@@ -22,9 +21,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
pos
=
posq
[
i
];
#ifdef USE_PERIODIC
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
pos
.
x
-=
floor
((
pos
.
x
-
center
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
center
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
center
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
,
center
)
#endif
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
...
...
@@ -46,7 +43,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
extern
"C"
__global__
void
sortBoxData
(
const
real2
*
__restrict__
sortedBlock
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
real4
*
__restrict__
sortedBlockCenter
,
real4
*
__restrict__
sortedBlockBoundingBox
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
oldPositions
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
rebuildNeighborList
)
{
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
rebuildNeighborList
,
bool
forceRebuild
)
{
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_BLOCKS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
index
=
(
int
)
sortedBlock
[
i
].
y
;
sortedBlockCenter
[
i
]
=
blockCenter
[
index
];
...
...
@@ -55,7 +52,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool
rebuild
=
f
alse
;
bool
rebuild
=
f
orceRebuild
;
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
delta
=
oldPositions
[
i
]
-
posq
[
i
];
if
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
>
0.25
f
*
PADDING
*
PADDING
)
...
...
@@ -116,11 +113,11 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
* [in] rebuildNeighbourList - whether or not to execute this kernel
*
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
if
(
rebuildNeighborList
[
0
]
==
0
)
return
;
// The neighbor list doesn't need to be rebuilt.
...
...
@@ -157,9 +154,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos1
.
x
-=
floor
((
pos1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos1
.
y
-=
floor
((
pos1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos1
.
z
-=
floor
((
pos1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
}
#endif
posBuffer
[
threadIdx
.
x
]
=
pos1
;
...
...
@@ -185,9 +180,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
real4
blockSizeY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockBoundingBox
[
block2
]
:
make_real4
(
0
));
real4
blockDelta
=
blockCenterX
-
blockCenterY
;
#ifdef USE_PERIODIC
blockDelta
.
x
-=
floor
(
blockDelta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
blockDelta
.
y
-=
floor
(
blockDelta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
blockDelta
.
z
-=
floor
(
blockDelta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
#endif
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSizeX
.
x
-
blockSizeY
.
x
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSizeX
.
y
-
blockSizeY
.
y
);
...
...
@@ -215,9 +208,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
real3
pos2
=
trimTo3
(
posq
[
atom2
]);
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
pos2
.
x
-=
floor
((
pos2
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos2
.
y
-=
floor
((
pos2
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos2
.
z
-=
floor
((
pos2
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos2
,
blockCenterX
)
}
#endif
bool
interacts
=
false
;
...
...
@@ -226,9 +217,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
if
(
!
singlePeriodicCopy
)
{
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
}
}
...
...
@@ -256,7 +245,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<
maxTiles
)
{
if
(
newTileStartIndex
+
tilesToStore
<
=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
interactingTiles
[
newTileStartIndex
+
indexInWarp
]
=
x
;
for
(
int
j
=
0
;
j
<
tilesToStore
;
j
++
)
...
...
@@ -275,7 +264,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<
maxTiles
)
{
if
(
newTileStartIndex
+
tilesToStore
<
=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
interactingTiles
[
newTileStartIndex
+
indexInWarp
]
=
x
;
for
(
int
j
=
0
;
j
<
tilesToStore
;
j
++
)
...
...
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
61d5cc0f
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
...
...
@@ -70,7 +69,8 @@ typedef struct {
extern
"C"
__global__
void
computeBornSum
(
unsigned
long
long
*
__restrict__
global_bornSum
,
const
real4
*
__restrict__
posq
,
const
float2
*
__restrict__
global_params
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
...
...
@@ -105,9 +105,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -152,9 +150,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -292,12 +288,8 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
],
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
...
...
@@ -343,9 +335,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
...
...
@@ -415,7 +405,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
...
...
@@ -452,9 +443,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -509,9 +498,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
@@ -657,12 +644,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
],
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
...
...
@@ -714,9 +697,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
...
...
platforms/cuda/src/kernels/langevin.cu
View file @
61d5cc0f
...
...
@@ -90,7 +90,7 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
while
(
index
<
numAtoms
)
{
mixed3
f
=
make_mixed3
(
scale
*
force
[
index
],
scale
*
force
[
index
+
paddedNumAtoms
],
scale
*
force
[
index
+
paddedNumAtoms
*
2
]);
mixed
invMass
=
velm
[
index
].
w
;
err
+=
(
f
.
x
*
f
.
x
+
f
.
y
*
f
.
y
+
f
.
z
*
f
.
z
)
*
invMass
;
err
+=
(
f
.
x
*
f
.
x
+
f
.
y
*
f
.
y
+
f
.
z
*
f
.
z
)
*
invMass
*
invMass
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
}
error
[
threadIdx
.
x
]
=
err
;
...
...
platforms/cuda/src/kernels/monteCarloBarostat.cu
View file @
61d5cc0f
...
...
@@ -2,7 +2,8 @@
* Scale the particle positions with each axis independent
*/
extern
"C"
__global__
void
scalePositions
(
float
scaleX
,
float
scaleY
,
float
scaleZ
,
int
numMolecules
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
*
__restrict__
posq
,
extern
"C"
__global__
void
scalePositions
(
float
scaleX
,
float
scaleY
,
float
scaleZ
,
int
numMolecules
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real4
*
__restrict__
posq
,
const
int
*
__restrict__
moleculeAtoms
,
const
int
*
__restrict__
moleculeStartIndex
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numMolecules
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
first
=
moleculeStartIndex
[
index
];
...
...
@@ -25,13 +26,9 @@ extern "C" __global__ void scalePositions(float scaleX, float scaleY, float scal
// Move it into the first periodic box.
int
xcell
=
(
int
)
floor
(
center
.
x
*
invPeriodicBoxSize
.
x
);
int
ycell
=
(
int
)
floor
(
center
.
y
*
invPeriodicBoxSize
.
y
);
int
zcell
=
(
int
)
floor
(
center
.
z
*
invPeriodicBoxSize
.
z
);
real3
delta
=
make_real3
(
xcell
*
periodicBoxSize
.
x
,
ycell
*
periodicBoxSize
.
y
,
zcell
*
periodicBoxSize
.
z
);
center
.
x
-=
delta
.
x
;
center
.
y
-=
delta
.
y
;
center
.
z
-=
delta
.
z
;
real3
oldCenter
=
center
;
APPLY_PERIODIC_TO_POS
(
center
)
real3
delta
=
make_real3
(
oldCenter
.
x
-
center
.
x
,
oldCenter
.
y
-
center
.
y
,
oldCenter
.
z
-
center
.
z
);
// Now scale the position of the molecule center.
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
61d5cc0f
...
...
@@ -103,8 +103,9 @@ extern "C" __global__ void computeNonbonded(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
tileflags
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
#endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
...
...
@@ -155,9 +156,7 @@ extern "C" __global__ void computeNonbonded(
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
...
...
@@ -223,14 +222,9 @@ extern "C" __global__ void computeNonbonded(
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
...
...
@@ -276,9 +270,6 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
...
...
@@ -404,22 +395,25 @@ extern "C" __global__ void computeNonbonded(
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
else
{
#ifdef ENABLE_SHUFFLE
shflPosq
=
make_real4
(
0
,
0
,
0
,
0
);
#else
localData
[
threadIdx
.
x
].
x
=
0
;
localData
[
threadIdx
.
x
].
y
=
0
;
localData
[
threadIdx
.
x
].
z
=
0
;
#endif
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
#ifdef ENABLE_SHUFFLE
shflPosq
.
x
-=
floor
((
shflPosq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
shflPosq
.
y
-=
floor
((
shflPosq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
shflPosq
.
z
-=
floor
((
shflPosq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
shflPosq
,
blockCenterX
)
#else
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
],
blockCenterX
)
#endif
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
...
...
@@ -431,7 +425,6 @@ extern "C" __global__ void computeNonbonded(
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
...
...
@@ -477,7 +470,6 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
}
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
...
...
@@ -498,14 +490,9 @@ extern "C" __global__ void computeNonbonded(
#endif
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
...
...
@@ -551,9 +538,6 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
...
...
Prev
1
…
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment