Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2d2f05ce
Commit
2d2f05ce
authored
Dec 08, 2015
by
Andy Simmonett
Browse files
Merge branch 'master' of github.com:pandegroup/openmm into genpt
parents
94823d84
4d32047c
Changes
235
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
263 additions
and
198 deletions
+263
-198
platforms/cpu/src/CpuNonbondedForce.cpp
platforms/cpu/src/CpuNonbondedForce.cpp
+40
-23
platforms/cpu/src/CpuSETTLE.cpp
platforms/cpu/src/CpuSETTLE.cpp
+20
-9
platforms/cpu/tests/TestCpuCompoundIntegrator.cpp
platforms/cpu/tests/TestCpuCompoundIntegrator.cpp
+5
-22
platforms/cuda/CMakeLists.txt
platforms/cuda/CMakeLists.txt
+2
-2
platforms/cuda/include/CudaContext.h
platforms/cuda/include/CudaContext.h
+6
-0
platforms/cuda/include/CudaIntegrationUtilities.h
platforms/cuda/include/CudaIntegrationUtilities.h
+9
-0
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+3
-3
platforms/cuda/include/CudaPlatform.h
platforms/cuda/include/CudaPlatform.h
+1
-1
platforms/cuda/src/CudaBondedUtilities.cpp
platforms/cuda/src/CudaBondedUtilities.cpp
+2
-2
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+96
-57
platforms/cuda/src/CudaExpressionUtilities.cpp
platforms/cuda/src/CudaExpressionUtilities.cpp
+8
-7
platforms/cuda/src/CudaIntegrationUtilities.cpp
platforms/cuda/src/CudaIntegrationUtilities.cpp
+27
-4
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+27
-54
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+4
-1
platforms/cuda/src/kernels/customCentroidBond.cu
platforms/cuda/src/kernels/customCentroidBond.cu
+2
-2
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+2
-2
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+2
-2
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+3
-3
platforms/cuda/src/kernels/customManyParticle.cu
platforms/cuda/src/kernels/customManyParticle.cu
+2
-2
platforms/cuda/src/kernels/customNonbondedGroups.cu
platforms/cuda/src/kernels/customNonbondedGroups.cu
+2
-2
No files found.
platforms/cpu/src/CpuNonbondedForce.cpp
View file @
2d2f05ce
...
...
@@ -28,7 +28,7 @@
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "gmx_atomic.h"
#include "
openmm/internal/
gmx_atomic.h"
#include <algorithm>
// In case we're using some primitive version of Visual Studio this will
...
...
@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
threads
.
execute
(
task
);
threads
.
waitForThreads
();
// Signal the threads to subtract the exclusions.
if
(
ewald
||
pme
)
{
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
waitForThreads
();
}
// Combine the energies from all the threads.
if
(
totalEnergy
!=
NULL
)
{
...
...
@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
for
(
int
i
=
threadIndex
;
i
<
numberOfAtoms
;
i
+=
numThreads
)
{
fvec4
posI
((
float
)
atomCoordinates
[
i
][
0
],
(
float
)
atomCoordinates
[
i
][
1
],
(
float
)
atomCoordinates
[
i
][
2
],
0.0
f
);
for
(
set
<
int
>::
const_iterator
iter
=
exclusions
[
i
].
begin
();
iter
!=
exclusions
[
i
].
end
();
++
iter
)
{
if
(
*
iter
>
i
)
{
int
j
=
*
iter
;
fvec4
deltaR
;
fvec4
posJ
((
float
)
atomCoordinates
[
j
][
0
],
(
float
)
atomCoordinates
[
j
][
1
],
(
float
)
atomCoordinates
[
j
][
2
],
0.0
f
);
float
r2
;
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
false
,
boxSize
,
invBoxSize
);
float
r
=
sqrtf
(
r2
);
float
inverseR
=
1
/
r
;
float
chargeProd
=
ONE_4PI_EPS0
*
posq
[
4
*
i
+
3
]
*
posq
[
4
*
j
+
3
];
float
alphaR
=
alphaEwald
*
r
;
float
erfAlphaR
=
erf
(
alphaR
);
if
(
erfAlphaR
>
1e-6
f
)
{
float
dEdR
=
(
float
)
(
chargeProd
*
inverseR
*
inverseR
*
inverseR
);
dEdR
=
(
float
)
(
dEdR
*
(
erfAlphaR
-
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
)));
fvec4
result
=
deltaR
*
dEdR
;
(
fvec4
(
forces
+
4
*
i
)
-
result
).
store
(
forces
+
4
*
i
);
(
fvec4
(
forces
+
4
*
j
)
+
result
).
store
(
forces
+
4
*
j
);
if
(
includeEnergy
)
threadEnergy
[
threadIndex
]
-=
chargeProd
*
inverseR
*
erfAlphaR
;
threads
.
syncThreads
();
const
int
groupSize
=
max
(
1
,
numberOfAtoms
/
(
10
*
numThreads
));
while
(
true
)
{
int
start
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
groupSize
);
if
(
start
>=
numberOfAtoms
)
break
;
int
end
=
min
(
start
+
groupSize
,
numberOfAtoms
);
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
fvec4
posI
((
float
)
atomCoordinates
[
i
][
0
],
(
float
)
atomCoordinates
[
i
][
1
],
(
float
)
atomCoordinates
[
i
][
2
],
0.0
f
);
float
scaledChargeI
=
(
float
)
(
ONE_4PI_EPS0
*
posq
[
4
*
i
+
3
]);
for
(
set
<
int
>::
const_iterator
iter
=
exclusions
[
i
].
begin
();
iter
!=
exclusions
[
i
].
end
();
++
iter
)
{
if
(
*
iter
>
i
)
{
int
j
=
*
iter
;
fvec4
deltaR
;
fvec4
posJ
((
float
)
atomCoordinates
[
j
][
0
],
(
float
)
atomCoordinates
[
j
][
1
],
(
float
)
atomCoordinates
[
j
][
2
],
0.0
f
);
float
r2
;
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
false
,
boxSize
,
invBoxSize
);
float
r
=
sqrtf
(
r2
);
float
alphaR
=
alphaEwald
*
r
;
float
erfAlphaR
=
erf
(
alphaR
);
if
(
erfAlphaR
>
1e-6
f
)
{
float
inverseR
=
1
/
r
;
float
chargeProdOverR
=
scaledChargeI
*
posq
[
4
*
j
+
3
]
*
inverseR
;
float
dEdR
=
chargeProdOverR
*
inverseR
*
inverseR
;
dEdR
=
dEdR
*
(
erfAlphaR
-
(
float
)
TWO_OVER_SQRT_PI
*
alphaR
*
(
float
)
exp
(
-
alphaR
*
alphaR
));
fvec4
result
=
deltaR
*
dEdR
;
(
fvec4
(
forces
+
4
*
i
)
-
result
).
store
(
forces
+
4
*
i
);
(
fvec4
(
forces
+
4
*
j
)
+
result
).
store
(
forces
+
4
*
j
);
if
(
includeEnergy
)
threadEnergy
[
threadIndex
]
-=
chargeProdOverR
*
erfAlphaR
;
}
}
}
}
...
...
platforms/cpu/src/CpuSETTLE.cpp
View file @
2d2f05ce
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors.
*
* Portions copyright (c) 2013
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */
#include "CpuSETTLE.h"
#include "openmm/internal/gmx_atomic.h"
using
namespace
OpenMM
;
using
namespace
std
;
...
...
@@ -39,10 +40,14 @@ public:
ApplyToPositionsTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
atomCoordinatesP
(
atomCoordinatesP
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
gmx_atomic_set
(
&
atomicCounter
,
0
);
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
if
(
threadIndex
<
threadSettle
.
size
())
{
threadSettle
[
threadIndex
]
->
apply
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
);
while
(
true
)
{
int
index
=
gmx_atomic_fetch_add
(
&
atomicCounter
,
1
);
if
(
index
>=
threadSettle
.
size
())
break
;
threadSettle
[
index
]
->
apply
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
);
}
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
...
...
@@ -50,6 +55,7 @@ public:
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
gmx_atomic_t
atomicCounter
;
};
class
CpuSETTLE
::
ApplyToVelocitiesTask
:
public
ThreadPool
::
Task
{
...
...
@@ -57,10 +63,14 @@ public:
ApplyToVelocitiesTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
velocities
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
velocities
(
velocities
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
gmx_atomic_set
(
&
atomicCounter
,
0
);
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
if
(
threadIndex
<
threadSettle
.
size
())
{
threadSettle
[
threadIndex
]
->
applyToVelocities
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
);
while
(
true
)
{
int
index
=
gmx_atomic_fetch_add
(
&
atomicCounter
,
1
);
if
(
index
>=
threadSettle
.
size
())
break
;
threadSettle
[
index
]
->
applyToVelocities
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
);
}
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
...
...
@@ -68,17 +78,18 @@ public:
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
gmx_atomic_t
atomicCounter
;
};
CpuSETTLE
::
CpuSETTLE
(
const
System
&
system
,
const
ReferenceSETTLEAlgorithm
&
settle
,
ThreadPool
&
threads
)
:
threads
(
threads
)
{
int
num
Thread
s
=
threads
.
getNumThreads
();
int
num
Block
s
=
10
*
threads
.
getNumThreads
();
int
numClusters
=
settle
.
getNumClusters
();
vector
<
RealOpenMM
>
mass
(
system
.
getNumParticles
());
for
(
int
i
=
0
;
i
<
system
.
getNumParticles
();
i
++
)
mass
[
i
]
=
system
.
getParticleMass
(
i
);
for
(
int
i
=
0
;
i
<
num
Thread
s
;
i
++
)
{
int
start
=
i
*
numClusters
/
num
Thread
s
;
int
end
=
(
i
+
1
)
*
numClusters
/
num
Thread
s
;
for
(
int
i
=
0
;
i
<
num
Block
s
;
i
++
)
{
int
start
=
i
*
numClusters
/
num
Block
s
;
int
end
=
(
i
+
1
)
*
numClusters
/
num
Block
s
;
if
(
start
!=
end
)
{
int
numThreadClusters
=
end
-
start
;
vector
<
int
>
atom1
(
numThreadClusters
),
atom2
(
numThreadClusters
),
atom3
(
numThreadClusters
);
...
...
serialization/include/openmm/serialization/GBVIForceProxy.h
→
platforms/cpu/tests/TestCpuCompoundIntegrator.cpp
View file @
2d2f05ce
#ifndef OPENMM_GBVIFORCE_PROXY_H_
#define OPENMM_GBVIFORCE_PROXY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
...
...
@@ -9,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 201
0
Stanford University and the Authors. *
* Portions copyright (c) 201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -32,22 +29,8 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include "openmm/serialization/SerializationProxy.h"
namespace
OpenMM
{
/**
* This is a proxy for serializing GBVIForce objects.
*/
class
OPENMM_EXPORT
GBVIForceProxy
:
public
SerializationProxy
{
public:
GBVIForceProxy
();
void
serialize
(
const
void
*
object
,
SerializationNode
&
node
)
const
;
void
*
deserialize
(
const
SerializationNode
&
node
)
const
;
};
}
// namespace OpenMM
#include "CpuTests.h"
#include "TestCompoundIntegrator.h"
#endif
/*OPENMM_GBVIFORCE_PROXY_H_*/
/**
 * Platform-specific test entry point. It performs no work in the visible code.
 * NOTE(review): presumably invoked by the shared driver in
 * TestCompoundIntegrator.h -- confirm against that header.
 */
void runPlatformTests() {}
platforms/cuda/CMakeLists.txt
View file @
2d2f05ce
...
...
@@ -13,9 +13,9 @@
#----------------------------------------------------
set
(
OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL
"Whether to build CUDA test cases"
)
if
(
OPENMM_BUILD_CUDA_TESTS
)
if
(
BUILD_TESTING AND
OPENMM_BUILD_CUDA_TESTS
)
SUBDIRS
(
tests
)
endif
(
OPENMM_BUILD_CUDA_TESTS
)
endif
(
BUILD_TESTING AND
OPENMM_BUILD_CUDA_TESTS
)
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
...
...
platforms/cuda/include/CudaContext.h
View file @
2d2f05ce
...
...
@@ -30,6 +30,7 @@
#include <map>
#include <queue>
#include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code.
...
...
@@ -538,6 +539,11 @@ public:
*/
void
invalidateMolecules
();
private:
/**
* Compute a sorted list of device indices in decreasing order of desirability
*/
std
::
vector
<
int
>
getDevicePrecedence
();
struct
Molecule
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
...
...
platforms/cuda/include/CudaIntegrationUtilities.h
View file @
2d2f05ce
...
...
@@ -62,6 +62,14 @@ public:
CudaArray
&
getStepSize
()
{
return
*
stepSize
;
}
/**
* Set the size to use for the next step.
*/
void
setNextStepSize
(
double
size
);
/**
* Get the size that was used for the last step.
*/
double
getLastStepSize
();
/**
* Apply constraints to the atom positions.
*
...
...
@@ -154,6 +162,7 @@ private:
CudaArray
*
vsiteLocalCoordsParams
;
int
randomPos
;
int
lastSeed
,
numVsites
;
double2
lastStepSize
;
struct
ShakeCluster
;
struct
ConstraintOrderer
;
};
...
...
platforms/cuda/include/CudaKernels.h
View file @
2d2f05ce
...
...
@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
),
fft
(
NULL
),
pmeio
(
NULL
)
{
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
pmeEnergyBuffer
(
NULL
),
sort
(
NULL
),
fft
(
NULL
),
pmeio
(
NULL
)
{
}
~
CudaCalcNonbondedForceKernel
();
/**
...
...
@@ -657,6 +657,7 @@ private:
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
pmeEnergyBuffer
;
CudaSort
*
sort
;
Kernel
cpuPme
;
PmeIO
*
pmeio
;
...
...
@@ -1123,7 +1124,6 @@ public:
double
computeKineticEnergy
(
ContextImpl
&
context
,
const
VerletIntegrator
&
integrator
);
private:
CudaContext
&
cu
;
double
prevStepSize
;
CUfunction
kernel1
,
kernel2
;
};
...
...
@@ -1354,7 +1354,7 @@ private:
void
recordChangedParameters
(
ContextImpl
&
context
);
bool
evaluateCondition
(
int
step
);
CudaContext
&
cu
;
double
prevStepSize
,
energy
;
double
energy
;
float
energyFloat
;
int
numGlobalVariables
;
bool
hasInitializedKernels
,
deviceValuesAreCurrent
,
deviceGlobalsAreCurrent
,
modifiesParameters
,
keNeedsForce
,
hasAnyConstraints
;
...
...
platforms/cuda/include/CudaPlatform.h
View file @
2d2f05ce
...
...
@@ -121,7 +121,7 @@ public:
ContextImpl
*
context
;
std
::
vector
<
CudaContext
*>
contexts
;
std
::
vector
<
double
>
contextEnergy
;
bool
removeCM
,
peerAccessSupported
,
useCpuPme
;
bool
hasInitializedContexts
,
removeCM
,
peerAccessSupported
,
useCpuPme
;
int
cmMotionFrequency
;
int
stepCount
,
computeForceCount
;
double
time
;
...
...
platforms/cuda/src/CudaBondedUtilities.cpp
View file @
2d2f05ce
...
...
@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
s
<<
CudaKernelSources
::
vectorOps
;
for
(
int
i
=
0
;
i
<
(
int
)
prefixCode
.
size
();
i
++
)
s
<<
prefixCode
[
i
];
s
<<
"extern
\"
C
\"
__global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer,
real
* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"
;
s
<<
"extern
\"
C
\"
__global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer,
mixed
* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"
;
for
(
int
force
=
0
;
force
<
numForces
;
force
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
atomIndices
[
force
].
size
();
i
++
)
{
int
indexWidth
=
atomIndices
[
force
][
i
]
->
getElementSize
()
/
4
;
...
...
@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
for
(
int
i
=
0
;
i
<
(
int
)
arguments
.
size
();
i
++
)
s
<<
", "
<<
argTypes
[
i
]
<<
"* customArg"
<<
(
i
+
1
);
s
<<
") {
\n
"
;
s
<<
"
real
energy = 0;
\n
"
;
s
<<
"
mixed
energy = 0;
\n
"
;
for
(
int
force
=
0
;
force
<
numForces
;
force
++
)
s
<<
createForceSource
(
force
,
forceAtoms
[
force
].
size
(),
forceAtoms
[
force
][
0
].
size
(),
forceGroup
[
force
],
forceSource
[
force
]);
s
<<
"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
\n
"
;
...
...
platforms/cuda/src/CudaContext.cpp
View file @
2d2f05ce
...
...
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT
(
cuDeviceGetCount
(
&
numDevices
));
if
(
deviceIndex
<
-
1
||
deviceIndex
>=
numDevices
)
throw
OpenMMException
(
"Illegal value for CudaDeviceIndex: "
+
intToString
(
deviceIndex
));
vector
<
int
>
devicePrecedence
;
if
(
deviceIndex
==
-
1
)
{
// Try to figure out which device is the fastest.
int
bestSpeed
=
-
1
;
int
bestCompute
=
-
1
;
for
(
int
i
=
0
;
i
<
numDevices
;
i
++
)
{
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
i
));
int
major
,
minor
,
clock
,
multiprocessors
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
if
(
major
==
1
&&
minor
<
2
)
continue
;
// 1.0 and 1.1 are not supported
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
clock
,
CU_DEVICE_ATTRIBUTE_CLOCK_RATE
,
device
));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
));
int
speed
=
clock
*
multiprocessors
;
if
(
major
>
bestCompute
||
(
major
==
bestCompute
&&
speed
>
bestSpeed
))
{
deviceIndex
=
i
;
bestSpeed
=
speed
;
bestCompute
=
major
;
}
devicePrecedence
=
getDevicePrecedence
();
}
else
{
devicePrecedence
.
push_back
(
deviceIndex
);
}
this
->
deviceIndex
=
-
1
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
devicePrecedence
.
size
());
i
++
)
{
int
trialDeviceIndex
=
devicePrecedence
[
i
];
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
trialDeviceIndex
));
defaultOptimizationOptions
=
"--use_fast_math"
;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
if
(
cuCtxCreate
(
&
context
,
flags
,
device
)
==
CUDA_SUCCESS
)
{
this
->
deviceIndex
=
trialDeviceIndex
;
break
;
}
}
if
(
deviceIndex
==
-
1
)
throw
OpenMMException
(
"No compatible CUDA device is available"
);
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
deviceIndex
));
this
->
deviceIndex
=
deviceIndex
;
if
(
this
->
deviceIndex
==
-
1
)
if
(
deviceIndex
!=
-
1
)
throw
OpenMMException
(
"The requested CUDA device could not be loaded"
);
else
throw
OpenMMException
(
"No compatible CUDA device is available"
);
int
major
,
minor
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// CUDA 7.0 is released.
if
(
major
==
5
)
minor
=
0
;
#if __CUDA_API_VERSION < 7000
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// its compute capability as 5.2, but the compiler doesn't support
// anything beyond 5.0.
if
(
major
==
5
)
minor
=
0
;
#endif
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
computeCapability
=
major
+
0.1
*
minor
;
if
((
useDoublePrecision
||
useMixedPrecision
)
&&
computeCapability
<
1.3
)
throw
OpenMMException
(
"This device does not support double precision"
);
defaultOptimizationOptions
=
"--use_fast_math"
;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
if
(
contextIndex
>
0
)
{
...
...
@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines
[
"ATAN"
]
=
useDoublePrecision
?
"atan"
:
"atanf"
;
compilationDefines
[
"ERF"
]
=
useDoublePrecision
?
"erf"
:
"erff"
;
compilationDefines
[
"ERFC"
]
=
useDoublePrecision
?
"erfc"
:
"erfcf"
;
// Set defines for applying periodic boundary conditions.
Vec3
boxVectors
[
3
];
system
.
getDefaultPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
boxIsTriclinic
=
(
boxVectors
[
0
][
1
]
!=
0.0
||
boxVectors
[
0
][
2
]
!=
0.0
||
...
...
@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
}
// Create the work thread used for parallelization when running on multiple devices.
thread
=
new
WorkThread
();
// Create utilities objects.
bonded
=
new
CudaBondedUtilities
(
*
this
);
nonbonded
=
new
CudaNonbondedUtilities
(
*
this
);
integration
=
new
CudaIntegrationUtilities
(
*
this
,
system
);
...
...
@@ -368,7 +367,7 @@ void CudaContext::initialize() {
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
}
else
if
(
useMixedPrecision
)
{
energyBuffer
=
CudaArray
::
create
<
float
>
(
*
this
,
numEnergyBuffers
,
"energyBuffer"
);
energyBuffer
=
CudaArray
::
create
<
double
>
(
*
this
,
numEnergyBuffers
,
"energyBuffer"
);
int
pinnedBufferSize
=
max
(
paddedNumAtoms
*
4
,
numEnergyBuffers
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
}
...
...
@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if
(
index
!=
result
.
npos
)
{
if
((
index
==
0
||
symbolChars
.
find
(
result
[
index
-
1
])
==
symbolChars
.
end
())
&&
(
index
==
result
.
size
()
-
size
||
symbolChars
.
find
(
result
[
index
+
size
])
==
symbolChars
.
end
()))
{
// We have found a complete symbol, not part of a longer symbol.
result
.
replace
(
index
,
size
,
iter
->
second
);
index
+=
iter
->
second
.
size
();
}
...
...
@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return
-
1
;
}
WaitForSingleObject
(
pi
.
hProcess
,
INFINITE
);
DWORD
exitCode
=
-
1
;
DWORD
exitCode
=
-
1
;
if
(
!
GetExitCodeProcess
(
pi
.
hProcess
,
&
exitCode
))
{
throw
(
OpenMMException
(
"Could not get nvcc.exe's exit code
\n
"
));
}
else
{
if
(
exitCode
==
0
)
if
(
exitCode
==
0
)
return
0
;
else
return
-
1
;
...
...
@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if
(
!
defines
.
empty
())
src
<<
endl
;
src
<<
source
<<
endl
;
// See whether we already have PTX for this kernel cached.
CSHA1
sha1
;
sha1
.
Update
((
const
UINT_8
*
)
src
.
str
().
c_str
(),
src
.
str
().
size
());
sha1
.
Final
();
...
...
@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule
module
;
if
(
cuModuleLoad
(
&
module
,
cacheFile
.
str
().
c_str
())
==
CUDA_SUCCESS
)
return
module
;
// Select names for the various temporary files.
stringstream
tempFileName
;
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32
...
...
@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int
res
=
0
;
// If the runtime compiler plugin is available, use it.
if
(
hasCompilerKernel
)
{
string
ptx
=
compilerKernel
.
getAs
<
CudaCompilerKernel
>
().
createModule
(
src
.
str
(),
"-arch=compute_"
+
gpuArchitecture
+
" "
+
options
,
*
this
);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool
wroteCache
=
false
;
try
{
ofstream
out
(
outputFile
.
c_str
());
...
...
@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
if
(
!
wroteCache
)
{
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2
(
cuModuleLoadDataEx
(
&
module
,
&
ptx
[
0
],
0
,
NULL
,
NULL
),
"Error loading CUDA module"
);
return
module
;
}
...
...
@@ -883,7 +882,7 @@ private:
void
CudaContext
::
findMoleculeGroups
()
{
// The first time this is called, we need to identify all the molecules in the system.
if
(
moleculeGroups
.
size
()
==
0
)
{
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
...
...
@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if
(
!
forces
[
k
]
->
areParticlesIdentical
(
mol
.
atoms
[
i
],
mol2
.
atoms
[
i
]))
identical
=
false
;
}
// See if the constraints are identical.
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
{
...
...
@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
}
if
(
valid
)
return
;
// The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector
<
int4
>
newCellOffsets
(
numAtoms
);
if
(
useDoublePrecision
)
{
vector
<
double4
>
oldPosq
(
paddedNumAtoms
);
...
...
@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() {
molPos
[
i
].
x
*=
invNumAtoms
;
molPos
[
i
].
y
*=
invNumAtoms
;
molPos
[
i
].
z
*=
invNumAtoms
;
if
(
molPos
[
i
].
x
!=
molPos
[
i
].
x
)
throw
OpenMMException
(
"Particle coordinate is nan"
);
}
if
(
nonbonded
->
getUsePeriodic
())
{
// Move each molecule position into the same box.
...
...
@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait
(
&
queueEmptyCondition
,
&
queueLock
);
pthread_mutex_unlock
(
&
queueLock
);
}
/**
 * Rank the CUDA devices on this machine from most to least desirable.
 *
 * Devices reporting compute capability 1.0 or 1.1 are left out, as is any
 * device below 1.3 when double or mixed precision is in use.
 *
 * @return the indices of the remaining devices, ordered by compute capability
 *         major version (higher first), then by clock rate times
 *         multiprocessor count (higher first), then by device index
 *         (lower first)
 */
vector<int> CudaContext::getDevicePrecedence() {
    // NOTE(review): CHECK_RESULT is defined outside this function; this local
    // is presumably the message prefix it reports on failure, so its name
    // must stay exactly as it is -- confirm against the macro definition.
    string errorMessage = "Error initializing Context";

    int numDevices;
    CHECK_RESULT(cuDeviceGetCount(&numDevices));

    // Each candidate is stored as ((major, speed), -index). Negating the index
    // lets one descending sort prefer the lower index when everything else ties.
    vector<pair<pair<int, int>, int> > candidates;
    for (int index = 0; index < numDevices; index++) {
        CUdevice thisDevice;
        CHECK_RESULT(cuDeviceGet(&thisDevice, index));

        int major, minor;
        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
        bool tooOld = (major == 1 && minor < 2);
        bool needsDouble = (useDoublePrecision || useMixedPrecision);
        if (tooOld || (needsDouble && (major+0.1*minor < 1.3)))
            continue;

        int clockRate, multiprocessors;
        CHECK_RESULT(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
        int speed = clockRate*multiprocessors;
        candidates.push_back(std::make_pair(std::make_pair(major, speed), -index));
    }

    // Sorting through reverse iterators leaves the vector in descending order:
    // best compute capability first, then highest speed, then lowest index.
    std::sort(candidates.rbegin(), candidates.rend());

    vector<int> precedence;
    precedence.reserve(candidates.size());
    for (vector<pair<pair<int, int>, int> >::const_iterator entry = candidates.begin(); entry != candidates.end(); ++entry)
        precedence.push_back(-entry->second);  // undo the negation to recover the device index
    return precedence;
}
platforms/cuda/src/CudaExpressionUtilities.cpp
View file @
2d2f05ce
...
...
@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
}
out
<<
");
\n
"
;
out
<<
"APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)
\n
"
;
out
<<
tempType
<<
" periodicDistance_rinv = RSQRT(periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z);
\n
"
;
out
<<
tempType
<<
" periodicDistance_r2 = periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z;
\n
"
;
out
<<
tempType
<<
" periodicDistance_rinv = RSQRT(periodicDistance_r2);
\n
"
;
for
(
int
j
=
0
;
j
<
nodes
.
size
();
j
++
)
{
const
vector
<
int
>&
derivOrder
=
dynamic_cast
<
const
Operation
::
Custom
*>
(
&
nodes
[
j
]
->
getOperation
())
->
getDerivOrder
();
int
argIndex
=
-
1
;
...
...
@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
if
(
argIndex
==
-
1
)
out
<<
nodeNames
[
j
]
<<
" = RECIP(periodicDistance_rinv);
\n
"
;
else
if
(
argIndex
==
0
)
out
<<
nodeNames
[
j
]
<<
" = periodicDistance_delta.x*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
periodicDistance_delta.x*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
1
)
out
<<
nodeNames
[
j
]
<<
" = periodicDistance_delta.y*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
periodicDistance_delta.y*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
2
)
out
<<
nodeNames
[
j
]
<<
" = periodicDistance_delta.z*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
periodicDistance_delta.z*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
3
)
out
<<
nodeNames
[
j
]
<<
" = -periodicDistance_delta.x*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
-periodicDistance_delta.x*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
4
)
out
<<
nodeNames
[
j
]
<<
" = -periodicDistance_delta.y*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
-periodicDistance_delta.y*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
5
)
out
<<
nodeNames
[
j
]
<<
" = -periodicDistance_delta.z*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
-periodicDistance_delta.z*periodicDistance_rinv
: 0)
;
\n
"
;
}
}
else
{
...
...
platforms/cuda/src/CudaIntegrationUtilities.cpp
View file @
2d2f05ce
...
...
@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
),
vsiteLocalCoordsAtoms
(
NULL
),
vsiteLocalCoordsParams
(
NULL
)
{
// Create workspace arrays.
lastStepSize
=
make_double2
(
0.0
,
0.0
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
{
posDelta
=
CudaArray
::
create
<
double4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
vector
<
double4
>
deltas
(
posDelta
->
getSize
(),
make_double4
(
0.0
,
0.0
,
0.0
,
0.0
));
posDelta
->
upload
(
deltas
);
stepSize
=
CudaArray
::
create
<
double2
>
(
context
,
1
,
"stepSize"
);
vector
<
double2
>
step
(
1
,
make_double2
(
0.0
,
0.0
));
stepSize
->
upload
(
step
);
stepSize
->
upload
(
&
lastStepSize
);
}
else
{
posDelta
=
CudaArray
::
create
<
float4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
vector
<
float4
>
deltas
(
posDelta
->
getSize
(),
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
));
posDelta
->
upload
(
deltas
);
stepSize
=
CudaArray
::
create
<
float2
>
(
context
,
1
,
"stepSize"
);
vector
<
float2
>
step
(
1
,
make_float2
(
0.0
f
,
0.0
f
)
)
;
stepSize
->
upload
(
step
);
float2
lastStepSizeFloat
=
make_float2
(
0.0
f
,
0.0
f
);
stepSize
->
upload
(
&
lastStepSizeFloat
);
}
// Record the set of constraints and how many constraints each atom is involved in.
...
...
@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete
vsiteLocalCoordsParams
;
}
void
CudaIntegrationUtilities
::
setNextStepSize
(
double
size
)
{
if
(
size
!=
lastStepSize
.
x
||
size
!=
lastStepSize
.
y
)
{
lastStepSize
=
make_double2
(
size
,
size
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
stepSize
->
upload
(
&
lastStepSize
);
else
{
float2
lastStepSizeFloat
=
make_float2
((
float
)
size
,
(
float
)
size
);
stepSize
->
upload
(
&
lastStepSizeFloat
);
}
}
}
double
CudaIntegrationUtilities
::
getLastStepSize
()
{
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
stepSize
->
download
(
&
lastStepSize
);
else
{
float2
lastStepSizeFloat
;
stepSize
->
download
(
&
lastStepSizeFloat
);
lastStepSize
=
make_double2
(
lastStepSizeFloat
.
x
,
lastStepSizeFloat
.
y
);
}
return
lastStepSize
.
y
;
}
/**
 * Apply constraints with the given tolerance by forwarding to the two-argument
 * overload with false as its first argument.
 * NOTE(review): false presumably selects positions rather than velocities --
 * confirm against the overload's declaration.
 *
 * @param tol  the tolerance passed through to the two-argument overload
 */
void CudaIntegrationUtilities::applyConstraints(double tol) {
    applyConstraints(false, tol);
}
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
2d2f05ce
...
...
@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu
.
getIntegrationUtilities
().
distributeForcesFromVirtualSites
();
if
(
includeEnergy
)
{
CudaArray
&
energyArray
=
cu
.
getEnergyBuffer
();
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
{
double
*
energy
=
(
double
*
)
cu
.
getPinnedBuffer
();
energyArray
.
download
(
energy
);
for
(
int
i
=
0
;
i
<
energyArray
.
getSize
();
i
++
)
...
...
@@ -1458,16 +1458,24 @@ private:
/**
 * Post-computation hook registered with the CudaContext (via addPostComputation).
 *
 * When its force group is part of the requested groups, it makes the context's
 * current stream wait on the stored event and, if energy was requested,
 * launches addEnergyKernel with pmeEnergyBuffer, the context's energy buffer
 * and the buffer size as arguments.
 *
 * NOTE(review): presumably the kernel accumulates pmeEnergyBuffer into the main
 * energy buffer; the kernel source is not visible here, so confirm.
 */
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public:
    /**
     * @param cu               the context whose current stream and energy buffer are used
     * @param event            the event the current stream must wait on
     * @param addEnergyKernel  the kernel launched when energy is requested
     * @param pmeEnergyBuffer  the buffer passed as the kernel's first argument
     * @param forceGroup       the force group this hook belongs to
     */
    SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) :
            cu(cu), event(event), addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
    }
    /**
     * @param includeForces  unused by this hook
     * @param includeEnergy  whether the energy-add kernel should be launched
     * @param groups         bit mask of the force groups being evaluated
     * @return always 0.0; this hook reports no energy of its own
     */
    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        // If this hook's group was not requested, do nothing at all.  In particular
        // the energy-add kernel must not run: the stream has not waited on the event,
        // and the buffer's contents do not belong to this evaluation.
        if ((groups&(1<<forceGroup)) == 0)
            return 0.0;
        cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
        if (includeEnergy) {
            int bufferSize = pmeEnergyBuffer.getSize();
            void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
            cu.executeKernel(addEnergyKernel, args, bufferSize);
        }
        return 0.0;
    }
private:
    CudaContext& cu;
    CUevent event;
    CUfunction addEnergyKernel;
    CudaArray& pmeEnergyBuffer;
    int forceGroup;
};
...
...
@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete
pmeAtomRange
;
if
(
pmeAtomGridIndex
!=
NULL
)
delete
pmeAtomGridIndex
;
if
(
pmeEnergyBuffer
!=
NULL
)
delete
pmeEnergyBuffer
;
if
(
sort
!=
NULL
)
delete
sort
;
if
(
fft
!=
NULL
)
...
...
@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
int
energyElementSize
=
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
pmeEnergyBuffer
=
new
CudaArray
(
cu
,
cu
.
getNumThreadBlocks
()
*
CudaContext
::
ThreadBlockSize
,
energyElementSize
,
"pmeEnergyBuffer"
);
cu
.
clearBuffer
(
*
pmeEnergyBuffer
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
int
cufftVersion
;
cufftGetVersion
(
&
cufftVersion
);
...
...
@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if
(
recipForceGroup
<
0
)
recipForceGroup
=
force
.
getForceGroup
();
cu
.
addPreComputation
(
new
SyncStreamPreComputation
(
cu
,
pmeStream
,
pmeSyncEvent
,
recipForceGroup
));
cu
.
addPostComputation
(
new
SyncStreamPostComputation
(
cu
,
pmeSyncEvent
,
recipForceGroup
));
cu
.
addPostComputation
(
new
SyncStreamPostComputation
(
cu
,
pmeSyncEvent
,
cu
.
getKernel
(
module
,
"addEnergy"
),
*
pmeEnergyBuffer
,
recipForceGroup
));
}
hasInitializedFFT
=
true
;
...
...
@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
}
if
(
includeEnergy
)
{
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
usePmeStream
?
&
pmeEnergyBuffer
->
getDevicePointer
()
:
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeEvalEnergyKernel
,
computeEnergyArgs
,
cu
.
getNumAtoms
());
...
...
@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
verlet
,
defines
,
""
);
kernel1
=
cu
.
getKernel
(
module
,
"integrateVerletPart1"
);
kernel2
=
cu
.
getKernel
(
module
,
"integrateVerletPart2"
);
prevStepSize
=
-
1.0
;
}
void
CudaIntegrateVerletStepKernel
::
execute
(
ContextImpl
&
context
,
const
VerletIntegrator
&
integrator
)
{
...
...
@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
int
numAtoms
=
cu
.
getNumAtoms
();
int
paddedNumAtoms
=
cu
.
getPaddedNumAtoms
();
double
dt
=
integrator
.
getStepSize
();
if
(
dt
!=
prevStepSize
)
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
())
{
vector
<
double2
>
stepSizeVec
(
1
);
stepSizeVec
[
0
]
=
make_double2
(
dt
,
dt
);
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
stepSizeVec
);
}
else
{
vector
<
float2
>
stepSizeVec
(
1
);
stepSizeVec
[
0
]
=
make_float2
((
float
)
dt
,
(
float
)
dt
);
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
stepSizeVec
);
}
prevStepSize
=
dt
;
}
cu
.
getIntegrationUtilities
().
setNextStepSize
(
dt
);
// Call the first integration kernel.
...
...
@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
double
temperature
=
integrator
.
getTemperature
();
double
friction
=
integrator
.
getFriction
();
double
stepSize
=
integrator
.
getStepSize
();
cu
.
getIntegrationUtilities
().
setNextStepSize
(
stepSize
);
if
(
temperature
!=
prevTemp
||
friction
!=
prevFriction
||
stepSize
!=
prevStepSize
)
{
// Calculate the integration parameters.
...
...
@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p
[
1
]
=
fscale
;
p
[
2
]
=
noisescale
;
params
->
upload
(
p
);
double2
ss
=
make_double2
(
0
,
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
else
{
vector
<
float
>
p
(
params
->
getSize
());
...
...
@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p
[
1
]
=
(
float
)
fscale
;
p
[
2
]
=
(
float
)
noisescale
;
params
->
upload
(
p
);
float2
ss
=
make_float2
(
0
,
(
float
)
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
prevTemp
=
temperature
;
prevFriction
=
friction
;
...
...
@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Update the time and step count.
double
dt
,
time
;
double
dt
=
cu
.
getIntegrationUtilities
().
getLastStepSize
();
double
time
=
cu
.
getTime
()
+
dt
;
if
(
useDouble
)
{
double2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
}
else
{
float2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSizeFloat
)
time
=
maxTime
;
// Avoid round-off error
}
...
...
@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Update the time and step count.
double
dt
,
time
;
double
dt
=
cu
.
getIntegrationUtilities
().
getLastStepSize
();
double
time
=
cu
.
getTime
()
+
dt
;
if
(
useDouble
)
{
double2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
}
else
{
float2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSizeFloat
)
time
=
maxTime
;
// Avoid round-off error
}
...
...
@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
summedValue
=
new
CudaArray
(
cu
,
1
,
elementSize
,
"summedValue"
);
perDofValues
=
new
CudaParameterSet
(
cu
,
integrator
.
getNumPerDofVariables
(),
3
*
system
.
getNumParticles
(),
"perDofVariables"
,
false
,
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
());
cu
.
addReorderListener
(
new
ReorderListener
(
cu
,
*
perDofValues
,
localPerDofValuesFloat
,
localPerDofValuesDouble
,
deviceValuesAreCurrent
));
prevStepSize
=
-
1.0
;
SimTKOpenMMUtilities
::
setRandomNumberSeed
(
integrator
.
getRandomNumberSeed
());
}
...
...
@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
}
localValuesAreCurrent
=
false
;
double
stepSize
=
integrator
.
getStepSize
();
if
(
stepSize
!=
prevStepSize
)
{
recordGlobalValue
(
stepSize
,
GlobalTarget
(
DT
,
dtVariableIndex
));
}
recordGlobalValue
(
stepSize
,
GlobalTarget
(
DT
,
dtVariableIndex
));
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
double
value
=
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
globalValuesDouble
[
parameterVariableIndex
[
i
]])
{
...
...
@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
void
CudaIntegrateCustomStepKernel
::
recordGlobalValue
(
double
value
,
GlobalTarget
target
)
{
switch
(
target
.
type
)
{
case
DT
:
if
(
value
!=
globalValuesDouble
[
dtVariableIndex
])
deviceGlobalsAreCurrent
=
false
;
globalValuesDouble
[
dtVariableIndex
]
=
value
;
deviceGlobalsAreCurrent
=
false
;
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
())
{
double
size
[]
=
{
0
,
value
};
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
size
);
}
else
{
float
size
[]
=
{
0
,
(
float
)
value
};
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
size
);
}
prevStepSize
=
value
;
cu
.
getIntegrationUtilities
().
setNextStepSize
(
value
);
break
;
case
VARIABLE
:
case
PARAMETER
:
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
2d2f05ce
...
...
@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
}
CudaPlatform
::
PlatformData
::
PlatformData
(
ContextImpl
*
context
,
const
System
&
system
,
const
string
&
deviceIndexProperty
,
const
string
&
blockingProperty
,
const
string
&
precisionProperty
,
const
string
&
cpuPmeProperty
,
const
string
&
compilerProperty
,
const
string
&
tempProperty
,
const
string
&
hostCompilerProperty
)
:
context
(
context
),
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
{
const
string
&
cpuPmeProperty
,
const
string
&
compilerProperty
,
const
string
&
tempProperty
,
const
string
&
hostCompilerProperty
)
:
context
(
context
),
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
,
hasInitializedContexts
(
false
)
{
bool
blocking
=
(
blockingProperty
==
"true"
);
vector
<
string
>
devices
;
size_t
searchPos
=
0
,
nextPos
;
...
...
@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
/**
 * Call initialize() on every entry of `contexts`, once only.
 *
 * Later calls return immediately because hasInitializedContexts is set after
 * the first successful pass.
 *
 * @param system   not used by this function
 */
void CudaPlatform::PlatformData::initializeContexts(const System& system) {
    if (!hasInitializedContexts) {
        int index = 0;
        while (index < (int) contexts.size()) {
            contexts[index]->initialize();
            ++index;
        }
        // Set only after every context has been initialized.
        hasInitializedContexts = true;
    }
}
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
...
...
platforms/cuda/src/kernels/customCentroidBond.cu
View file @
2d2f05ce
...
...
@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/**
* Compute the forces on groups based on the bonds.
*/
extern
"C"
__global__
void
computeGroupForces
(
unsigned
long
long
*
__restrict__
groupForce
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
centerPositions
,
extern
"C"
__global__
void
computeGroupForces
(
unsigned
long
long
*
__restrict__
groupForce
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
centerPositions
,
const
int
*
__restrict__
bondGroups
EXTRA_ARGS
)
{
real
energy
=
0
;
mixed
energy
=
0
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_BONDS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
COMPUTE_FORCE
}
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
2d2f05ce
...
...
@@ -13,7 +13,7 @@ typedef struct {
/**
* Compute a force based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
...
...
@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
mixed
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
...
...
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
View file @
2d2f05ce
...
...
@@ -2,9 +2,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
extern
"C"
__global__
void
computePerParticleEnergy
(
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
extern
"C"
__global__
void
computePerParticleEnergy
(
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
PARAMETER_ARGUMENTS
)
{
real
energy
=
0
;
mixed
energy
=
0
;
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load the derivatives
...
...
platforms/cuda/src/kernels/customHbondForce.cu
View file @
2d2f05ce
...
...
@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/**
* Compute forces on donors.
*/
extern
"C"
__global__
void
computeDonorForces
(
unsigned
long
long
*
__restrict__
force
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
extern
"C"
__global__
void
computeDonorForces
(
unsigned
long
long
*
__restrict__
force
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
extern
__shared__
real4
posBuffer
[];
real
energy
=
0
;
mixed
energy
=
0
;
real3
f1
=
make_real3
(
0
);
real3
f2
=
make_real3
(
0
);
real3
f3
=
make_real3
(
0
);
...
...
@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
/**
* Compute forces on acceptors.
*/
extern
"C"
__global__
void
computeAcceptorForces
(
unsigned
long
long
*
__restrict__
force
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
extern
"C"
__global__
void
computeAcceptorForces
(
unsigned
long
long
*
__restrict__
force
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
...
...
platforms/cuda/src/kernels/customManyParticle.cu
View file @
2d2f05ce
...
...
@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
* Compute the interaction.
*/
extern
"C"
__global__
void
computeInteraction
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
neighbors
,
const
int
*
__restrict__
neighborStartIndex
...
...
@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
,
int
*
__restrict__
exclusions
,
int
*
__restrict__
exclusionStartIndex
#endif
PARAMETER_ARGUMENTS
)
{
real
energy
=
0
.0
f
;
mixed
energy
=
0
;
// Loop over particles to be the first one in the set.
...
...
platforms/cuda/src/kernels/customNonbondedGroups.cu
View file @
2d2f05ce
...
...
@@ -9,14 +9,14 @@ typedef struct {
}
AtomData
;
extern
"C"
__global__
void
computeInteractionGroups
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
// global warpIndex
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
real
energy
=
0
.0
f
;
mixed
energy
=
0
;
__shared__
AtomData
localData
[
LOCAL_MEMORY_SIZE
];
const
unsigned
int
startTile
=
FIRST_TILE
+
warp
*
(
LAST_TILE
-
FIRST_TILE
)
/
totalWarps
;
...
...
Prev
1
2
3
4
5
6
7
8
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment