Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2d2f05ce
Commit
2d2f05ce
authored
Dec 08, 2015
by
Andy Simmonett
Browse files
Merge branch 'master' of github.com:pandegroup/openmm into genpt
parents
94823d84
4d32047c
Changes
235
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
263 additions
and
198 deletions
+263
-198
platforms/cpu/src/CpuNonbondedForce.cpp
platforms/cpu/src/CpuNonbondedForce.cpp
+40
-23
platforms/cpu/src/CpuSETTLE.cpp
platforms/cpu/src/CpuSETTLE.cpp
+20
-9
platforms/cpu/tests/TestCpuCompoundIntegrator.cpp
platforms/cpu/tests/TestCpuCompoundIntegrator.cpp
+5
-22
platforms/cuda/CMakeLists.txt
platforms/cuda/CMakeLists.txt
+2
-2
platforms/cuda/include/CudaContext.h
platforms/cuda/include/CudaContext.h
+6
-0
platforms/cuda/include/CudaIntegrationUtilities.h
platforms/cuda/include/CudaIntegrationUtilities.h
+9
-0
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+3
-3
platforms/cuda/include/CudaPlatform.h
platforms/cuda/include/CudaPlatform.h
+1
-1
platforms/cuda/src/CudaBondedUtilities.cpp
platforms/cuda/src/CudaBondedUtilities.cpp
+2
-2
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+96
-57
platforms/cuda/src/CudaExpressionUtilities.cpp
platforms/cuda/src/CudaExpressionUtilities.cpp
+8
-7
platforms/cuda/src/CudaIntegrationUtilities.cpp
platforms/cuda/src/CudaIntegrationUtilities.cpp
+27
-4
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+27
-54
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+4
-1
platforms/cuda/src/kernels/customCentroidBond.cu
platforms/cuda/src/kernels/customCentroidBond.cu
+2
-2
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+2
-2
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+2
-2
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+3
-3
platforms/cuda/src/kernels/customManyParticle.cu
platforms/cuda/src/kernels/customManyParticle.cu
+2
-2
platforms/cuda/src/kernels/customNonbondedGroups.cu
platforms/cuda/src/kernels/customNonbondedGroups.cu
+2
-2
No files found.
platforms/cpu/src/CpuNonbondedForce.cpp
View file @
2d2f05ce
...
@@ -28,7 +28,7 @@
...
@@ -28,7 +28,7 @@
#include "CpuNonbondedForce.h"
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "ReferencePME.h"
#include "gmx_atomic.h"
#include "
openmm/internal/
gmx_atomic.h"
#include <algorithm>
#include <algorithm>
// In case we're using some primitive version of Visual Studio this will
// In case we're using some primitive version of Visual Studio this will
...
@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
...
@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
threads
.
execute
(
task
);
threads
.
execute
(
task
);
threads
.
waitForThreads
();
threads
.
waitForThreads
();
// Signal the threads to subtract the exclusions.
if
(
ewald
||
pme
)
{
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
waitForThreads
();
}
// Combine the energies from all the threads.
// Combine the energies from all the threads.
if
(
totalEnergy
!=
NULL
)
{
if
(
totalEnergy
!=
NULL
)
{
...
@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
...
@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
for
(
int
i
=
threadIndex
;
i
<
numberOfAtoms
;
i
+=
numThreads
)
{
threads
.
syncThreads
();
fvec4
posI
((
float
)
atomCoordinates
[
i
][
0
],
(
float
)
atomCoordinates
[
i
][
1
],
(
float
)
atomCoordinates
[
i
][
2
],
0.0
f
);
const
int
groupSize
=
max
(
1
,
numberOfAtoms
/
(
10
*
numThreads
));
for
(
set
<
int
>::
const_iterator
iter
=
exclusions
[
i
].
begin
();
iter
!=
exclusions
[
i
].
end
();
++
iter
)
{
while
(
true
)
{
if
(
*
iter
>
i
)
{
int
start
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
groupSize
);
int
j
=
*
iter
;
if
(
start
>=
numberOfAtoms
)
fvec4
deltaR
;
break
;
fvec4
posJ
((
float
)
atomCoordinates
[
j
][
0
],
(
float
)
atomCoordinates
[
j
][
1
],
(
float
)
atomCoordinates
[
j
][
2
],
0.0
f
);
int
end
=
min
(
start
+
groupSize
,
numberOfAtoms
);
float
r2
;
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
false
,
boxSize
,
invBoxSize
);
fvec4
posI
((
float
)
atomCoordinates
[
i
][
0
],
(
float
)
atomCoordinates
[
i
][
1
],
(
float
)
atomCoordinates
[
i
][
2
],
0.0
f
);
float
r
=
sqrtf
(
r2
);
float
scaledChargeI
=
(
float
)
(
ONE_4PI_EPS0
*
posq
[
4
*
i
+
3
]);
float
inverseR
=
1
/
r
;
for
(
set
<
int
>::
const_iterator
iter
=
exclusions
[
i
].
begin
();
iter
!=
exclusions
[
i
].
end
();
++
iter
)
{
float
chargeProd
=
ONE_4PI_EPS0
*
posq
[
4
*
i
+
3
]
*
posq
[
4
*
j
+
3
];
if
(
*
iter
>
i
)
{
float
alphaR
=
alphaEwald
*
r
;
int
j
=
*
iter
;
float
erfAlphaR
=
erf
(
alphaR
);
fvec4
deltaR
;
if
(
erfAlphaR
>
1e-6
f
)
{
fvec4
posJ
((
float
)
atomCoordinates
[
j
][
0
],
(
float
)
atomCoordinates
[
j
][
1
],
(
float
)
atomCoordinates
[
j
][
2
],
0.0
f
);
float
dEdR
=
(
float
)
(
chargeProd
*
inverseR
*
inverseR
*
inverseR
);
float
r2
;
dEdR
=
(
float
)
(
dEdR
*
(
erfAlphaR
-
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
)));
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
false
,
boxSize
,
invBoxSize
);
fvec4
result
=
deltaR
*
dEdR
;
float
r
=
sqrtf
(
r2
);
(
fvec4
(
forces
+
4
*
i
)
-
result
).
store
(
forces
+
4
*
i
);
float
alphaR
=
alphaEwald
*
r
;
(
fvec4
(
forces
+
4
*
j
)
+
result
).
store
(
forces
+
4
*
j
);
float
erfAlphaR
=
erf
(
alphaR
);
if
(
includeEnergy
)
if
(
erfAlphaR
>
1e-6
f
)
{
threadEnergy
[
threadIndex
]
-=
chargeProd
*
inverseR
*
erfAlphaR
;
float
inverseR
=
1
/
r
;
float
chargeProdOverR
=
scaledChargeI
*
posq
[
4
*
j
+
3
]
*
inverseR
;
float
dEdR
=
chargeProdOverR
*
inverseR
*
inverseR
;
dEdR
=
dEdR
*
(
erfAlphaR
-
(
float
)
TWO_OVER_SQRT_PI
*
alphaR
*
(
float
)
exp
(
-
alphaR
*
alphaR
));
fvec4
result
=
deltaR
*
dEdR
;
(
fvec4
(
forces
+
4
*
i
)
-
result
).
store
(
forces
+
4
*
i
);
(
fvec4
(
forces
+
4
*
j
)
+
result
).
store
(
forces
+
4
*
j
);
if
(
includeEnergy
)
threadEnergy
[
threadIndex
]
-=
chargeProdOverR
*
erfAlphaR
;
}
}
}
}
}
}
}
...
...
platforms/cpu/src/CpuSETTLE.cpp
View file @
2d2f05ce
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2013 Stanford University and the Authors.
*
* Portions copyright (c) 2013
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -30,6 +30,7 @@
...
@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "CpuSETTLE.h"
#include "CpuSETTLE.h"
#include "openmm/internal/gmx_atomic.h"
using
namespace
OpenMM
;
using
namespace
OpenMM
;
using
namespace
std
;
using
namespace
std
;
...
@@ -39,10 +40,14 @@ public:
...
@@ -39,10 +40,14 @@ public:
ApplyToPositionsTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
vector
<
RealOpenMM
>&
inverseMasses
,
ApplyToPositionsTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
atomCoordinatesP
(
atomCoordinatesP
),
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
atomCoordinatesP
(
atomCoordinatesP
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
gmx_atomic_set
(
&
atomicCounter
,
0
);
}
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
if
(
threadIndex
<
threadSettle
.
size
())
{
while
(
true
)
{
threadSettle
[
threadIndex
]
->
apply
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
);
int
index
=
gmx_atomic_fetch_add
(
&
atomicCounter
,
1
);
if
(
index
>=
threadSettle
.
size
())
break
;
threadSettle
[
index
]
->
apply
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
);
}
}
}
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
...
@@ -50,6 +55,7 @@ public:
...
@@ -50,6 +55,7 @@ public:
vector
<
RealOpenMM
>&
inverseMasses
;
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
gmx_atomic_t
atomicCounter
;
};
};
class
CpuSETTLE
::
ApplyToVelocitiesTask
:
public
ThreadPool
::
Task
{
class
CpuSETTLE
::
ApplyToVelocitiesTask
:
public
ThreadPool
::
Task
{
...
@@ -57,10 +63,14 @@ public:
...
@@ -57,10 +63,14 @@ public:
ApplyToVelocitiesTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
velocities
,
vector
<
RealOpenMM
>&
inverseMasses
,
ApplyToVelocitiesTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
velocities
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
velocities
(
velocities
),
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
velocities
(
velocities
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
gmx_atomic_set
(
&
atomicCounter
,
0
);
}
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
if
(
threadIndex
<
threadSettle
.
size
())
{
while
(
true
)
{
threadSettle
[
threadIndex
]
->
applyToVelocities
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
);
int
index
=
gmx_atomic_fetch_add
(
&
atomicCounter
,
1
);
if
(
index
>=
threadSettle
.
size
())
break
;
threadSettle
[
index
]
->
applyToVelocities
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
);
}
}
}
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
...
@@ -68,17 +78,18 @@ public:
...
@@ -68,17 +78,18 @@ public:
vector
<
RealOpenMM
>&
inverseMasses
;
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
gmx_atomic_t
atomicCounter
;
};
};
CpuSETTLE
::
CpuSETTLE
(
const
System
&
system
,
const
ReferenceSETTLEAlgorithm
&
settle
,
ThreadPool
&
threads
)
:
threads
(
threads
)
{
CpuSETTLE
::
CpuSETTLE
(
const
System
&
system
,
const
ReferenceSETTLEAlgorithm
&
settle
,
ThreadPool
&
threads
)
:
threads
(
threads
)
{
int
num
Thread
s
=
threads
.
getNumThreads
();
int
num
Block
s
=
10
*
threads
.
getNumThreads
();
int
numClusters
=
settle
.
getNumClusters
();
int
numClusters
=
settle
.
getNumClusters
();
vector
<
RealOpenMM
>
mass
(
system
.
getNumParticles
());
vector
<
RealOpenMM
>
mass
(
system
.
getNumParticles
());
for
(
int
i
=
0
;
i
<
system
.
getNumParticles
();
i
++
)
for
(
int
i
=
0
;
i
<
system
.
getNumParticles
();
i
++
)
mass
[
i
]
=
system
.
getParticleMass
(
i
);
mass
[
i
]
=
system
.
getParticleMass
(
i
);
for
(
int
i
=
0
;
i
<
num
Thread
s
;
i
++
)
{
for
(
int
i
=
0
;
i
<
num
Block
s
;
i
++
)
{
int
start
=
i
*
numClusters
/
num
Thread
s
;
int
start
=
i
*
numClusters
/
num
Block
s
;
int
end
=
(
i
+
1
)
*
numClusters
/
num
Thread
s
;
int
end
=
(
i
+
1
)
*
numClusters
/
num
Block
s
;
if
(
start
!=
end
)
{
if
(
start
!=
end
)
{
int
numThreadClusters
=
end
-
start
;
int
numThreadClusters
=
end
-
start
;
vector
<
int
>
atom1
(
numThreadClusters
),
atom2
(
numThreadClusters
),
atom3
(
numThreadClusters
);
vector
<
int
>
atom1
(
numThreadClusters
),
atom2
(
numThreadClusters
),
atom3
(
numThreadClusters
);
...
...
serialization/include/openmm/serialization/GBVIForceProxy.h
→
platforms/cpu/tests/TestCpuCompoundIntegrator.cpp
View file @
2d2f05ce
#ifndef OPENMM_GBVIFORCE_PROXY_H_
#define OPENMM_GBVIFORCE_PROXY_H_
/* -------------------------------------------------------------------------- *
/* -------------------------------------------------------------------------- *
* OpenMM *
* OpenMM *
* -------------------------------------------------------------------------- *
* -------------------------------------------------------------------------- *
...
@@ -9,7 +6,7 @@
...
@@ -9,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 201
0
Stanford University and the Authors. *
* Portions copyright (c) 201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -32,22 +29,8 @@
...
@@ -32,22 +29,8 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include "CpuTests.h"
#include "openmm/serialization/SerializationProxy.h"
#include "TestCompoundIntegrator.h"
namespace
OpenMM
{
/**
* This is a proxy for serializing GBVIForce objects.
*/
class
OPENMM_EXPORT
GBVIForceProxy
:
public
SerializationProxy
{
public:
GBVIForceProxy
();
void
serialize
(
const
void
*
object
,
SerializationNode
&
node
)
const
;
void
*
deserialize
(
const
SerializationNode
&
node
)
const
;
};
}
// namespace OpenMM
#endif
/*OPENMM_GBVIFORCE_PROXY_H_*/
void
runPlatformTests
()
{
}
platforms/cuda/CMakeLists.txt
View file @
2d2f05ce
...
@@ -13,9 +13,9 @@
...
@@ -13,9 +13,9 @@
#----------------------------------------------------
#----------------------------------------------------
set
(
OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL
"Whether to build CUDA test cases"
)
set
(
OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL
"Whether to build CUDA test cases"
)
if
(
OPENMM_BUILD_CUDA_TESTS
)
if
(
BUILD_TESTING AND
OPENMM_BUILD_CUDA_TESTS
)
SUBDIRS
(
tests
)
SUBDIRS
(
tests
)
endif
(
OPENMM_BUILD_CUDA_TESTS
)
endif
(
BUILD_TESTING AND
OPENMM_BUILD_CUDA_TESTS
)
# The source is organized into subdirectories, but we handle them all from
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
...
...
platforms/cuda/include/CudaContext.h
View file @
2d2f05ce
...
@@ -30,6 +30,7 @@
...
@@ -30,6 +30,7 @@
#include <map>
#include <map>
#include <queue>
#include <queue>
#include <string>
#include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS
#define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER
#ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code.
// Prevent Windows from defining macros that interfere with other code.
...
@@ -538,6 +539,11 @@ public:
...
@@ -538,6 +539,11 @@ public:
*/
*/
void
invalidateMolecules
();
void
invalidateMolecules
();
private:
private:
/**
* Compute a sorted list of device indices in decreasing order of desirability
*/
std
::
vector
<
int
>
getDevicePrecedence
();
struct
Molecule
;
struct
Molecule
;
struct
MoleculeGroup
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
class
VirtualSiteInfo
;
...
...
platforms/cuda/include/CudaIntegrationUtilities.h
View file @
2d2f05ce
...
@@ -62,6 +62,14 @@ public:
...
@@ -62,6 +62,14 @@ public:
CudaArray
&
getStepSize
()
{
CudaArray
&
getStepSize
()
{
return
*
stepSize
;
return
*
stepSize
;
}
}
/**
* Set the size to use for the next step.
*/
void
setNextStepSize
(
double
size
);
/**
* Get the size that was used for the last step.
*/
double
getLastStepSize
();
/**
/**
* Apply constraints to the atom positions.
* Apply constraints to the atom positions.
*
*
...
@@ -154,6 +162,7 @@ private:
...
@@ -154,6 +162,7 @@ private:
CudaArray
*
vsiteLocalCoordsParams
;
CudaArray
*
vsiteLocalCoordsParams
;
int
randomPos
;
int
randomPos
;
int
lastSeed
,
numVsites
;
int
lastSeed
,
numVsites
;
double2
lastStepSize
;
struct
ShakeCluster
;
struct
ShakeCluster
;
struct
ConstraintOrderer
;
struct
ConstraintOrderer
;
};
};
...
...
platforms/cuda/include/CudaKernels.h
View file @
2d2f05ce
...
@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
...
@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
),
fft
(
NULL
),
pmeio
(
NULL
)
{
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
pmeEnergyBuffer
(
NULL
),
sort
(
NULL
),
fft
(
NULL
),
pmeio
(
NULL
)
{
}
}
~
CudaCalcNonbondedForceKernel
();
~
CudaCalcNonbondedForceKernel
();
/**
/**
...
@@ -657,6 +657,7 @@ private:
...
@@ -657,6 +657,7 @@ private:
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
pmeEnergyBuffer
;
CudaSort
*
sort
;
CudaSort
*
sort
;
Kernel
cpuPme
;
Kernel
cpuPme
;
PmeIO
*
pmeio
;
PmeIO
*
pmeio
;
...
@@ -1123,7 +1124,6 @@ public:
...
@@ -1123,7 +1124,6 @@ public:
double
computeKineticEnergy
(
ContextImpl
&
context
,
const
VerletIntegrator
&
integrator
);
double
computeKineticEnergy
(
ContextImpl
&
context
,
const
VerletIntegrator
&
integrator
);
private:
private:
CudaContext
&
cu
;
CudaContext
&
cu
;
double
prevStepSize
;
CUfunction
kernel1
,
kernel2
;
CUfunction
kernel1
,
kernel2
;
};
};
...
@@ -1354,7 +1354,7 @@ private:
...
@@ -1354,7 +1354,7 @@ private:
void
recordChangedParameters
(
ContextImpl
&
context
);
void
recordChangedParameters
(
ContextImpl
&
context
);
bool
evaluateCondition
(
int
step
);
bool
evaluateCondition
(
int
step
);
CudaContext
&
cu
;
CudaContext
&
cu
;
double
prevStepSize
,
energy
;
double
energy
;
float
energyFloat
;
float
energyFloat
;
int
numGlobalVariables
;
int
numGlobalVariables
;
bool
hasInitializedKernels
,
deviceValuesAreCurrent
,
deviceGlobalsAreCurrent
,
modifiesParameters
,
keNeedsForce
,
hasAnyConstraints
;
bool
hasInitializedKernels
,
deviceValuesAreCurrent
,
deviceGlobalsAreCurrent
,
modifiesParameters
,
keNeedsForce
,
hasAnyConstraints
;
...
...
platforms/cuda/include/CudaPlatform.h
View file @
2d2f05ce
...
@@ -121,7 +121,7 @@ public:
...
@@ -121,7 +121,7 @@ public:
ContextImpl
*
context
;
ContextImpl
*
context
;
std
::
vector
<
CudaContext
*>
contexts
;
std
::
vector
<
CudaContext
*>
contexts
;
std
::
vector
<
double
>
contextEnergy
;
std
::
vector
<
double
>
contextEnergy
;
bool
removeCM
,
peerAccessSupported
,
useCpuPme
;
bool
hasInitializedContexts
,
removeCM
,
peerAccessSupported
,
useCpuPme
;
int
cmMotionFrequency
;
int
cmMotionFrequency
;
int
stepCount
,
computeForceCount
;
int
stepCount
,
computeForceCount
;
double
time
;
double
time
;
...
...
platforms/cuda/src/CudaBondedUtilities.cpp
View file @
2d2f05ce
...
@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
...
@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
s
<<
CudaKernelSources
::
vectorOps
;
s
<<
CudaKernelSources
::
vectorOps
;
for
(
int
i
=
0
;
i
<
(
int
)
prefixCode
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
prefixCode
.
size
();
i
++
)
s
<<
prefixCode
[
i
];
s
<<
prefixCode
[
i
];
s
<<
"extern
\"
C
\"
__global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer,
real
* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"
;
s
<<
"extern
\"
C
\"
__global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer,
mixed
* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"
;
for
(
int
force
=
0
;
force
<
numForces
;
force
++
)
{
for
(
int
force
=
0
;
force
<
numForces
;
force
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
atomIndices
[
force
].
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
atomIndices
[
force
].
size
();
i
++
)
{
int
indexWidth
=
atomIndices
[
force
][
i
]
->
getElementSize
()
/
4
;
int
indexWidth
=
atomIndices
[
force
][
i
]
->
getElementSize
()
/
4
;
...
@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
...
@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
for
(
int
i
=
0
;
i
<
(
int
)
arguments
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
arguments
.
size
();
i
++
)
s
<<
", "
<<
argTypes
[
i
]
<<
"* customArg"
<<
(
i
+
1
);
s
<<
", "
<<
argTypes
[
i
]
<<
"* customArg"
<<
(
i
+
1
);
s
<<
") {
\n
"
;
s
<<
") {
\n
"
;
s
<<
"
real
energy = 0;
\n
"
;
s
<<
"
mixed
energy = 0;
\n
"
;
for
(
int
force
=
0
;
force
<
numForces
;
force
++
)
for
(
int
force
=
0
;
force
<
numForces
;
force
++
)
s
<<
createForceSource
(
force
,
forceAtoms
[
force
].
size
(),
forceAtoms
[
force
][
0
].
size
(),
forceGroup
[
force
],
forceSource
[
force
]);
s
<<
createForceSource
(
force
,
forceAtoms
[
force
].
size
(),
forceAtoms
[
force
][
0
].
size
(),
forceGroup
[
force
],
forceSource
[
force
]);
s
<<
"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
\n
"
;
s
<<
"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
\n
"
;
...
...
platforms/cuda/src/CudaContext.cpp
View file @
2d2f05ce
...
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT
(
cuDeviceGetCount
(
&
numDevices
));
CHECK_RESULT
(
cuDeviceGetCount
(
&
numDevices
));
if
(
deviceIndex
<
-
1
||
deviceIndex
>=
numDevices
)
if
(
deviceIndex
<
-
1
||
deviceIndex
>=
numDevices
)
throw
OpenMMException
(
"Illegal value for CudaDeviceIndex: "
+
intToString
(
deviceIndex
));
throw
OpenMMException
(
"Illegal value for CudaDeviceIndex: "
+
intToString
(
deviceIndex
));
vector
<
int
>
devicePrecedence
;
if
(
deviceIndex
==
-
1
)
{
if
(
deviceIndex
==
-
1
)
{
// Try to figure out which device is the fastest.
devicePrecedence
=
getDevicePrecedence
();
}
else
{
int
bestSpeed
=
-
1
;
devicePrecedence
.
push_back
(
deviceIndex
);
int
bestCompute
=
-
1
;
}
for
(
int
i
=
0
;
i
<
numDevices
;
i
++
)
{
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
i
));
this
->
deviceIndex
=
-
1
;
int
major
,
minor
,
clock
,
multiprocessors
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
devicePrecedence
.
size
());
i
++
)
{
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
int
trialDeviceIndex
=
devicePrecedence
[
i
];
if
(
major
==
1
&&
minor
<
2
)
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
trialDeviceIndex
));
continue
;
// 1.0 and 1.1 are not supported
defaultOptimizationOptions
=
"--use_fast_math"
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
clock
,
CU_DEVICE_ATTRIBUTE_CLOCK_RATE
,
device
));
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
));
if
(
useBlockingSync
)
int
speed
=
clock
*
multiprocessors
;
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
if
(
major
>
bestCompute
||
(
major
==
bestCompute
&&
speed
>
bestSpeed
))
{
else
deviceIndex
=
i
;
flags
+=
CU_CTX_SCHED_SPIN
;
bestSpeed
=
speed
;
bestCompute
=
major
;
if
(
cuCtxCreate
(
&
context
,
flags
,
device
)
==
CUDA_SUCCESS
)
{
}
this
->
deviceIndex
=
trialDeviceIndex
;
break
;
}
}
}
}
if
(
deviceIndex
==
-
1
)
if
(
this
->
deviceIndex
==
-
1
)
throw
OpenMMException
(
"No compatible CUDA device is available"
);
if
(
deviceIndex
!=
-
1
)
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
deviceIndex
));
throw
OpenMMException
(
"The requested CUDA device could not be loaded"
);
this
->
deviceIndex
=
deviceIndex
;
else
throw
OpenMMException
(
"No compatible CUDA device is available"
);
int
major
,
minor
;
int
major
,
minor
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
#if __CUDA_API_VERSION < 7000
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// CUDA 7.0 is released.
// its compute capability as 5.2, but the compiler doesn't support
if
(
major
==
5
)
// anything beyond 5.0.
minor
=
0
;
if
(
major
==
5
)
minor
=
0
;
#endif
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
computeCapability
=
major
+
0.1
*
minor
;
computeCapability
=
major
+
0.1
*
minor
;
if
((
useDoublePrecision
||
useMixedPrecision
)
&&
computeCapability
<
1.3
)
throw
OpenMMException
(
"This device does not support double precision"
);
defaultOptimizationOptions
=
"--use_fast_math"
;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
contextIsValid
=
true
;
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
if
(
contextIndex
>
0
)
{
if
(
contextIndex
>
0
)
{
...
@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines
[
"ATAN"
]
=
useDoublePrecision
?
"atan"
:
"atanf"
;
compilationDefines
[
"ATAN"
]
=
useDoublePrecision
?
"atan"
:
"atanf"
;
compilationDefines
[
"ERF"
]
=
useDoublePrecision
?
"erf"
:
"erff"
;
compilationDefines
[
"ERF"
]
=
useDoublePrecision
?
"erf"
:
"erff"
;
compilationDefines
[
"ERFC"
]
=
useDoublePrecision
?
"erfc"
:
"erfcf"
;
compilationDefines
[
"ERFC"
]
=
useDoublePrecision
?
"erfc"
:
"erfcf"
;
// Set defines for applying periodic boundary conditions.
// Set defines for applying periodic boundary conditions.
Vec3
boxVectors
[
3
];
Vec3
boxVectors
[
3
];
system
.
getDefaultPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
system
.
getDefaultPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
boxIsTriclinic
=
(
boxVectors
[
0
][
1
]
!=
0.0
||
boxVectors
[
0
][
2
]
!=
0.0
||
boxIsTriclinic
=
(
boxVectors
[
0
][
1
]
!=
0.0
||
boxVectors
[
0
][
2
]
!=
0.0
||
...
@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
}
}
// Create the work thread used for parallelization when running on multiple devices.
// Create the work thread used for parallelization when running on multiple devices.
thread
=
new
WorkThread
();
thread
=
new
WorkThread
();
// Create utilities objects.
// Create utilities objects.
bonded
=
new
CudaBondedUtilities
(
*
this
);
bonded
=
new
CudaBondedUtilities
(
*
this
);
nonbonded
=
new
CudaNonbondedUtilities
(
*
this
);
nonbonded
=
new
CudaNonbondedUtilities
(
*
this
);
integration
=
new
CudaIntegrationUtilities
(
*
this
,
system
);
integration
=
new
CudaIntegrationUtilities
(
*
this
,
system
);
...
@@ -368,7 +367,7 @@ void CudaContext::initialize() {
...
@@ -368,7 +367,7 @@ void CudaContext::initialize() {
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
}
}
else
if
(
useMixedPrecision
)
{
else
if
(
useMixedPrecision
)
{
energyBuffer
=
CudaArray
::
create
<
float
>
(
*
this
,
numEnergyBuffers
,
"energyBuffer"
);
energyBuffer
=
CudaArray
::
create
<
double
>
(
*
this
,
numEnergyBuffers
,
"energyBuffer"
);
int
pinnedBufferSize
=
max
(
paddedNumAtoms
*
4
,
numEnergyBuffers
);
int
pinnedBufferSize
=
max
(
paddedNumAtoms
*
4
,
numEnergyBuffers
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
}
}
...
@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
...
@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if
(
index
!=
result
.
npos
)
{
if
(
index
!=
result
.
npos
)
{
if
((
index
==
0
||
symbolChars
.
find
(
result
[
index
-
1
])
==
symbolChars
.
end
())
&&
(
index
==
result
.
size
()
-
size
||
symbolChars
.
find
(
result
[
index
+
size
])
==
symbolChars
.
end
()))
{
if
((
index
==
0
||
symbolChars
.
find
(
result
[
index
-
1
])
==
symbolChars
.
end
())
&&
(
index
==
result
.
size
()
-
size
||
symbolChars
.
find
(
result
[
index
+
size
])
==
symbolChars
.
end
()))
{
// We have found a complete symbol, not part of a longer symbol.
// We have found a complete symbol, not part of a longer symbol.
result
.
replace
(
index
,
size
,
iter
->
second
);
result
.
replace
(
index
,
size
,
iter
->
second
);
index
+=
iter
->
second
.
size
();
index
+=
iter
->
second
.
size
();
}
}
...
@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
...
@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return
-
1
;
return
-
1
;
}
}
WaitForSingleObject
(
pi
.
hProcess
,
INFINITE
);
WaitForSingleObject
(
pi
.
hProcess
,
INFINITE
);
DWORD
exitCode
=
-
1
;
DWORD
exitCode
=
-
1
;
if
(
!
GetExitCodeProcess
(
pi
.
hProcess
,
&
exitCode
))
{
if
(
!
GetExitCodeProcess
(
pi
.
hProcess
,
&
exitCode
))
{
throw
(
OpenMMException
(
"Could not get nvcc.exe's exit code
\n
"
));
throw
(
OpenMMException
(
"Could not get nvcc.exe's exit code
\n
"
));
}
else
{
}
else
{
if
(
exitCode
==
0
)
if
(
exitCode
==
0
)
return
0
;
return
0
;
else
else
return
-
1
;
return
-
1
;
...
@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if
(
!
defines
.
empty
())
if
(
!
defines
.
empty
())
src
<<
endl
;
src
<<
endl
;
src
<<
source
<<
endl
;
src
<<
source
<<
endl
;
// See whether we already have PTX for this kernel cached.
// See whether we already have PTX for this kernel cached.
CSHA1
sha1
;
CSHA1
sha1
;
sha1
.
Update
((
const
UINT_8
*
)
src
.
str
().
c_str
(),
src
.
str
().
size
());
sha1
.
Update
((
const
UINT_8
*
)
src
.
str
().
c_str
(),
src
.
str
().
size
());
sha1
.
Final
();
sha1
.
Final
();
...
@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule
module
;
CUmodule
module
;
if
(
cuModuleLoad
(
&
module
,
cacheFile
.
str
().
c_str
())
==
CUDA_SUCCESS
)
if
(
cuModuleLoad
(
&
module
,
cacheFile
.
str
().
c_str
())
==
CUDA_SUCCESS
)
return
module
;
return
module
;
// Select names for the various temporary files.
// Select names for the various temporary files.
stringstream
tempFileName
;
stringstream
tempFileName
;
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32
#ifdef WIN32
...
@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int
res
=
0
;
int
res
=
0
;
// If the runtime compiler plugin is available, use it.
// If the runtime compiler plugin is available, use it.
if
(
hasCompilerKernel
)
{
if
(
hasCompilerKernel
)
{
string
ptx
=
compilerKernel
.
getAs
<
CudaCompilerKernel
>
().
createModule
(
src
.
str
(),
"-arch=compute_"
+
gpuArchitecture
+
" "
+
options
,
*
this
);
string
ptx
=
compilerKernel
.
getAs
<
CudaCompilerKernel
>
().
createModule
(
src
.
str
(),
"-arch=compute_"
+
gpuArchitecture
+
" "
+
options
,
*
this
);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool
wroteCache
=
false
;
bool
wroteCache
=
false
;
try
{
try
{
ofstream
out
(
outputFile
.
c_str
());
ofstream
out
(
outputFile
.
c_str
());
...
@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
}
if
(
!
wroteCache
)
{
if
(
!
wroteCache
)
{
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2
(
cuModuleLoadDataEx
(
&
module
,
&
ptx
[
0
],
0
,
NULL
,
NULL
),
"Error loading CUDA module"
);
CHECK_RESULT2
(
cuModuleLoadDataEx
(
&
module
,
&
ptx
[
0
],
0
,
NULL
,
NULL
),
"Error loading CUDA module"
);
return
module
;
return
module
;
}
}
...
@@ -883,7 +882,7 @@ private:
...
@@ -883,7 +882,7 @@ private:
void
CudaContext
::
findMoleculeGroups
()
{
void
CudaContext
::
findMoleculeGroups
()
{
// The first time this is called, we need to identify all the molecules in the system.
// The first time this is called, we need to identify all the molecules in the system.
if
(
moleculeGroups
.
size
()
==
0
)
{
if
(
moleculeGroups
.
size
()
==
0
)
{
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
...
@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
...
@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if
(
!
forces
[
k
]
->
areParticlesIdentical
(
mol
.
atoms
[
i
],
mol2
.
atoms
[
i
]))
if
(
!
forces
[
k
]
->
areParticlesIdentical
(
mol
.
atoms
[
i
],
mol2
.
atoms
[
i
]))
identical
=
false
;
identical
=
false
;
}
}
// See if the constraints are identical.
// See if the constraints are identical.
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
{
...
@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
...
@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
}
}
if
(
valid
)
if
(
valid
)
return
;
return
;
// The list of which molecules are identical is no longer valid. We need to restore the
// The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
// again.
vector
<
int4
>
newCellOffsets
(
numAtoms
);
vector
<
int4
>
newCellOffsets
(
numAtoms
);
if
(
useDoublePrecision
)
{
if
(
useDoublePrecision
)
{
vector
<
double4
>
oldPosq
(
paddedNumAtoms
);
vector
<
double4
>
oldPosq
(
paddedNumAtoms
);
...
@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() {
...
@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() {
molPos
[
i
].
x
*=
invNumAtoms
;
molPos
[
i
].
x
*=
invNumAtoms
;
molPos
[
i
].
y
*=
invNumAtoms
;
molPos
[
i
].
y
*=
invNumAtoms
;
molPos
[
i
].
z
*=
invNumAtoms
;
molPos
[
i
].
z
*=
invNumAtoms
;
if
(
molPos
[
i
].
x
!=
molPos
[
i
].
x
)
throw
OpenMMException
(
"Particle coordinate is nan"
);
}
}
if
(
nonbonded
->
getUsePeriodic
())
{
if
(
nonbonded
->
getUsePeriodic
())
{
// Move each molecule position into the same box.
// Move each molecule position into the same box.
...
@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
...
@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait
(
&
queueEmptyCondition
,
&
queueLock
);
pthread_cond_wait
(
&
queueEmptyCondition
,
&
queueLock
);
pthread_mutex_unlock
(
&
queueLock
);
pthread_mutex_unlock
(
&
queueLock
);
}
}
/**
 * Build the list of usable CUDA device indices, ordered from most to least preferred.
 *
 * Devices are ranked by compute capability major version (higher is better), then by
 * clock rate multiplied by multiprocessor count (higher is better), and finally by
 * device index (lower is better).  Devices with compute capability 1.0 or 1.1 are
 * always skipped; devices below compute capability 1.3 are also skipped when double
 * or mixed precision is in use.
 *
 * @return the indices of all acceptable devices, in order of decreasing preference
 */
vector<int> CudaContext::getDevicePrecedence() {
    int numDevices;
    CUdevice thisDevice;
    // NOTE(review): not referenced directly in this function; presumably it is read by the
    // CHECK_RESULT macro -- confirm before renaming or removing it.
    string errorMessage = "Error initializing Context";
    // Each entry is ((major capability, speed), -deviceIndex).  The index is stored negated
    // so that, when sorting in descending order, lower indices win ties.
    vector<pair<pair<int, int>, int> > devices;

    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    for (int i = 0; i < numDevices; i++) {
        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
        int major, minor, clock, multiprocessors, speed;
        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));

        // Compute capability 1.0 and 1.1 devices are never used.
        if (major == 1 && minor < 2)
            continue;

        // Double and mixed precision need compute capability 1.3 or later.  Compare the
        // version as integers rather than as major+0.1*minor, so the result never depends
        // on floating point rounding.
        const bool belowCapability13 = (major < 1 || (major == 1 && minor < 3));
        if ((useDoublePrecision || useMixedPrecision) && belowCapability13)
            continue;

        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
        speed = clock*multiprocessors;
        pair<int, int> deviceProperties = std::make_pair(major, speed);
        devices.push_back(std::make_pair(deviceProperties, -i));
    }

    // sort first by compute capability (higher is better), then speed
    // (higher is better), and finally device index (lower is better).
    // Sorting through reverse iterators yields descending order in a single pass.
    std::sort(devices.rbegin(), devices.rend());

    vector<int> precedence;
    precedence.reserve(devices.size());
    for (size_t i = 0; i < devices.size(); i++) {
        // Undo the negation applied when the entry was recorded.
        precedence.push_back(-devices[i].second);
    }

    return precedence;
}
platforms/cuda/src/CudaExpressionUtilities.cpp
View file @
2d2f05ce
...
@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
...
@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
}
}
out
<<
");
\n
"
;
out
<<
");
\n
"
;
out
<<
"APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)
\n
"
;
out
<<
"APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)
\n
"
;
out
<<
tempType
<<
" periodicDistance_rinv = RSQRT(periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z);
\n
"
;
out
<<
tempType
<<
" periodicDistance_r2 = periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z;
\n
"
;
out
<<
tempType
<<
" periodicDistance_rinv = RSQRT(periodicDistance_r2);
\n
"
;
for
(
int
j
=
0
;
j
<
nodes
.
size
();
j
++
)
{
for
(
int
j
=
0
;
j
<
nodes
.
size
();
j
++
)
{
const
vector
<
int
>&
derivOrder
=
dynamic_cast
<
const
Operation
::
Custom
*>
(
&
nodes
[
j
]
->
getOperation
())
->
getDerivOrder
();
const
vector
<
int
>&
derivOrder
=
dynamic_cast
<
const
Operation
::
Custom
*>
(
&
nodes
[
j
]
->
getOperation
())
->
getDerivOrder
();
int
argIndex
=
-
1
;
int
argIndex
=
-
1
;
...
@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
...
@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
if
(
argIndex
==
-
1
)
if
(
argIndex
==
-
1
)
out
<<
nodeNames
[
j
]
<<
" = RECIP(periodicDistance_rinv);
\n
"
;
out
<<
nodeNames
[
j
]
<<
" = RECIP(periodicDistance_rinv);
\n
"
;
else
if
(
argIndex
==
0
)
else
if
(
argIndex
==
0
)
out
<<
nodeNames
[
j
]
<<
" = periodicDistance_delta.x*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
periodicDistance_delta.x*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
1
)
else
if
(
argIndex
==
1
)
out
<<
nodeNames
[
j
]
<<
" = periodicDistance_delta.y*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
periodicDistance_delta.y*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
2
)
else
if
(
argIndex
==
2
)
out
<<
nodeNames
[
j
]
<<
" = periodicDistance_delta.z*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
periodicDistance_delta.z*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
3
)
else
if
(
argIndex
==
3
)
out
<<
nodeNames
[
j
]
<<
" = -periodicDistance_delta.x*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
-periodicDistance_delta.x*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
4
)
else
if
(
argIndex
==
4
)
out
<<
nodeNames
[
j
]
<<
" = -periodicDistance_delta.y*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
-periodicDistance_delta.y*periodicDistance_rinv
: 0)
;
\n
"
;
else
if
(
argIndex
==
5
)
else
if
(
argIndex
==
5
)
out
<<
nodeNames
[
j
]
<<
" = -periodicDistance_delta.z*periodicDistance_rinv;
\n
"
;
out
<<
nodeNames
[
j
]
<<
" =
(periodicDistance_r2 > 0 ?
-periodicDistance_delta.z*periodicDistance_rinv
: 0)
;
\n
"
;
}
}
}
}
else
{
else
{
...
...
platforms/cuda/src/CudaIntegrationUtilities.cpp
View file @
2d2f05ce
...
@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
...
@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
),
vsiteLocalCoordsAtoms
(
NULL
),
vsiteLocalCoordsParams
(
NULL
)
{
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
),
vsiteLocalCoordsAtoms
(
NULL
),
vsiteLocalCoordsParams
(
NULL
)
{
// Create workspace arrays.
// Create workspace arrays.
lastStepSize
=
make_double2
(
0.0
,
0.0
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
{
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
{
posDelta
=
CudaArray
::
create
<
double4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
posDelta
=
CudaArray
::
create
<
double4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
vector
<
double4
>
deltas
(
posDelta
->
getSize
(),
make_double4
(
0.0
,
0.0
,
0.0
,
0.0
));
vector
<
double4
>
deltas
(
posDelta
->
getSize
(),
make_double4
(
0.0
,
0.0
,
0.0
,
0.0
));
posDelta
->
upload
(
deltas
);
posDelta
->
upload
(
deltas
);
stepSize
=
CudaArray
::
create
<
double2
>
(
context
,
1
,
"stepSize"
);
stepSize
=
CudaArray
::
create
<
double2
>
(
context
,
1
,
"stepSize"
);
vector
<
double2
>
step
(
1
,
make_double2
(
0.0
,
0.0
));
stepSize
->
upload
(
&
lastStepSize
);
stepSize
->
upload
(
step
);
}
}
else
{
else
{
posDelta
=
CudaArray
::
create
<
float4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
posDelta
=
CudaArray
::
create
<
float4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
vector
<
float4
>
deltas
(
posDelta
->
getSize
(),
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
));
vector
<
float4
>
deltas
(
posDelta
->
getSize
(),
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
));
posDelta
->
upload
(
deltas
);
posDelta
->
upload
(
deltas
);
stepSize
=
CudaArray
::
create
<
float2
>
(
context
,
1
,
"stepSize"
);
stepSize
=
CudaArray
::
create
<
float2
>
(
context
,
1
,
"stepSize"
);
vector
<
float2
>
step
(
1
,
make_float2
(
0.0
f
,
0.0
f
)
)
;
float2
lastStepSizeFloat
=
make_float2
(
0.0
f
,
0.0
f
);
stepSize
->
upload
(
step
);
stepSize
->
upload
(
&
lastStepSizeFloat
);
}
}
// Record the set of constraints and how many constraints each atom is involved in.
// Record the set of constraints and how many constraints each atom is involved in.
...
@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
...
@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete
vsiteLocalCoordsParams
;
delete
vsiteLocalCoordsParams
;
}
}
void
CudaIntegrationUtilities
::
setNextStepSize
(
double
size
)
{
if
(
size
!=
lastStepSize
.
x
||
size
!=
lastStepSize
.
y
)
{
lastStepSize
=
make_double2
(
size
,
size
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
stepSize
->
upload
(
&
lastStepSize
);
else
{
float2
lastStepSizeFloat
=
make_float2
((
float
)
size
,
(
float
)
size
);
stepSize
->
upload
(
&
lastStepSizeFloat
);
}
}
}
double
CudaIntegrationUtilities
::
getLastStepSize
()
{
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
stepSize
->
download
(
&
lastStepSize
);
else
{
float2
lastStepSizeFloat
;
stepSize
->
download
(
&
lastStepSizeFloat
);
lastStepSize
=
make_double2
(
lastStepSizeFloat
.
x
,
lastStepSizeFloat
.
y
);
}
return
lastStepSize
.
y
;
}
void
CudaIntegrationUtilities
::
applyConstraints
(
double
tol
)
{
void
CudaIntegrationUtilities
::
applyConstraints
(
double
tol
)
{
applyConstraints
(
false
,
tol
);
applyConstraints
(
false
,
tol
);
}
}
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
2d2f05ce
...
@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
...
@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu
.
getIntegrationUtilities
().
distributeForcesFromVirtualSites
();
cu
.
getIntegrationUtilities
().
distributeForcesFromVirtualSites
();
if
(
includeEnergy
)
{
if
(
includeEnergy
)
{
CudaArray
&
energyArray
=
cu
.
getEnergyBuffer
();
CudaArray
&
energyArray
=
cu
.
getEnergyBuffer
();
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
{
double
*
energy
=
(
double
*
)
cu
.
getPinnedBuffer
();
double
*
energy
=
(
double
*
)
cu
.
getPinnedBuffer
();
energyArray
.
download
(
energy
);
energyArray
.
download
(
energy
);
for
(
int
i
=
0
;
i
<
energyArray
.
getSize
();
i
++
)
for
(
int
i
=
0
;
i
<
energyArray
.
getSize
();
i
++
)
...
@@ -1458,16 +1458,24 @@ private:
...
@@ -1458,16 +1458,24 @@ private:
class
CudaCalcNonbondedForceKernel
::
SyncStreamPostComputation
:
public
CudaContext
::
ForcePostComputation
{
class
CudaCalcNonbondedForceKernel
::
SyncStreamPostComputation
:
public
CudaContext
::
ForcePostComputation
{
public:
public:
SyncStreamPostComputation
(
CudaContext
&
cu
,
CUevent
event
,
int
forceGroup
)
:
cu
(
cu
),
event
(
event
),
forceGroup
(
forceGroup
)
{
SyncStreamPostComputation
(
CudaContext
&
cu
,
CUevent
event
,
CUfunction
addEnergyKernel
,
CudaArray
&
pmeEnergyBuffer
,
int
forceGroup
)
:
cu
(
cu
),
event
(
event
),
addEnergyKernel
(
addEnergyKernel
),
pmeEnergyBuffer
(
pmeEnergyBuffer
),
forceGroup
(
forceGroup
)
{
}
}
double
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
double
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
if
((
groups
&
(
1
<<
forceGroup
))
!=
0
)
if
((
groups
&
(
1
<<
forceGroup
))
!=
0
)
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
if
(
includeEnergy
)
{
int
bufferSize
=
pmeEnergyBuffer
.
getSize
();
void
*
args
[]
=
{
&
pmeEnergyBuffer
.
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
bufferSize
};
cu
.
executeKernel
(
addEnergyKernel
,
args
,
bufferSize
);
}
return
0.0
;
return
0.0
;
}
}
private:
private:
CudaContext
&
cu
;
CudaContext
&
cu
;
CUevent
event
;
CUevent
event
;
CUfunction
addEnergyKernel
;
CudaArray
&
pmeEnergyBuffer
;
int
forceGroup
;
int
forceGroup
;
};
};
...
@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
...
@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete
pmeAtomRange
;
delete
pmeAtomRange
;
if
(
pmeAtomGridIndex
!=
NULL
)
if
(
pmeAtomGridIndex
!=
NULL
)
delete
pmeAtomGridIndex
;
delete
pmeAtomGridIndex
;
if
(
pmeEnergyBuffer
!=
NULL
)
delete
pmeEnergyBuffer
;
if
(
sort
!=
NULL
)
if
(
sort
!=
NULL
)
delete
sort
;
delete
sort
;
if
(
fft
!=
NULL
)
if
(
fft
!=
NULL
)
...
@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
int
energyElementSize
=
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
pmeEnergyBuffer
=
new
CudaArray
(
cu
,
cu
.
getNumThreadBlocks
()
*
CudaContext
::
ThreadBlockSize
,
energyElementSize
,
"pmeEnergyBuffer"
);
cu
.
clearBuffer
(
*
pmeEnergyBuffer
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
int
cufftVersion
;
int
cufftVersion
;
cufftGetVersion
(
&
cufftVersion
);
cufftGetVersion
(
&
cufftVersion
);
...
@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if
(
recipForceGroup
<
0
)
if
(
recipForceGroup
<
0
)
recipForceGroup
=
force
.
getForceGroup
();
recipForceGroup
=
force
.
getForceGroup
();
cu
.
addPreComputation
(
new
SyncStreamPreComputation
(
cu
,
pmeStream
,
pmeSyncEvent
,
recipForceGroup
));
cu
.
addPreComputation
(
new
SyncStreamPreComputation
(
cu
,
pmeStream
,
pmeSyncEvent
,
recipForceGroup
));
cu
.
addPostComputation
(
new
SyncStreamPostComputation
(
cu
,
pmeSyncEvent
,
recipForceGroup
));
cu
.
addPostComputation
(
new
SyncStreamPostComputation
(
cu
,
pmeSyncEvent
,
cu
.
getKernel
(
module
,
"addEnergy"
),
*
pmeEnergyBuffer
,
recipForceGroup
));
}
}
hasInitializedFFT
=
true
;
hasInitializedFFT
=
true
;
...
@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
}
}
if
(
includeEnergy
)
{
if
(
includeEnergy
)
{
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
usePmeStream
?
&
pmeEnergyBuffer
->
getDevicePointer
()
:
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeEvalEnergyKernel
,
computeEnergyArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeEvalEnergyKernel
,
computeEnergyArgs
,
cu
.
getNumAtoms
());
...
@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
...
@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
verlet
,
defines
,
""
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
verlet
,
defines
,
""
);
kernel1
=
cu
.
getKernel
(
module
,
"integrateVerletPart1"
);
kernel1
=
cu
.
getKernel
(
module
,
"integrateVerletPart1"
);
kernel2
=
cu
.
getKernel
(
module
,
"integrateVerletPart2"
);
kernel2
=
cu
.
getKernel
(
module
,
"integrateVerletPart2"
);
prevStepSize
=
-
1.0
;
}
}
void
CudaIntegrateVerletStepKernel
::
execute
(
ContextImpl
&
context
,
const
VerletIntegrator
&
integrator
)
{
void
CudaIntegrateVerletStepKernel
::
execute
(
ContextImpl
&
context
,
const
VerletIntegrator
&
integrator
)
{
...
@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
...
@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
int
numAtoms
=
cu
.
getNumAtoms
();
int
numAtoms
=
cu
.
getNumAtoms
();
int
paddedNumAtoms
=
cu
.
getPaddedNumAtoms
();
int
paddedNumAtoms
=
cu
.
getPaddedNumAtoms
();
double
dt
=
integrator
.
getStepSize
();
double
dt
=
integrator
.
getStepSize
();
if
(
dt
!=
prevStepSize
)
{
cu
.
getIntegrationUtilities
().
setNextStepSize
(
dt
);
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
())
{
vector
<
double2
>
stepSizeVec
(
1
);
stepSizeVec
[
0
]
=
make_double2
(
dt
,
dt
);
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
stepSizeVec
);
}
else
{
vector
<
float2
>
stepSizeVec
(
1
);
stepSizeVec
[
0
]
=
make_float2
((
float
)
dt
,
(
float
)
dt
);
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
stepSizeVec
);
}
prevStepSize
=
dt
;
}
// Call the first integration kernel.
// Call the first integration kernel.
...
@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
...
@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
double
temperature
=
integrator
.
getTemperature
();
double
temperature
=
integrator
.
getTemperature
();
double
friction
=
integrator
.
getFriction
();
double
friction
=
integrator
.
getFriction
();
double
stepSize
=
integrator
.
getStepSize
();
double
stepSize
=
integrator
.
getStepSize
();
cu
.
getIntegrationUtilities
().
setNextStepSize
(
stepSize
);
if
(
temperature
!=
prevTemp
||
friction
!=
prevFriction
||
stepSize
!=
prevStepSize
)
{
if
(
temperature
!=
prevTemp
||
friction
!=
prevFriction
||
stepSize
!=
prevStepSize
)
{
// Calculate the integration parameters.
// Calculate the integration parameters.
...
@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
...
@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p
[
1
]
=
fscale
;
p
[
1
]
=
fscale
;
p
[
2
]
=
noisescale
;
p
[
2
]
=
noisescale
;
params
->
upload
(
p
);
params
->
upload
(
p
);
double2
ss
=
make_double2
(
0
,
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
}
else
{
else
{
vector
<
float
>
p
(
params
->
getSize
());
vector
<
float
>
p
(
params
->
getSize
());
...
@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
...
@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p
[
1
]
=
(
float
)
fscale
;
p
[
1
]
=
(
float
)
fscale
;
p
[
2
]
=
(
float
)
noisescale
;
p
[
2
]
=
(
float
)
noisescale
;
params
->
upload
(
p
);
params
->
upload
(
p
);
float2
ss
=
make_float2
(
0
,
(
float
)
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
}
prevTemp
=
temperature
;
prevTemp
=
temperature
;
prevFriction
=
friction
;
prevFriction
=
friction
;
...
@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
...
@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Update the time and step count.
// Update the time and step count.
double
dt
,
time
;
double
dt
=
cu
.
getIntegrationUtilities
().
getLastStepSize
();
double
time
=
cu
.
getTime
()
+
dt
;
if
(
useDouble
)
{
if
(
useDouble
)
{
double2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
time
=
maxTime
;
// Avoid round-off error
}
}
else
{
else
{
float2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSizeFloat
)
if
(
dt
==
maxStepSizeFloat
)
time
=
maxTime
;
// Avoid round-off error
time
=
maxTime
;
// Avoid round-off error
}
}
...
@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
...
@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Update the time and step count.
// Update the time and step count.
double
dt
,
time
;
double
dt
=
cu
.
getIntegrationUtilities
().
getLastStepSize
();
double
time
=
cu
.
getTime
()
+
dt
;
if
(
useDouble
)
{
if
(
useDouble
)
{
double2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
time
=
maxTime
;
// Avoid round-off error
}
}
else
{
else
{
float2
stepSize
;
cu
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cu
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSizeFloat
)
if
(
dt
==
maxStepSizeFloat
)
time
=
maxTime
;
// Avoid round-off error
time
=
maxTime
;
// Avoid round-off error
}
}
...
@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
...
@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
summedValue
=
new
CudaArray
(
cu
,
1
,
elementSize
,
"summedValue"
);
summedValue
=
new
CudaArray
(
cu
,
1
,
elementSize
,
"summedValue"
);
perDofValues
=
new
CudaParameterSet
(
cu
,
integrator
.
getNumPerDofVariables
(),
3
*
system
.
getNumParticles
(),
"perDofVariables"
,
false
,
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
());
perDofValues
=
new
CudaParameterSet
(
cu
,
integrator
.
getNumPerDofVariables
(),
3
*
system
.
getNumParticles
(),
"perDofVariables"
,
false
,
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
());
cu
.
addReorderListener
(
new
ReorderListener
(
cu
,
*
perDofValues
,
localPerDofValuesFloat
,
localPerDofValuesDouble
,
deviceValuesAreCurrent
));
cu
.
addReorderListener
(
new
ReorderListener
(
cu
,
*
perDofValues
,
localPerDofValuesFloat
,
localPerDofValuesDouble
,
deviceValuesAreCurrent
));
prevStepSize
=
-
1.0
;
SimTKOpenMMUtilities
::
setRandomNumberSeed
(
integrator
.
getRandomNumberSeed
());
SimTKOpenMMUtilities
::
setRandomNumberSeed
(
integrator
.
getRandomNumberSeed
());
}
}
...
@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
...
@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
}
}
localValuesAreCurrent
=
false
;
localValuesAreCurrent
=
false
;
double
stepSize
=
integrator
.
getStepSize
();
double
stepSize
=
integrator
.
getStepSize
();
if
(
stepSize
!=
prevStepSize
)
{
recordGlobalValue
(
stepSize
,
GlobalTarget
(
DT
,
dtVariableIndex
));
recordGlobalValue
(
stepSize
,
GlobalTarget
(
DT
,
dtVariableIndex
));
}
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
double
value
=
context
.
getParameter
(
parameterNames
[
i
]);
double
value
=
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
globalValuesDouble
[
parameterVariableIndex
[
i
]])
{
if
(
value
!=
globalValuesDouble
[
parameterVariableIndex
[
i
]])
{
...
@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
...
@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
void
CudaIntegrateCustomStepKernel
::
recordGlobalValue
(
double
value
,
GlobalTarget
target
)
{
void
CudaIntegrateCustomStepKernel
::
recordGlobalValue
(
double
value
,
GlobalTarget
target
)
{
switch
(
target
.
type
)
{
switch
(
target
.
type
)
{
case
DT
:
case
DT
:
if
(
value
!=
globalValuesDouble
[
dtVariableIndex
])
deviceGlobalsAreCurrent
=
false
;
globalValuesDouble
[
dtVariableIndex
]
=
value
;
globalValuesDouble
[
dtVariableIndex
]
=
value
;
deviceGlobalsAreCurrent
=
false
;
cu
.
getIntegrationUtilities
().
setNextStepSize
(
value
);
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
())
{
double
size
[]
=
{
0
,
value
};
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
size
);
}
else
{
float
size
[]
=
{
0
,
(
float
)
value
};
cu
.
getIntegrationUtilities
().
getStepSize
().
upload
(
size
);
}
prevStepSize
=
value
;
break
;
break
;
case
VARIABLE
:
case
VARIABLE
:
case
PARAMETER
:
case
PARAMETER
:
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
2d2f05ce
...
@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
...
@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
}
}
CudaPlatform
::
PlatformData
::
PlatformData
(
ContextImpl
*
context
,
const
System
&
system
,
const
string
&
deviceIndexProperty
,
const
string
&
blockingProperty
,
const
string
&
precisionProperty
,
CudaPlatform
::
PlatformData
::
PlatformData
(
ContextImpl
*
context
,
const
System
&
system
,
const
string
&
deviceIndexProperty
,
const
string
&
blockingProperty
,
const
string
&
precisionProperty
,
const
string
&
cpuPmeProperty
,
const
string
&
compilerProperty
,
const
string
&
tempProperty
,
const
string
&
hostCompilerProperty
)
:
context
(
context
),
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
{
const
string
&
cpuPmeProperty
,
const
string
&
compilerProperty
,
const
string
&
tempProperty
,
const
string
&
hostCompilerProperty
)
:
context
(
context
),
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
,
hasInitializedContexts
(
false
)
{
bool
blocking
=
(
blockingProperty
==
"true"
);
bool
blocking
=
(
blockingProperty
==
"true"
);
vector
<
string
>
devices
;
vector
<
string
>
devices
;
size_t
searchPos
=
0
,
nextPos
;
size_t
searchPos
=
0
,
nextPos
;
...
@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() {
...
@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
}
void
CudaPlatform
::
PlatformData
::
initializeContexts
(
const
System
&
system
)
{
void
CudaPlatform
::
PlatformData
::
initializeContexts
(
const
System
&
system
)
{
if
(
hasInitializedContexts
)
return
;
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
contexts
[
i
]
->
initialize
();
contexts
[
i
]
->
initialize
();
hasInitializedContexts
=
true
;
}
}
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
...
...
platforms/cuda/src/kernels/customCentroidBond.cu
View file @
2d2f05ce
...
@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
...
@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/**
/**
* Compute the forces on groups based on the bonds.
* Compute the forces on groups based on the bonds.
*/
*/
extern
"C"
__global__
void
computeGroupForces
(
unsigned
long
long
*
__restrict__
groupForce
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
centerPositions
,
extern
"C"
__global__
void
computeGroupForces
(
unsigned
long
long
*
__restrict__
groupForce
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
centerPositions
,
const
int
*
__restrict__
bondGroups
const
int
*
__restrict__
bondGroups
EXTRA_ARGS
)
{
EXTRA_ARGS
)
{
real
energy
=
0
;
mixed
energy
=
0
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_BONDS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_BONDS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
COMPUTE_FORCE
COMPUTE_FORCE
}
}
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
2d2f05ce
...
@@ -13,7 +13,7 @@ typedef struct {
...
@@ -13,7 +13,7 @@ typedef struct {
/**
/**
* Compute a force based on pair interactions.
* Compute a force based on pair interactions.
*/
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
...
@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
mixed
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
// First loop: process tiles that contain exclusions.
...
...
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
View file @
2d2f05ce
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
*/
extern
"C"
__global__
void
computePerParticleEnergy
(
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
extern
"C"
__global__
void
computePerParticleEnergy
(
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
real
energy
=
0
;
mixed
energy
=
0
;
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load the derivatives
// Load the derivatives
...
...
platforms/cuda/src/kernels/customHbondForce.cu
View file @
2d2f05ce
...
@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
...
@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/**
/**
* Compute forces on donors.
* Compute forces on donors.
*/
*/
extern
"C"
__global__
void
computeDonorForces
(
unsigned
long
long
*
__restrict__
force
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
extern
"C"
__global__
void
computeDonorForces
(
unsigned
long
long
*
__restrict__
force
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
extern
__shared__
real4
posBuffer
[];
extern
__shared__
real4
posBuffer
[];
real
energy
=
0
;
mixed
energy
=
0
;
real3
f1
=
make_real3
(
0
);
real3
f1
=
make_real3
(
0
);
real3
f2
=
make_real3
(
0
);
real3
f2
=
make_real3
(
0
);
real3
f3
=
make_real3
(
0
);
real3
f3
=
make_real3
(
0
);
...
@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
...
@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
/**
/**
* Compute forces on acceptors.
* Compute forces on acceptors.
*/
*/
extern
"C"
__global__
void
computeAcceptorForces
(
unsigned
long
long
*
__restrict__
force
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
extern
"C"
__global__
void
computeAcceptorForces
(
unsigned
long
long
*
__restrict__
force
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
...
...
platforms/cuda/src/kernels/customManyParticle.cu
View file @
2d2f05ce
...
@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
...
@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
* Compute the interaction.
* Compute the interaction.
*/
*/
extern
"C"
__global__
void
computeInteraction
(
extern
"C"
__global__
void
computeInteraction
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
neighbors
,
const
int
*
__restrict__
neighborStartIndex
,
const
int
*
__restrict__
neighbors
,
const
int
*
__restrict__
neighborStartIndex
...
@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
...
@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
,
int
*
__restrict__
exclusions
,
int
*
__restrict__
exclusionStartIndex
,
int
*
__restrict__
exclusions
,
int
*
__restrict__
exclusionStartIndex
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
real
energy
=
0
.0
f
;
mixed
energy
=
0
;
// Loop over particles to be the first one in the set.
// Loop over particles to be the first one in the set.
...
...
platforms/cuda/src/kernels/customNonbondedGroups.cu
View file @
2d2f05ce
...
@@ -9,14 +9,14 @@ typedef struct {
...
@@ -9,14 +9,14 @@ typedef struct {
}
AtomData
;
}
AtomData
;
extern
"C"
__global__
void
computeInteractionGroups
(
extern
"C"
__global__
void
computeInteractionGroups
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
// global warpIndex
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
// global warpIndex
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
real
energy
=
0
.0
f
;
mixed
energy
=
0
;
__shared__
AtomData
localData
[
LOCAL_MEMORY_SIZE
];
__shared__
AtomData
localData
[
LOCAL_MEMORY_SIZE
];
const
unsigned
int
startTile
=
FIRST_TILE
+
warp
*
(
LAST_TILE
-
FIRST_TILE
)
/
totalWarps
;
const
unsigned
int
startTile
=
FIRST_TILE
+
warp
*
(
LAST_TILE
-
FIRST_TILE
)
/
totalWarps
;
...
...
Prev
1
2
3
4
5
6
7
8
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment