Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
68f8aed8
Commit
68f8aed8
authored
Jun 24, 2013
by
Peter
Browse files
Merge branch 'pme' of
https://github.com/peastman/openmm
into pme
parents
84acc13d
5e0b0e3a
Changes
17
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
1310 additions
and
120 deletions
+1310
-120
cmake_modules/FindFFTW.cmake
cmake_modules/FindFFTW.cmake
+25
-0
olla/include/openmm/kernels.h
olla/include/openmm/kernels.h
+64
-0
platforms/cuda/include/CudaPlatform.h
platforms/cuda/include/CudaPlatform.h
+2
-1
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+12
-0
platforms/cuda/src/CudaContext.h
platforms/cuda/src/CudaContext.h
+59
-1
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+184
-109
platforms/cuda/src/CudaKernels.h
platforms/cuda/src/CudaKernels.h
+6
-1
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+3
-3
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+13
-3
platforms/cuda/tests/TestCudaRandom.cpp
platforms/cuda/tests/TestCudaRandom.cpp
+1
-1
platforms/cuda/tests/TestCudaSort.cpp
platforms/cuda/tests/TestCudaSort.cpp
+1
-1
plugins/cpupme/CMakeLists.txt
plugins/cpupme/CMakeLists.txt
+92
-0
plugins/cpupme/include/internal/windowsExportPme.h
plugins/cpupme/include/internal/windowsExportPme.h
+41
-0
plugins/cpupme/src/CpuPmeKernelFactory.cpp
plugins/cpupme/src/CpuPmeKernelFactory.cpp
+54
-0
plugins/cpupme/src/CpuPmeKernelFactory.h
plugins/cpupme/src/CpuPmeKernelFactory.h
+50
-0
plugins/cpupme/src/CpuPmeKernels.cpp
plugins/cpupme/src/CpuPmeKernels.cpp
+614
-0
plugins/cpupme/src/CpuPmeKernels.h
plugins/cpupme/src/CpuPmeKernels.h
+89
-0
No files found.
cmake_modules/FindFFTW.cmake
0 → 100644
View file @
68f8aed8
# - Find FFTW
# Find the native FFTW includes and library
#
# FFTW_INCLUDES - where to find fftw3.h
# FFTW_LIBRARY - the main FFTW library.
# FFTW_THREADS_LIBRARY - the FFTW multithreading support library.
# FFTW_FOUND - True if FFTW found.
if
(
FFTW_INCLUDES
)
# Already in cache, be silent
set
(
FFTW_FIND_QUIETLY TRUE
)
endif
(
FFTW_INCLUDES
)
find_path
(
FFTW_INCLUDES fftw3.h
)
find_library
(
FFTW_LIBRARY NAMES fftw3f
)
find_library
(
FFTW_THREADS_LIBRARY NAMES fftw3f_threads
)
# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
# all listed variables are TRUE
include
(
FindPackageHandleStandardArgs
)
find_package_handle_standard_args
(
FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES
)
find_package_handle_standard_args
(
FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES
)
mark_as_advanced
(
FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES
)
olla/include/openmm/kernels.h
View file @
68f8aed8
...
...
@@ -1152,6 +1152,70 @@ public:
virtual
void
execute
(
ContextImpl
&
context
)
=
0
;
};
/**
* This kernel performs the reciprocal space calculation for PME. In most cases, this
* calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
* In some cases it may want to outsource the work to a different kernel. In particular,
* GPU based platforms sometimes use a CPU based implementation provided by a separate
* plugin.
*/
class
CalcPmeReciprocalForceKernel
:
public
KernelImpl
{
public:
class
IO
;
static
std
::
string
Name
()
{
return
"CalcPmeReciprocalForce"
;
}
CalcPmeReciprocalForceKernel
(
std
::
string
name
,
const
Platform
&
platform
)
:
KernelImpl
(
name
,
platform
)
{
}
/**
* Initialize the kernel.
*
* @param gridx the x size of the PME grid
* @param gridy the y size of the PME grid
* @param gridz the z size of the PME grid
* @param numParticles the number of particles in the system
* @param alpha the Ewald blending parameter
*/
virtual
void
initialize
(
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
double
alpha
)
=
0
;
/**
* Begin computing the force and energy.
*
* @param io an object that coordinates data transfer
* @param periodicBoxSize the size of the periodic box (measured in nm)
* @param includeEnergy true if potential energy should be computed
*/
virtual
void
beginComputation
(
IO
&
io
,
Vec3
periodicBoxSize
,
bool
includeEnergy
)
=
0
;
/**
* Finish computing the force and energy.
*
* @param io an object that coordinates data transfer
* @return the potential energy due to the PME reciprocal space interactions
*/
virtual
double
finishComputation
(
IO
&
io
)
=
0
;
};
/**
* Any class that uses CalcPmeReciprocalForceKernel should create an implementation of this
* class, then pass it to the kernel to manage communication with it.
*/
class
CalcPmeReciprocalForceKernel
::
IO
{
public:
/**
* Get a pointer to the atom charges and positions. This array should contain four
* elements for each atom: x, y, z, and q in that order.
*/
virtual
float
*
getPosq
()
=
0
;
/**
* Record the forces calculated by the kernel.
*
* @param force an array containing four elements for each atom. The first three
* are the x, y, and z components of the force, while the fourth element
* should be ignored.
*/
virtual
void
setForce
(
float
*
force
)
=
0
;
};
}
// namespace OpenMM
#endif
/*OPENMM_KERNELS_H_*/
platforms/cuda/include/CudaPlatform.h
View file @
68f8aed8
...
...
@@ -99,11 +99,12 @@ public:
class
OPENMM_EXPORT_CUDA
CudaPlatform
::
PlatformData
{
public:
PlatformData
(
const
System
&
system
,
const
std
::
string
&
deviceIndexProperty
,
const
std
::
string
&
blockingProperty
,
const
std
::
string
&
precisionProperty
,
PlatformData
(
ContextImpl
*
context
,
const
System
&
system
,
const
std
::
string
&
deviceIndexProperty
,
const
std
::
string
&
blockingProperty
,
const
std
::
string
&
precisionProperty
,
const
std
::
string
&
compilerProperty
,
const
std
::
string
&
tempProperty
);
~
PlatformData
();
void
initializeContexts
(
const
System
&
system
);
void
syncContexts
();
ContextImpl
*
context
;
std
::
vector
<
CudaContext
*>
contexts
;
std
::
vector
<
double
>
contextEnergy
;
bool
removeCM
,
peerAccessSupported
;
...
...
platforms/cuda/src/CudaContext.cpp
View file @
68f8aed8
...
...
@@ -231,6 +231,10 @@ CudaContext::~CudaContext() {
delete
forces
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
delete
reorderListeners
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
preComputations
.
size
();
i
++
)
delete
preComputations
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
postComputations
.
size
();
i
++
)
delete
postComputations
[
i
];
if
(
pinnedBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedBuffer
);
if
(
posq
!=
NULL
)
...
...
@@ -1102,6 +1106,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) {
reorderListeners
.
push_back
(
listener
);
}
void
CudaContext
::
addPreComputation
(
ForcePreComputation
*
computation
)
{
preComputations
.
push_back
(
computation
);
}
void
CudaContext
::
addPostComputation
(
ForcePostComputation
*
computation
)
{
postComputations
.
push_back
(
computation
);
}
struct
CudaContext
::
WorkThread
::
ThreadData
{
ThreadData
(
std
::
queue
<
CudaContext
::
WorkTask
*>&
tasks
,
bool
&
waiting
,
bool
&
finished
,
pthread_mutex_t
&
queueLock
,
pthread_cond_t
&
waitForTaskCondition
,
pthread_cond_t
&
queueEmptyCondition
)
:
...
...
platforms/cuda/src/CudaContext.h
View file @
68f8aed8
...
...
@@ -70,6 +70,8 @@ public:
class
WorkTask
;
class
WorkThread
;
class
ReorderListener
;
class
ForcePreComputation
;
class
ForcePostComputation
;
static
const
int
ThreadBlockSize
;
static
const
int
TileSize
;
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
...
...
@@ -454,6 +456,28 @@ public:
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
return
reorderListeners
;
}
/**
* Add a pre-computation that should be called at the very start of force and energy evalutations.
* The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addPreComputation
(
ForcePreComputation
*
computation
);
/**
* Get the list of ForcePreComputations.
*/
std
::
vector
<
ForcePreComputation
*>&
getPreComputations
()
{
return
preComputations
;
}
/**
* Add a post-computation that should be called at the very end of force and energy evalutations.
* The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addPostComputation
(
ForcePostComputation
*
computation
);
/**
* Get the list of ForcePostComputations.
*/
std
::
vector
<
ForcePostComputation
*>&
getPostComputations
()
{
return
postComputations
;
}
/**
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions
...
...
@@ -519,6 +543,8 @@ private:
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
std
::
vector
<
int
>
autoclearBufferSizes
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
std
::
vector
<
ForcePreComputation
*>
preComputations
;
std
::
vector
<
ForcePostComputation
*>
postComputations
;
CudaIntegrationUtilities
*
integration
;
CudaExpressionUtilities
*
expression
;
CudaBondedUtilities
*
bonded
;
...
...
@@ -580,7 +606,7 @@ private:
/**
* This abstract class defines a function to be executed whenever atoms get reordered.
* Objects that need to know when reordering happens should create a
r
eorderListener
* Objects that need to know when reordering happens should create a
R
eorderListener
* and register it by calling addReorderListener().
*/
class
CudaContext
::
ReorderListener
{
...
...
@@ -590,6 +616,38 @@ public:
}
};
/**
* This abstract class defines a function to be executed at the very beginning of force and
* energy evaluation, before any other calculation has been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePreComputation, register it by calling addForcePreComputation().
*/
class
CudaContext
::
ForcePreComputation
{
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
*/
virtual
void
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
=
0
;
};
/**
* This abstract class defines a function to be executed at the very end of force and
* energy evaluation, after all other calculations have been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePostComputation, register it by calling addForcePostComputation().
*/
class
CudaContext
::
ForcePostComputation
{
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
* @return an optional contribution to add to the potential energy. */
virtual
double
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
=
0
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CUDACONTEXT_H_*/
platforms/cuda/src/CudaKernels.cpp
View file @
68f8aed8
...
...
@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
void
CudaCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
cu
.
setAsCurrent
();
cu
.
clearAutoclearBuffers
();
for
(
vector
<
CudaContext
::
ForcePreComputation
*>::
iterator
iter
=
cu
.
getPreComputations
().
begin
();
iter
!=
cu
.
getPreComputations
().
end
();
++
iter
)
(
*
iter
)
->
computeForceAndEnergy
(
includeForces
,
includeEnergy
,
groups
);
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
bool
includeNonbonded
=
((
groups
&
(
1
<<
nb
.
getForceGroup
()))
!=
0
);
cu
.
setComputeForceCount
(
cu
.
getComputeForceCount
()
+
1
);
cu
.
clearAutoclearBuffers
();
if
(
includeNonbonded
)
nb
.
prepareInteractions
();
}
...
...
@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu
.
getBondedUtilities
().
computeInteractions
(
groups
);
if
((
groups
&
(
1
<<
cu
.
getNonbondedUtilities
().
getForceGroup
()))
!=
0
)
cu
.
getNonbondedUtilities
().
computeInteractions
();
cu
.
getIntegrationUtilities
().
distributeForcesFromVirtualSites
();
double
sum
=
0.0
;
for
(
vector
<
CudaContext
::
ForcePostComputation
*>::
iterator
iter
=
cu
.
getPostComputations
().
begin
();
iter
!=
cu
.
getPostComputations
().
end
();
++
iter
)
sum
+=
(
*
iter
)
->
computeForceAndEnergy
(
includeForces
,
includeEnergy
,
groups
);
cu
.
getIntegrationUtilities
().
distributeForcesFromVirtualSites
();
if
(
includeEnergy
)
{
CudaArray
&
energyArray
=
cu
.
getEnergyBuffer
();
if
(
cu
.
getUseDoublePrecision
())
{
...
...
@@ -1330,6 +1334,59 @@ private:
const
NonbondedForce
&
force
;
};
class
CudaCalcNonbondedForceKernel
::
PmeIO
:
public
CalcPmeReciprocalForceKernel
::
IO
{
public:
PmeIO
(
CudaContext
&
cu
,
CUfunction
addForcesKernel
)
:
cu
(
cu
),
addForcesKernel
(
addForcesKernel
),
forceTemp
(
NULL
)
{
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double4
)
:
sizeof
(
float4
));
forceTemp
=
new
CudaArray
(
cu
,
cu
.
getNumAtoms
(),
elementSize
,
"PmeForce"
);
}
~
PmeIO
()
{
if
(
forceTemp
!=
NULL
)
delete
forceTemp
;
}
float
*
getPosq
()
{
cu
.
setAsCurrent
();
cu
.
getPosq
().
download
(
posq
);
return
(
float
*
)
&
posq
[
0
];
}
void
setForce
(
float
*
force
)
{
forceTemp
->
upload
(
force
);
void
*
args
[]
=
{
&
forceTemp
->
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
()};
cu
.
executeKernel
(
addForcesKernel
,
args
,
cu
.
getNumAtoms
());
}
private:
CudaContext
&
cu
;
vector
<
float4
>
posq
;
CudaArray
*
forceTemp
;
CUfunction
addForcesKernel
;
};
class
CudaCalcNonbondedForceKernel
::
PmePreComputation
:
public
CudaContext
::
ForcePreComputation
{
public:
PmePreComputation
(
CudaContext
&
cu
,
Kernel
&
pme
,
CalcPmeReciprocalForceKernel
::
IO
&
io
)
:
cu
(
cu
),
pme
(
pme
),
io
(
io
)
{
}
void
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
Vec3
boxSize
(
cu
.
getPeriodicBoxSize
().
x
,
cu
.
getPeriodicBoxSize
().
y
,
cu
.
getPeriodicBoxSize
().
z
);
pme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
beginComputation
(
io
,
boxSize
,
includeEnergy
);
}
private:
CudaContext
&
cu
;
Kernel
pme
;
CalcPmeReciprocalForceKernel
::
IO
&
io
;
};
class
CudaCalcNonbondedForceKernel
::
PmePostComputation
:
public
CudaContext
::
ForcePostComputation
{
public:
PmePostComputation
(
Kernel
&
pme
,
CalcPmeReciprocalForceKernel
::
IO
&
io
)
:
pme
(
pme
),
io
(
io
)
{
}
double
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
return
pme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
finishComputation
(
io
);
}
private:
Kernel
pme
;
CalcPmeReciprocalForceKernel
::
IO
&
io
;
};
CudaCalcNonbondedForceKernel
::~
CudaCalcNonbondedForceKernel
()
{
cu
.
setAsCurrent
();
if
(
sigmaEpsilon
!=
NULL
)
...
...
@@ -1354,6 +1411,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete
pmeAtomGridIndex
;
if
(
sort
!=
NULL
)
delete
sort
;
if
(
pmeio
!=
NULL
)
delete
pmeio
;
if
(
hasInitializedFFT
)
{
cufftDestroy
(
fftForward
);
cufftDestroy
(
fftBackward
);
...
...
@@ -1457,7 +1516,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
else
dispersionCoefficient
=
0.0
;
alpha
=
0
;
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
Ewald
)
{
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
Ewald
&&
cu
.
getContextIndex
()
==
0
)
{
// Compute the Ewald parameters.
int
kmaxx
,
kmaxy
,
kmaxz
;
...
...
@@ -1465,7 +1524,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"USE_EWALD"
]
=
"1"
;
ewaldSelfEnergy
=
(
cu
.
getContextIndex
()
==
0
?
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
)
:
0.0
)
;
ewaldSelfEnergy
=
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
);
// Create the reciprocal space kernels.
...
...
@@ -1484,7 +1543,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double2
)
:
sizeof
(
float2
));
cosSinSums
=
new
CudaArray
(
cu
,
(
2
*
kmaxx
-
1
)
*
(
2
*
kmaxy
-
1
)
*
(
2
*
kmaxz
-
1
),
elementSize
,
"cosSinSums"
);
}
else
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
PME
)
{
else
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
PME
&&
cu
.
getContextIndex
()
==
0
)
{
// Compute the PME parameters.
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
...
...
@@ -1497,7 +1556,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"USE_EWALD"
]
=
"1"
;
ewaldSelfEnergy
=
(
cu
.
getContextIndex
()
==
0
?
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
)
:
0.0
)
;
ewaldSelfEnergy
=
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
);
pmeDefines
[
"PME_ORDER"
]
=
cu
.
intToString
(
PmeOrder
);
pmeDefines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
numParticles
);
pmeDefines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
...
...
@@ -1510,6 +1569,21 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if
(
cu
.
getUseDoublePrecision
())
pmeDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
bool
useCpuPme
=
true
;
if
(
useCpuPme
)
{
try
{
cpuPme
=
getPlatform
().
createKernel
(
CalcPmeReciprocalForceKernel
::
Name
(),
*
cu
.
getPlatformData
().
context
);
cpuPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
initialize
(
gridSizeX
,
gridSizeY
,
gridSizeZ
,
numParticles
,
alpha
);
CUfunction
addForcesKernel
=
cu
.
getKernel
(
module
,
"addForces"
);
pmeio
=
new
PmeIO
(
cu
,
addForcesKernel
);
cu
.
addPreComputation
(
new
PmePreComputation
(
cu
,
cpuPme
,
*
pmeio
));
cu
.
addPostComputation
(
new
PmePostComputation
(
cpuPme
,
*
pmeio
));
}
catch
(
OpenMMException
&
ex
)
{
// The CPU PME plugin isn't available.
}
}
if
(
pmeio
==
NULL
)
{
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"gridSpreadCharge"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
...
...
@@ -1618,6 +1692,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
}
}
}
}
else
ewaldSelfEnergy
=
0.0
;
...
...
@@ -1654,13 +1729,14 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
}
double
CudaCalcNonbondedForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
bool
includeDirect
,
bool
includeReciprocal
)
{
if
(
cosSinSums
!=
NULL
&&
cu
.
getContextIndex
()
==
0
&&
includeReciprocal
)
{
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
if
(
cosSinSums
!=
NULL
&&
includeReciprocal
)
{
void
*
sumsArgs
[]
=
{
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cosSinSums
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
ewaldSumsKernel
,
sumsArgs
,
cosSinSums
->
getSize
());
void
*
forcesArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cosSinSums
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
}
if
(
directPmeGrid
!=
NULL
&&
cu
.
getContextIndex
()
==
0
&&
includeReciprocal
)
{
if
(
directPmeGrid
!=
NULL
&&
includeReciprocal
)
{
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
());
...
...
@@ -1699,7 +1775,6 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
(),
128
);
}
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
double4
boxSize
=
cu
.
getPeriodicBoxSize
();
energy
+=
dispersionCoefficient
/
(
boxSize
.
x
*
boxSize
.
y
*
boxSize
.
z
);
...
...
platforms/cuda/src/CudaKernels.h
View file @
68f8aed8
...
...
@@ -557,7 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
,
pmeio
(
NULL
)
{
}
~
CudaCalcNonbondedForceKernel
();
/**
...
...
@@ -596,6 +596,9 @@ private:
const
char
*
getMaxValue
()
const
{
return
"make_int2(INT_MAX, INT_MAX)"
;}
const
char
*
getSortKey
()
const
{
return
"value.y"
;}
};
class
PmeIO
;
class
PmePreComputation
;
class
PmePostComputation
;
CudaContext
&
cu
;
bool
hasInitializedFFT
;
CudaArray
*
sigmaEpsilon
;
...
...
@@ -609,6 +612,8 @@ private:
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaSort
*
sort
;
Kernel
cpuPme
;
PmeIO
*
pmeio
;
cufftHandle
fftForward
;
cufftHandle
fftBackward
;
CUfunction
ewaldSumsKernel
;
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
68f8aed8
...
...
@@ -147,7 +147,7 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
getPropertyDefaultValue
(
CudaTempDirectory
())
:
properties
.
find
(
CudaTempDirectory
())
->
second
);
transform
(
blockingPropValue
.
begin
(),
blockingPropValue
.
end
(),
blockingPropValue
.
begin
(),
::
tolower
);
transform
(
precisionPropValue
.
begin
(),
precisionPropValue
.
end
(),
precisionPropValue
.
begin
(),
::
tolower
);
context
.
setPlatformData
(
new
PlatformData
(
context
.
getSystem
(),
devicePropValue
,
blockingPropValue
,
precisionPropValue
,
compilerPropValue
,
tempPropValue
));
context
.
setPlatformData
(
new
PlatformData
(
&
context
,
context
.
getSystem
(),
devicePropValue
,
blockingPropValue
,
precisionPropValue
,
compilerPropValue
,
tempPropValue
));
}
void
CudaPlatform
::
contextDestroyed
(
ContextImpl
&
context
)
const
{
...
...
@@ -155,8 +155,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
delete
data
;
}
CudaPlatform
::
PlatformData
::
PlatformData
(
const
System
&
system
,
const
string
&
deviceIndexProperty
,
const
string
&
blockingProperty
,
const
string
&
precisionProperty
,
const
string
&
compilerProperty
,
const
string
&
tempProperty
)
:
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
{
CudaPlatform
::
PlatformData
::
PlatformData
(
ContextImpl
*
context
,
const
System
&
system
,
const
string
&
deviceIndexProperty
,
const
string
&
blockingProperty
,
const
string
&
precisionProperty
,
const
string
&
compilerProperty
,
const
string
&
tempProperty
)
:
context
(
context
),
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
{
bool
blocking
=
(
blockingProperty
==
"true"
);
vector
<
string
>
devices
;
size_t
searchPos
=
0
,
nextPos
;
...
...
platforms/cuda/src/kernels/pme.cu
View file @
68f8aed8
...
...
@@ -271,3 +271,13 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
q
*
force
.
z
*
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
0x100000000
));
}
}
extern
"C"
__global__
void
addForces
(
const
real4
*
__restrict__
forces
,
unsigned
long
long
*
__restrict__
forceBuffers
)
{
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
f
=
forces
[
atom
];
forceBuffers
[
atom
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
x
*
0x100000000
));
forceBuffers
[
atom
+
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
y
*
0x100000000
));
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
z
*
0x100000000
));
}
}
\ No newline at end of file
platforms/cuda/tests/TestCudaRandom.cpp
View file @
68f8aed8
...
...
@@ -54,7 +54,7 @@ void testGaussian() {
System
system
;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
system
.
addParticle
(
1.0
);
CudaPlatform
::
PlatformData
platformData
(
system
,
""
,
"true"
,
platform
.
getPropertyDefaultValue
(
"CudaPrecision"
),
CudaPlatform
::
PlatformData
platformData
(
NULL
,
system
,
""
,
"true"
,
platform
.
getPropertyDefaultValue
(
"CudaPrecision"
),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaCompiler
()),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaTempDirectory
()));
CudaContext
&
context
=
*
platformData
.
contexts
[
0
];
context
.
initialize
();
...
...
platforms/cuda/tests/TestCudaSort.cpp
View file @
68f8aed8
...
...
@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {
System
system
;
system
.
addParticle
(
0.0
);
CudaPlatform
::
PlatformData
platformData
(
system
,
""
,
"true"
,
platform
.
getPropertyDefaultValue
(
"CudaPrecision"
),
CudaPlatform
::
PlatformData
platformData
(
NULL
,
system
,
""
,
"true"
,
platform
.
getPropertyDefaultValue
(
"CudaPrecision"
),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaCompiler
()),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaTempDirectory
()));
CudaContext
&
context
=
*
platformData
.
contexts
[
0
];
context
.
initialize
();
...
...
plugins/cpupme/CMakeLists.txt
0 → 100644
View file @
68f8aed8
#---------------------------------------------------
# OpenMM CPU PME Plugin
#
# Creates plugin library, base name=OpenMMPME.
# Default libraries are shared & optimized.
#
# Windows:
# OpenMMPME[_d].dll
# OpenMMPME[_d].lib
# Unix:
# libOpenMMPME[_d].so
#----------------------------------------------------
IF
(
APPLE
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.6"
)
ENDIF
(
APPLE
)
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET
(
OPENMM_SOURCE_SUBDIRS .
)
# Collect up information about the version of the OpenMM library we're building
# and make it available to the code so it can be built into the binaries.
SET
(
OPENMMPME_LIBRARY_NAME OpenMMPME
)
SET
(
SHARED_TARGET
${
OPENMMPME_LIBRARY_NAME
}
)
# Ensure that debug libraries have "_d" appended to their names.
# CMake gets this right on Windows automatically with this definition.
IF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
SET
(
CMAKE_DEBUG_POSTFIX
"_d"
CACHE INTERNAL
""
FORCE
)
ENDIF
(
${
CMAKE_GENERATOR
}
MATCHES
"Visual Studio"
)
# But on Unix or Cygwin we have to add the suffix manually
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
SHARED_TARGET
${
SHARED_TARGET
}
_d
)
SET
(
STATIC_TARGET
${
STATIC_TARGET
}
_d
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
# These are all the places to search for header files which are
# to be part of the API.
SET
(
API_INCLUDE_DIRS
)
# start empty
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
# append
SET
(
API_INCLUDE_DIRS
${
API_INCLUDE_DIRS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include/internal
)
ENDFOREACH
(
subdir
)
# Find the include files.
SET
(
API_INCLUDE_FILES
)
FOREACH
(
dir
${
API_INCLUDE_DIRS
}
)
FILE
(
GLOB fullpaths
${
dir
}
/*.h
)
# returns full pathnames
SET
(
API_INCLUDE_FILES
${
API_INCLUDE_FILES
}
${
fullpaths
}
)
ENDFOREACH
(
dir
)
# collect up source files
SET
(
SOURCE_FILES
)
# empty
SET
(
SOURCE_INCLUDE_FILES
)
FOREACH
(
subdir
${
OPENMM_SOURCE_SUBDIRS
}
)
FILE
(
GLOB_RECURSE src_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.cpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.c
)
FILE
(
GLOB incl_files
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/src/*.h
)
SET
(
SOURCE_FILES
${
SOURCE_FILES
}
${
src_files
}
)
#append
SET
(
SOURCE_INCLUDE_FILES
${
SOURCE_INCLUDE_FILES
}
${
incl_files
}
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
ENDFOREACH
(
subdir
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
# Include FFTW related files.
INCLUDE
(
FindFFTW
)
INCLUDE_DIRECTORIES
(
${
FFTW_INCLUDES
}
)
# Build the plugin library.
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_INCLUDE_FILES
}
)
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
_d
)
ELSE
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
PTHREADS_LIB
}
${
FFTW_LIBRARY
}
${
FFTW_THREADS_LIBRARY
}
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES COMPILE_FLAGS
"-DOPENMM_PME_BUILDING_SHARED_LIBRARY"
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
plugins/cpupme/include/internal/windowsExportPme.h
0 → 100644
View file @
68f8aed8
#ifndef OPENMM_WINDOWSEXPORTPME_H_
#define OPENMM_WINDOWSEXPORTPME_H_
/*
* Shared libraries are messy in Visual Studio. We have to distinguish three
* cases:
* (1) this header is being used to build the OpenMM shared library
* (dllexport)
* (2) this header is being used by a *client* of the OpenMM shared
* library (dllimport)
* (3) we are building the OpenMM static library, or the client is
* being compiled with the expectation of linking with the
* OpenMM static library (nothing special needed)
* In the CMake script for building this library, we define one of the symbols
* OPENMM_PME_BUILDING_{SHARED|STATIC}_LIBRARY
* Client code normally has no special symbol defined, in which case we'll
* assume it wants to use the shared library. However, if the client defines
* the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
* that the client code can be linked with static libraries. Note that
* the client symbol is not library dependent, while the library symbols
* affect only the OpenMM library, meaning that other libraries can
* be clients of this one. However, we are assuming all-static or all-shared.
*/
#ifdef _MSC_VER
// We don't want to hear about how sprintf is "unsafe".
#pragma warning(disable:4996)
// Keep MS VC++ quiet about lack of dll export of private members.
#pragma warning(disable:4251)
#if defined(OPENMM_PME_BUILDING_SHARED_LIBRARY)
#define OPENMM_EXPORT_PME __declspec(dllexport)
#elif defined(OPENMM_PME_BUILDING_STATIC_LIBRARY) || defined(OPENMM_PME_USE_STATIC_LIBRARIES)
#define OPENMM_EXPORT_PME
#else
#define OPENMM_EXPORT_PME __declspec(dllimport) // i.e., a client of a shared library
#endif
#else
#define OPENMM_EXPORT_PME // Linux, Mac
#endif
#endif // OPENMM_WINDOWSEXPORTPME_H_
plugins/cpupme/src/CpuPmeKernelFactory.cpp
0 → 100644
View file @
68f8aed8
/* -------------------------------------------------------------------------- *
* OpenMMAmoeba *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CpuPmeKernelFactory.h"
#include "CpuPmeKernels.h"
#include "internal/windowsExportPme.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
using
namespace
OpenMM
;
extern
"C"
void
registerPlatforms
()
{
}
extern
"C"
void
registerKernelFactories
()
{
if
(
CpuCalcPmeReciprocalForceKernel
::
isProcessorSupported
())
{
CpuPmeKernelFactory
*
factory
=
new
CpuPmeKernelFactory
();
for
(
int
i
=
0
;
i
<
Platform
::
getNumPlatforms
();
i
++
)
Platform
::
getPlatform
(
i
).
registerKernelFactory
(
CalcPmeReciprocalForceKernel
::
Name
(),
factory
);
}
}
extern
"C"
OPENMM_EXPORT_PME
void
registerCpuPmeKernelFactories
()
{
registerKernelFactories
();
}
KernelImpl
*
CpuPmeKernelFactory
::
createKernelImpl
(
std
::
string
name
,
const
Platform
&
platform
,
ContextImpl
&
context
)
const
{
if
(
name
==
CalcPmeReciprocalForceKernel
::
Name
())
return
new
CpuCalcPmeReciprocalForceKernel
(
name
,
platform
);
throw
OpenMMException
((
std
::
string
(
"Tried to create kernel with illegal kernel name '"
)
+
name
+
"'"
).
c_str
());
}
plugins/cpupme/src/CpuPmeKernelFactory.h
0 → 100644
View file @
68f8aed8
#ifndef OPENMM_CPUPMEKERNELFACTORY_H_
#define OPENMM_CPUPMEKERNELFACTORY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "openmm/KernelFactory.h"
namespace
OpenMM
{
/**
* This KernelFactory creates kernels for the CPU implementation of PME.
*/
class
CpuPmeKernelFactory
:
public
KernelFactory
{
public:
KernelImpl
*
createKernelImpl
(
std
::
string
name
,
const
Platform
&
platform
,
ContextImpl
&
context
)
const
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CPUPMEKERNELFACTORY_H_*/
plugins/cpupme/src/CpuPmeKernels.cpp
0 → 100644
View file @
68f8aed8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include "CpuPmeKernels.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include <cmath>
#include <smmintrin.h>
using
namespace
OpenMM
;
using
namespace
std
;
static
const
int
PME_ORDER
=
5
;
bool
CpuCalcPmeReciprocalForceKernel
::
hasInitializedThreads
=
false
;
int
CpuCalcPmeReciprocalForceKernel
::
numThreads
=
0
;
static
float
extractFloat
(
__m128
v
,
unsigned
int
element
)
{
float
f
[
4
];
_mm_store_ps
(
f
,
v
);
return
f
[
element
];
}
// Define function to get the number of processors.
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <dlfcn.h>
#else
#ifdef WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#include <unistd.h>
#endif
#endif
static
int
getNumProcessors
()
{
#ifdef __APPLE__
int
ncpu
;
size_t
len
=
4
;
if
(
sysctlbyname
(
"hw.logicalcpu"
,
&
ncpu
,
&
len
,
NULL
,
0
)
==
0
)
return
ncpu
;
else
return
1
;
#else
#ifdef WIN32
SYSTEM_INFO
siSysInfo
;
int
ncpu
;
GetSystemInfo
(
&
siSysInfo
);
ncpu
=
siSysInfo
.
dwNumberOfProcessors
;
if
(
ncpu
<
1
)
ncpu
=
1
;
return
ncpu
;
#else
long
nProcessorsOnline
=
sysconf
(
_SC_NPROCESSORS_ONLN
);
if
(
nProcessorsOnline
==
-
1
)
return
1
;
else
return
(
int
)
nProcessorsOnline
;
#endif
#endif
}
// Define a function to check the CPU's capabilities.
#ifdef _WIN32
#define cpuid __cpuid
#else
static
void
cpuid
(
int
cpuInfo
[
4
],
int
infoType
){
__asm__
__volatile__
(
"cpuid"
:
"=a"
(
cpuInfo
[
0
]),
"=b"
(
cpuInfo
[
1
]),
"=c"
(
cpuInfo
[
2
]),
"=d"
(
cpuInfo
[
3
])
:
"a"
(
infoType
)
);
}
#endif
static
void
spreadCharge
(
int
start
,
int
end
,
float
*
posq
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
float
temp
[
4
];
__m128
boxSize
=
_mm_set_ps
(
0
,
(
float
)
periodicBoxSize
[
2
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
0
]);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
float
)
(
1
/
periodicBoxSize
[
2
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
0
]));
__m128
gridSize
=
_mm_set_ps
(
0
,
gridz
,
gridy
,
gridx
);
__m128
gridSizeInt
=
_mm_set_epi32
(
0
,
gridz
,
gridy
,
gridx
);
__m128
one
=
_mm_set1_ps
(
1
);
__m128
scale
=
_mm_set1_ps
(
1.0
f
/
(
PME_ORDER
-
1
));
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
memset
(
grid
,
0
,
sizeof
(
float
)
*
gridx
*
gridy
*
gridz
);
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
// Find the position relative to the nearest grid point.
__m128
pos
=
_mm_load_ps
(
&
posq
[
4
*
i
]);
__m128
posInBox
=
_mm_sub_ps
(
pos
,
_mm_mul_ps
(
boxSize
,
_mm_floor_ps
(
_mm_mul_ps
(
pos
,
invBoxSize
))));
__m128
t
=
_mm_mul_ps
(
_mm_mul_ps
(
posInBox
,
invBoxSize
),
gridSize
);
__m128
ti
=
_mm_cvttps_epi32
(
t
);
__m128
dr
=
_mm_sub_ps
(
t
,
_mm_cvtepi32_ps
(
ti
));
__m128
gridIndex
=
_mm_sub_epi32
(
ti
,
_mm_and_si128
(
gridSizeInt
,
_mm_cmpeq_epi32
(
ti
,
gridSizeInt
)));
// Compute the B-spline coefficients.
__m128
data
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
_mm_setzero_ps
();
data
[
1
]
=
dr
;
data
[
0
]
=
_mm_sub_ps
(
one
,
dr
);
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
__m128
div
=
_mm_set1_ps
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
dr
),
data
[
j
-
2
]);
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
data
[
j
-
k
-
1
]
=
_mm_mul_ps
(
div
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
k
)),
data
[
j
-
k
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
j
-
k
),
dr
),
data
[
j
-
k
-
1
])));
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
_mm_sub_ps
(
one
,
dr
)),
data
[
0
]);
}
data
[
PME_ORDER
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
dr
),
data
[
PME_ORDER
-
2
]);
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
_mm_mul_ps
(
scale
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
j
)),
data
[
PME_ORDER
-
j
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
PME_ORDER
-
j
),
dr
),
data
[
PME_ORDER
-
j
-
1
])));
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
_mm_sub_ps
(
one
,
dr
)),
data
[
0
]);
// Spread the charges.
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
);
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
);
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
);
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m128
zdata0to3
=
_mm_set_epi32
(
_mm_extract_ps
(
data
[
3
],
2
),
_mm_extract_ps
(
data
[
2
],
2
),
_mm_extract_ps
(
data
[
1
],
2
),
_mm_extract_ps
(
data
[
0
],
2
));
float
zdata4
=
extractFloat
(
data
[
4
],
2
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
charge
*
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
));
if
(
gridIndexZ
+
4
<
gridz
)
_mm_storeu_ps
(
&
grid
[
ybase
+
gridIndexZ
],
_mm_add_ps
(
_mm_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
]),
add0to3
));
else
{
_mm_store_ps
(
temp
,
add0to3
);
int
zindex
=
gridIndexZ
;
grid
[
ybase
+
zindex
]
+=
temp
[
0
];
zindex
++
;
zindex
-=
(
zindex
>=
gridz
?
gridz
:
0
);
grid
[
ybase
+
zindex
]
+=
temp
[
1
];
zindex
++
;
zindex
-=
(
zindex
>=
gridz
?
gridz
:
0
);
grid
[
ybase
+
zindex
]
+=
temp
[
2
];
zindex
++
;
zindex
-=
(
zindex
>=
gridz
?
gridz
:
0
);
grid
[
ybase
+
zindex
]
+=
temp
[
3
];
}
int
zindex
=
gridIndexZ
+
4
;
zindex
-=
(
zindex
>=
gridz
?
gridz
:
0
);
grid
[
ybase
+
zindex
]
+=
multiplier
*
zdata4
;
}
}
}
}
static
float
reciprocalEnergy
(
int
start
,
int
end
,
fftwf_complex
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
double
alpha
,
vector
<
float
>*
bsplineModuli
,
Vec3
periodicBoxSize
)
{
const
unsigned
int
yzsize
=
gridy
*
gridz
;
const
unsigned
int
zsizeHalf
=
gridz
/
2
+
1
;
const
unsigned
int
yzsizeHalf
=
gridy
*
zsizeHalf
;
const
float
scaleFactor
=
(
float
)
(
M_PI
*
periodicBoxSize
[
0
]
*
periodicBoxSize
[
1
]
*
periodicBoxSize
[
2
]);
const
float
recipExpFactor
=
(
float
)
(
M_PI
*
M_PI
/
(
alpha
*
alpha
));
const
float
invPeriodicBoxSizeX
=
(
float
)
(
1.0
/
periodicBoxSize
[
0
]);
const
float
invPeriodicBoxSizeY
=
(
float
)
(
1.0
/
periodicBoxSize
[
1
]);
const
float
invPeriodicBoxSizeZ
=
(
float
)
(
1.0
/
periodicBoxSize
[
2
]);
float
energy
=
0.0
f
;
int
firstz
=
(
start
==
0
?
1
:
0
);
for
(
int
kx
=
start
;
kx
<
end
;
kx
++
)
{
int
mx
=
(
kx
<
(
gridx
+
1
)
/
2
)
?
kx
:
kx
-
gridx
;
float
mhx
=
mx
*
invPeriodicBoxSizeX
;
float
bx
=
scaleFactor
*
bsplineModuli
[
0
][
kx
];
for
(
int
ky
=
0
;
ky
<
gridy
;
ky
++
)
{
int
my
=
(
ky
<
(
gridy
+
1
)
/
2
)
?
ky
:
ky
-
gridy
;
float
mhy
=
my
*
invPeriodicBoxSizeY
;
float
mhx2y2
=
mhx
*
mhx
+
mhy
*
mhy
;
float
bxby
=
bx
*
bsplineModuli
[
1
][
ky
];
for
(
int
kz
=
firstz
;
kz
<
gridz
;
kz
++
)
{
int
mz
=
(
kz
<
(
gridz
+
1
)
/
2
)
?
kz
:
kz
-
gridz
;
float
mhz
=
mz
*
invPeriodicBoxSizeZ
;
float
bz
=
bsplineModuli
[
2
][
kz
];
float
m2
=
mhx2y2
+
mhz
*
mhz
;
float
denom
=
m2
*
bxby
*
bz
;
float
eterm
=
exp
(
-
recipExpFactor
*
m2
)
/
denom
;
int
kx1
,
ky1
,
kz1
;
if
(
kz
>=
gridz
/
2
+
1
)
{
kx1
=
(
kx
==
0
?
kx
:
gridx
-
kx
);
ky1
=
(
ky
==
0
?
ky
:
gridy
-
ky
);
kz1
=
gridz
-
kz
;
}
else
{
kx1
=
kx
;
ky1
=
ky
;
kz1
=
kz
;
}
int
index
=
kx1
*
yzsizeHalf
+
ky1
*
zsizeHalf
+
kz1
;
float
gridReal
=
grid
[
index
][
0
];
float
gridImag
=
grid
[
index
][
1
];
energy
+=
eterm
*
(
gridReal
*
gridReal
+
gridImag
*
gridImag
);
}
firstz
=
0
;
}
}
return
0.5
f
*
energy
;
}
static
void
reciprocalConvolution
(
int
start
,
int
end
,
fftwf_complex
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
double
alpha
,
vector
<
float
>*
bsplineModuli
,
Vec3
periodicBoxSize
)
{
const
unsigned
int
zsize
=
gridz
/
2
+
1
;
const
unsigned
int
yzsize
=
gridy
*
zsize
;
const
float
scaleFactor
=
(
float
)
(
M_PI
*
periodicBoxSize
[
0
]
*
periodicBoxSize
[
1
]
*
periodicBoxSize
[
2
]);
const
float
recipExpFactor
=
(
float
)
(
M_PI
*
M_PI
/
(
alpha
*
alpha
));
const
float
invPeriodicBoxSizeX
=
(
float
)
(
1.0
/
periodicBoxSize
[
0
]);
const
float
invPeriodicBoxSizeY
=
(
float
)
(
1.0
/
periodicBoxSize
[
1
]);
const
float
invPeriodicBoxSizeZ
=
(
float
)
(
1.0
/
periodicBoxSize
[
2
]);
int
firstz
=
(
start
==
0
?
1
:
0
);
for
(
int
kx
=
start
;
kx
<
end
;
kx
++
)
{
int
mx
=
(
kx
<
(
gridx
+
1
)
/
2
)
?
kx
:
kx
-
gridx
;
float
mhx
=
mx
*
invPeriodicBoxSizeX
;
float
bx
=
scaleFactor
*
bsplineModuli
[
0
][
kx
];
for
(
int
ky
=
0
;
ky
<
gridy
;
ky
++
)
{
int
my
=
(
ky
<
(
gridy
+
1
)
/
2
)
?
ky
:
ky
-
gridy
;
float
mhy
=
my
*
invPeriodicBoxSizeY
;
float
mhx2y2
=
mhx
*
mhx
+
mhy
*
mhy
;
float
bxby
=
bx
*
bsplineModuli
[
1
][
ky
];
for
(
int
kz
=
firstz
;
kz
<
zsize
;
kz
++
)
{
int
index
=
kx
*
yzsize
+
ky
*
zsize
+
kz
;
int
mz
=
(
kz
<
(
gridz
+
1
)
/
2
)
?
kz
:
kz
-
gridz
;
float
mhz
=
mz
*
invPeriodicBoxSizeZ
;
float
bz
=
bsplineModuli
[
2
][
kz
];
float
m2
=
mhx2y2
+
mhz
*
mhz
;
float
denom
=
m2
*
bxby
*
bz
;
float
eterm
=
exp
(
-
recipExpFactor
*
m2
)
/
denom
;
grid
[
index
][
0
]
*=
eterm
;
grid
[
index
][
1
]
*=
eterm
;
}
firstz
=
0
;
}
}
}
static
void
interpolateForces
(
int
start
,
int
end
,
float
*
posq
,
float
*
force
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
__m128
boxSize
=
_mm_set_ps
(
0
,
(
float
)
periodicBoxSize
[
2
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
0
]);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
float
)
(
1
/
periodicBoxSize
[
2
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
0
]));
__m128
gridSize
=
_mm_set_ps
(
0
,
gridz
,
gridy
,
gridx
);
__m128
gridSizeInt
=
_mm_set_epi32
(
0
,
gridz
,
gridy
,
gridx
);
__m128
one
=
_mm_set1_ps
(
1
);
__m128
scale
=
_mm_set1_ps
(
1.0
f
/
(
PME_ORDER
-
1
));
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
// Find the position relative to the nearest grid point.
__m128
pos
=
_mm_load_ps
(
&
posq
[
4
*
i
]);
__m128
posInBox
=
_mm_sub_ps
(
pos
,
_mm_mul_ps
(
boxSize
,
_mm_floor_ps
(
_mm_mul_ps
(
pos
,
invBoxSize
))));
__m128
t
=
_mm_mul_ps
(
_mm_mul_ps
(
posInBox
,
invBoxSize
),
gridSize
);
__m128
ti
=
_mm_cvttps_epi32
(
t
);
__m128
dr
=
_mm_sub_ps
(
t
,
_mm_cvtepi32_ps
(
ti
));
__m128
gridIndex
=
_mm_sub_epi32
(
ti
,
_mm_and_si128
(
gridSizeInt
,
_mm_cmpeq_epi32
(
ti
,
gridSizeInt
)));
// Compute the B-spline coefficients.
__m128
data
[
PME_ORDER
];
__m128
ddata
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
_mm_setzero_ps
();
data
[
1
]
=
dr
;
data
[
0
]
=
_mm_sub_ps
(
one
,
dr
);
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
__m128
div
=
_mm_set1_ps
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
dr
),
data
[
j
-
2
]);
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
data
[
j
-
k
-
1
]
=
_mm_mul_ps
(
div
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
k
)),
data
[
j
-
k
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
j
-
k
),
dr
),
data
[
j
-
k
-
1
])));
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
_mm_sub_ps
(
one
,
dr
)),
data
[
0
]);
}
ddata
[
0
]
=
_mm_sub_ps
(
_mm_set1_ps
(
0
),
data
[
0
]);
for
(
int
j
=
1
;
j
<
PME_ORDER
;
j
++
)
ddata
[
j
]
=
_mm_sub_ps
(
data
[
j
-
1
],
data
[
j
]);
data
[
PME_ORDER
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
dr
),
data
[
PME_ORDER
-
2
]);
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
_mm_mul_ps
(
scale
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
j
)),
data
[
PME_ORDER
-
j
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
PME_ORDER
-
j
),
dr
),
data
[
PME_ORDER
-
j
-
1
])));
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
_mm_sub_ps
(
one
,
dr
)),
data
[
0
]);
// Compute the force on this atom.
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
);
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
);
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
);
__m128
zdata
[
PME_ORDER
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
zdata
[
j
]
=
_mm_set_ps
(
0
,
extractFloat
(
ddata
[
j
],
2
),
extractFloat
(
data
[
j
],
2
),
extractFloat
(
data
[
j
],
2
));
__m128
f
=
_mm_set1_ps
(
0
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
dx
=
extractFloat
(
data
[
ix
],
0
);
float
ddx
=
extractFloat
(
ddata
[
ix
],
0
);
__m128
xdata
=
_mm_set_ps
(
0
,
dx
,
dx
,
ddx
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
dy
=
extractFloat
(
data
[
iy
],
1
);
float
ddy
=
extractFloat
(
ddata
[
iy
],
1
);
__m128
xydata
=
_mm_mul_ps
(
xdata
,
_mm_set_ps
(
0
,
dy
,
ddy
,
dy
));
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
gridIndexZ
+
iz
;
zindex
-=
(
zindex
>=
gridz
?
gridz
:
0
);
__m128
gridValue
=
_mm_set1_ps
(
grid
[
ybase
+
zindex
]);
f
=
_mm_add_ps
(
f
,
_mm_mul_ps
(
xydata
,
_mm_mul_ps
(
zdata
[
iz
],
gridValue
)));
}
}
}
f
=
_mm_mul_ps
(
invBoxSize
,
_mm_mul_ps
(
gridSize
,
_mm_mul_ps
(
f
,
_mm_set1_ps
(
-
epsilonFactor
*
posq
[
4
*
i
+
3
]))));
_mm_store_ps
(
&
force
[
4
*
i
],
_mm_add_ps
(
_mm_load_ps
(
&
force
[
4
*
i
]),
f
));
}
}
class
CpuCalcPmeReciprocalForceKernel
::
ThreadData
{
public:
CpuCalcPmeReciprocalForceKernel
&
owner
;
int
index
;
float
*
tempGrid
;
ThreadData
(
CpuCalcPmeReciprocalForceKernel
&
owner
,
int
index
)
:
owner
(
owner
),
index
(
index
),
tempGrid
(
NULL
)
{
}
};
static
void
*
threadBody
(
void
*
args
)
{
CpuCalcPmeReciprocalForceKernel
::
ThreadData
&
data
=
*
reinterpret_cast
<
CpuCalcPmeReciprocalForceKernel
::
ThreadData
*>
(
args
);
data
.
owner
.
runThread
(
data
.
index
);
if
(
data
.
tempGrid
!=
NULL
)
fftwf_free
(
data
.
tempGrid
);
delete
&
data
;
return
0
;
}
void
CpuCalcPmeReciprocalForceKernel
::
initialize
(
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
double
alpha
)
{
if
(
!
hasInitializedThreads
)
{
numThreads
=
getNumProcessors
();
fftwf_init_threads
();
hasInitializedThreads
=
true
;
}
this
->
gridx
=
gridx
;
this
->
gridy
=
gridy
;
this
->
gridz
=
gridz
;
this
->
numParticles
=
numParticles
;
this
->
alpha
=
alpha
;
force
.
resize
(
4
*
numParticles
);
// Initialize threads.
pthread_cond_init
(
&
startCondition
,
NULL
);
pthread_cond_init
(
&
endCondition
,
NULL
);
pthread_cond_init
(
&
mainThreadStartCondition
,
NULL
);
pthread_cond_init
(
&
mainThreadEndCondition
,
NULL
);
pthread_mutex_init
(
&
lock
,
NULL
);
thread
.
resize
(
numThreads
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
{
ThreadData
*
data
=
new
ThreadData
(
*
this
,
i
);
threadData
.
push_back
(
data
);
pthread_create
(
&
thread
[
i
],
NULL
,
threadBody
,
data
);
data
->
tempGrid
=
(
float
*
)
fftwf_malloc
(
sizeof
(
float
)
*
(
gridx
*
gridy
*
gridz
+
3
));
}
pthread_create
(
&
mainThread
,
NULL
,
threadBody
,
new
ThreadData
(
*
this
,
-
1
));
// Initialize FFTW.
realGrid
=
threadData
[
0
]
->
tempGrid
;
complexGrid
=
(
fftwf_complex
*
)
fftwf_malloc
(
sizeof
(
fftwf_complex
)
*
gridx
*
gridy
*
(
gridz
/
2
+
1
));
fftwf_plan_with_nthreads
(
numThreads
);
forwardFFT
=
fftwf_plan_dft_r2c_3d
(
gridx
,
gridy
,
gridz
,
realGrid
,
complexGrid
,
FFTW_MEASURE
);
backwardFFT
=
fftwf_plan_dft_c2r_3d
(
gridx
,
gridy
,
gridz
,
complexGrid
,
realGrid
,
FFTW_MEASURE
);
hasCreatedPlan
=
true
;
// Initialize the b-spline moduli.
int
maxSize
=
max
(
max
(
gridx
,
gridy
),
gridz
);
vector
<
double
>
data
(
PME_ORDER
);
vector
<
double
>
ddata
(
PME_ORDER
);
vector
<
double
>
bsplinesData
(
maxSize
);
data
[
PME_ORDER
-
1
]
=
0.0
;
data
[
1
]
=
0.0
;
data
[
0
]
=
1.0
;
for
(
int
i
=
3
;
i
<
PME_ORDER
;
i
++
)
{
double
div
=
1.0
/
(
i
-
1.0
);
data
[
i
-
1
]
=
0.0
;
for
(
int
j
=
1
;
j
<
(
i
-
1
);
j
++
)
data
[
i
-
j
-
1
]
=
div
*
(
j
*
data
[
i
-
j
-
2
]
+
(
i
-
j
)
*
data
[
i
-
j
-
1
]);
data
[
0
]
=
div
*
data
[
0
];
}
// Differentiate.
ddata
[
0
]
=
-
data
[
0
];
for
(
int
i
=
1
;
i
<
PME_ORDER
;
i
++
)
ddata
[
i
]
=
data
[
i
-
1
]
-
data
[
i
];
double
div
=
1.0
/
(
PME_ORDER
-
1
);
data
[
PME_ORDER
-
1
]
=
0.0
;
for
(
int
i
=
1
;
i
<
(
PME_ORDER
-
1
);
i
++
)
data
[
PME_ORDER
-
i
-
1
]
=
div
*
(
i
*
data
[
PME_ORDER
-
i
-
2
]
+
(
PME_ORDER
-
i
)
*
data
[
PME_ORDER
-
i
-
1
]);
data
[
0
]
=
div
*
data
[
0
];
for
(
int
i
=
0
;
i
<
maxSize
;
i
++
)
bsplinesData
[
i
]
=
0.0
;
for
(
int
i
=
1
;
i
<=
PME_ORDER
;
i
++
)
bsplinesData
[
i
]
=
data
[
i
-
1
];
// Evaluate the actual bspline moduli for X/Y/Z.
bsplineModuli
[
0
].
resize
(
gridx
);
bsplineModuli
[
1
].
resize
(
gridy
);
bsplineModuli
[
2
].
resize
(
gridz
);
for
(
int
dim
=
0
;
dim
<
3
;
dim
++
)
{
int
ndata
=
bsplineModuli
[
dim
].
size
();
vector
<
float
>&
moduli
=
bsplineModuli
[
dim
];
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
{
double
sc
=
0.0
;
double
ss
=
0.0
;
for
(
int
j
=
0
;
j
<
ndata
;
j
++
)
{
double
arg
=
(
2.0
*
M_PI
*
i
*
j
)
/
ndata
;
sc
+=
bsplinesData
[
j
]
*
cos
(
arg
);
ss
+=
bsplinesData
[
j
]
*
sin
(
arg
);
}
moduli
[
i
]
=
(
float
)
(
sc
*
sc
+
ss
*
ss
);
}
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
if
(
moduli
[
i
]
<
1.0e-7
f
)
moduli
[
i
]
=
(
moduli
[
i
-
1
]
+
moduli
[
i
+
1
])
*
0.5
f
;
}
}
CpuCalcPmeReciprocalForceKernel
::~
CpuCalcPmeReciprocalForceKernel
()
{
isDeleted
=
true
;
pthread_mutex_lock
(
&
lock
);
pthread_cond_broadcast
(
&
startCondition
);
pthread_cond_broadcast
(
&
mainThreadStartCondition
);
pthread_mutex_unlock
(
&
lock
);
for
(
int
i
=
0
;
i
<
(
int
)
thread
.
size
();
i
++
)
pthread_join
(
thread
[
i
],
NULL
);
pthread_join
(
mainThread
,
NULL
);
pthread_mutex_destroy
(
&
lock
);
pthread_cond_destroy
(
&
startCondition
);
pthread_cond_destroy
(
&
endCondition
);
pthread_cond_destroy
(
&
mainThreadStartCondition
);
pthread_cond_destroy
(
&
mainThreadEndCondition
);
if
(
complexGrid
!=
NULL
)
fftwf_free
(
complexGrid
);
if
(
hasCreatedPlan
)
{
fftwf_destroy_plan
(
forwardFFT
);
fftwf_destroy_plan
(
backwardFFT
);
}
}
#include <sys/time.h>
double
diff
(
struct
timeval
t1
,
struct
timeval
t2
)
{
return
t2
.
tv_usec
-
t1
.
tv_usec
+
1e6
*
(
t2
.
tv_sec
-
t1
.
tv_sec
);
}
void
CpuCalcPmeReciprocalForceKernel
::
runThread
(
int
index
)
{
if
(
index
==
-
1
)
{
// This is the main thread that coordinates all the other ones.
pthread_mutex_lock
(
&
lock
);
while
(
true
)
{
// Wait for the signal to start.
pthread_cond_wait
(
&
mainThreadStartCondition
,
&
lock
);
if
(
isDeleted
)
break
;
posq
=
io
->
getPosq
();
struct
timeval
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
;
gettimeofday
(
&
t1
,
NULL
);
advanceThreads
();
// Signal threads to perform charge spreading.
advanceThreads
();
// Signal threads to sum the charge grids.
gettimeofday
(
&
t2
,
NULL
);
fftwf_execute_dft_r2c
(
forwardFFT
,
realGrid
,
complexGrid
);
gettimeofday
(
&
t3
,
NULL
);
if
(
includeEnergy
)
advanceThreads
();
// Signal threads to compute energy.
gettimeofday
(
&
t4
,
NULL
);
advanceThreads
();
// Signal threads to perform reciprocal convolution.
gettimeofday
(
&
t5
,
NULL
);
fftwf_execute_dft_c2r
(
backwardFFT
,
complexGrid
,
realGrid
);
gettimeofday
(
&
t6
,
NULL
);
advanceThreads
();
// Signal threads to interpolate forces.
isFinished
=
true
;
gettimeofday
(
&
t7
,
NULL
);
printf
(
"time %g %g %g %g %g %g
\n
"
,
diff
(
t1
,
t2
),
diff
(
t2
,
t3
),
diff
(
t3
,
t4
),
diff
(
t4
,
t5
),
diff
(
t5
,
t6
),
diff
(
t6
,
t7
));
pthread_cond_signal
(
&
mainThreadEndCondition
);
}
pthread_mutex_unlock
(
&
lock
);
}
else
{
// This is a worker thread.
int
particleStart
=
(
index
*
numParticles
)
/
numThreads
;
int
particleEnd
=
((
index
+
1
)
*
numParticles
)
/
numThreads
;
int
gridxStart
=
(
index
*
gridx
)
/
numThreads
;
int
gridxEnd
=
((
index
+
1
)
*
gridx
)
/
numThreads
;
int
gridSize
=
(
gridx
*
gridy
*
gridz
+
3
)
/
4
;
int
gridStart
=
4
*
((
index
*
gridSize
)
/
numThreads
);
int
gridEnd
=
4
*
(((
index
+
1
)
*
gridSize
)
/
numThreads
);
while
(
true
)
{
threadWait
();
if
(
isDeleted
)
break
;
spreadCharge
(
particleStart
,
particleEnd
,
posq
,
threadData
[
index
]
->
tempGrid
,
gridx
,
gridy
,
gridz
,
numParticles
,
periodicBoxSize
);
threadWait
();
int
numGrids
=
threadData
.
size
();
for
(
int
i
=
gridStart
;
i
<
gridEnd
;
i
+=
4
)
{
__m128
sum
=
_mm_load_ps
(
&
realGrid
[
i
]);
for
(
int
j
=
1
;
j
<
numGrids
;
j
++
)
sum
=
_mm_add_ps
(
sum
,
_mm_load_ps
(
&
threadData
[
j
]
->
tempGrid
[
i
]));
_mm_store_ps
(
&
realGrid
[
i
],
sum
);
}
threadWait
();
if
(
includeEnergy
)
{
double
threadEnergy
=
reciprocalEnergy
(
gridxStart
,
gridxEnd
,
complexGrid
,
gridx
,
gridy
,
gridz
,
alpha
,
bsplineModuli
,
periodicBoxSize
);
pthread_mutex_lock
(
&
lock
);
energy
+=
threadEnergy
;
pthread_mutex_unlock
(
&
lock
);
threadWait
();
}
reciprocalConvolution
(
gridxStart
,
gridxEnd
,
complexGrid
,
gridx
,
gridy
,
gridz
,
alpha
,
bsplineModuli
,
periodicBoxSize
);
threadWait
();
interpolateForces
(
particleStart
,
particleEnd
,
posq
,
&
force
[
0
],
realGrid
,
gridx
,
gridy
,
gridz
,
numParticles
,
periodicBoxSize
);
}
}
}
void
CpuCalcPmeReciprocalForceKernel
::
threadWait
()
{
pthread_mutex_lock
(
&
lock
);
waitCount
++
;
pthread_cond_signal
(
&
endCondition
);
pthread_cond_wait
(
&
startCondition
,
&
lock
);
pthread_mutex_unlock
(
&
lock
);
}
void
CpuCalcPmeReciprocalForceKernel
::
advanceThreads
()
{
waitCount
=
0
;
pthread_cond_broadcast
(
&
startCondition
);
while
(
waitCount
<
numThreads
)
{
pthread_cond_wait
(
&
endCondition
,
&
lock
);
}
}
void
CpuCalcPmeReciprocalForceKernel
::
beginComputation
(
IO
&
io
,
Vec3
periodicBoxSize
,
bool
includeEnergy
)
{
this
->
io
=
&
io
;
this
->
periodicBoxSize
=
periodicBoxSize
;
this
->
includeEnergy
=
includeEnergy
;
energy
=
0.0
;
pthread_mutex_lock
(
&
lock
);
isFinished
=
false
;
pthread_cond_signal
(
&
mainThreadStartCondition
);
pthread_mutex_unlock
(
&
lock
);
}
double
CpuCalcPmeReciprocalForceKernel
::
finishComputation
(
IO
&
io
)
{
pthread_mutex_lock
(
&
lock
);
while
(
!
isFinished
)
{
pthread_cond_wait
(
&
mainThreadEndCondition
,
&
lock
);
}
pthread_mutex_unlock
(
&
lock
);
io
.
setForce
(
&
force
[
0
]);
return
energy
;
}
bool
CpuCalcPmeReciprocalForceKernel
::
isProcessorSupported
()
{
int
cpuInfo
[
4
];
cpuid
(
cpuInfo
,
0
);
if
(
cpuInfo
[
0
]
>=
1
)
{
cpuid
(
cpuInfo
,
1
);
return
((
cpuInfo
[
2
]
&
((
int
)
1
<<
19
))
!=
0
);
// Require SSE 4.1
}
return
false
;
}
\ No newline at end of file
plugins/cpupme/src/CpuPmeKernels.h
0 → 100644
View file @
68f8aed8
#ifndef OPENMM_CPU_PME_KERNELS_H_
#define OPENMM_CPU_PME_KERNELS_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "internal/windowsExportPme.h"
#include "openmm/kernels.h"
#include "openmm/Vec3.h"
#include <fftw3.h>
#include <pthread.h>
#include <vector>
namespace
OpenMM
{
/**
*/
class
OPENMM_EXPORT_PME
CpuCalcPmeReciprocalForceKernel
:
public
CalcPmeReciprocalForceKernel
{
public:
class
ThreadData
;
CpuCalcPmeReciprocalForceKernel
(
std
::
string
name
,
const
Platform
&
platform
)
:
CalcPmeReciprocalForceKernel
(
name
,
platform
),
hasCreatedPlan
(
false
),
isDeleted
(
false
),
realGrid
(
NULL
),
complexGrid
(
NULL
)
{
}
void
initialize
(
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
double
alpha
);
~
CpuCalcPmeReciprocalForceKernel
();
void
beginComputation
(
IO
&
io
,
Vec3
periodicBoxSize
,
bool
includeEnergy
);
double
finishComputation
(
IO
&
io
);
void
runThread
(
int
index
);
static
bool
isProcessorSupported
();
private:
void
threadWait
();
void
advanceThreads
();
static
bool
hasInitializedThreads
;
static
int
numThreads
;
int
gridx
,
gridy
,
gridz
,
numParticles
;
double
alpha
;
bool
hasCreatedPlan
,
isFinished
,
isDeleted
;
std
::
vector
<
float
>
force
;
std
::
vector
<
float
>
bsplineModuli
[
3
];
float
*
realGrid
;
fftwf_complex
*
complexGrid
;
fftwf_plan
forwardFFT
,
backwardFFT
;
int
waitCount
;
pthread_cond_t
startCondition
,
endCondition
;
pthread_cond_t
mainThreadStartCondition
,
mainThreadEndCondition
;
pthread_mutex_t
lock
;
pthread_t
mainThread
;
std
::
vector
<
pthread_t
>
thread
;
std
::
vector
<
ThreadData
*>
threadData
;
// The following variables are used to store information about the calculation currently being performed.
IO
*
io
;
float
energy
;
float
*
posq
;
Vec3
periodicBoxSize
;
bool
includeEnergy
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CPU_PME_KERNELS_H_*/
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment