Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
932a61b6
"openmmapi/src/ThreadPool.cpp" did not exist on "0be0772b186c1f7193f932b75f355b718ebbb04a"
Commit
932a61b6
authored
Jun 21, 2013
by
peastman
Browse files
Connected CPU based PME to CUDA platform
parent
56c071cc
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
299 additions
and
116 deletions
+299
-116
cmake_modules/FindFFTW.cmake
cmake_modules/FindFFTW.cmake
+25
-0
platforms/cuda/sharedTarget/CMakeLists.txt
platforms/cuda/sharedTarget/CMakeLists.txt
+3
-2
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+12
-0
platforms/cuda/src/CudaContext.h
platforms/cuda/src/CudaContext.h
+59
-1
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+180
-109
platforms/cuda/src/CudaKernels.h
platforms/cuda/src/CudaKernels.h
+7
-1
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+13
-3
No files found.
cmake_modules/FindFFTW.cmake
0 → 100644
View file @
932a61b6
# - Find FFTW
# Find the native FFTW includes and library
#
# FFTW_INCLUDES - where to find fftw3.h
# FFTW_LIBRARY - the main FFTW library.
# FFTW_THREADS_LIBRARY - the FFTW multithreading support library.
# FFTW_FOUND - True if FFTW found.
if
(
FFTW_INCLUDES
)
# Already in cache, be silent
set
(
FFTW_FIND_QUIETLY TRUE
)
endif
(
FFTW_INCLUDES
)
find_path
(
FFTW_INCLUDES fftw3.h
)
find_library
(
FFTW_LIBRARY NAMES fftw3f
)
find_library
(
FFTW_THREADS_LIBRARY NAMES fftw3f_threads
)
# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
# all listed variables are TRUE
include
(
FindPackageHandleStandardArgs
)
find_package_handle_standard_args
(
FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES
)
find_package_handle_standard_args
(
FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES
)
mark_as_advanced
(
FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES
)
platforms/cuda/sharedTarget/CMakeLists.txt
View file @
932a61b6
...
@@ -2,7 +2,8 @@
...
@@ -2,7 +2,8 @@
# Include CUDA related files.
# Include CUDA related files.
#
#
INCLUDE
(
FindCUDA
)
INCLUDE
(
FindCUDA
)
INCLUDE_DIRECTORIES
(
${
CUDA_TOOLKIT_INCLUDE
}
)
INCLUDE
(
FindFFTW
)
INCLUDE_DIRECTORIES
(
${
CUDA_TOOLKIT_INCLUDE
}
${
FFTW_INCLUDES
}
)
FILE
(
GLOB CUDA_KERNELS
${
CUDA_SOURCE_DIR
}
/kernels/*.cu
)
FILE
(
GLOB CUDA_KERNELS
${
CUDA_SOURCE_DIR
}
/kernels/*.cu
)
ADD_CUSTOM_COMMAND
(
OUTPUT
${
CUDA_KERNELS_CPP
}
${
CUDA_KERNELS_H
}
ADD_CUSTOM_COMMAND
(
OUTPUT
${
CUDA_KERNELS_CPP
}
${
CUDA_KERNELS_H
}
...
@@ -18,7 +19,7 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
...
@@ -18,7 +19,7 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
ELSE
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
ELSE
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
CUDA_CUDA_LIBRARY
}
${
CUDA_cufft_LIBRARY
}
${
PTHREADS_LIB
}
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
CUDA_CUDA_LIBRARY
}
${
CUDA_cufft_LIBRARY
}
${
PTHREADS_LIB
}
${
FFTW_LIBRARY
}
${
FFTW_THREADS_LIBRARY
}
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES COMPILE_FLAGS
"-DOPENMM_CUDA_BUILDING_SHARED_LIBRARY"
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES COMPILE_FLAGS
"-DOPENMM_CUDA_BUILDING_SHARED_LIBRARY"
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
platforms/cuda/src/CudaContext.cpp
View file @
932a61b6
...
@@ -231,6 +231,10 @@ CudaContext::~CudaContext() {
...
@@ -231,6 +231,10 @@ CudaContext::~CudaContext() {
delete
forces
[
i
];
delete
forces
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
delete
reorderListeners
[
i
];
delete
reorderListeners
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
preComputations
.
size
();
i
++
)
delete
preComputations
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
postComputations
.
size
();
i
++
)
delete
postComputations
[
i
];
if
(
pinnedBuffer
!=
NULL
)
if
(
pinnedBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedBuffer
);
cuMemFreeHost
(
pinnedBuffer
);
if
(
posq
!=
NULL
)
if
(
posq
!=
NULL
)
...
@@ -1102,6 +1106,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) {
...
@@ -1102,6 +1106,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) {
reorderListeners
.
push_back
(
listener
);
reorderListeners
.
push_back
(
listener
);
}
}
void
CudaContext
::
addPreComputation
(
ForcePreComputation
*
computation
)
{
preComputations
.
push_back
(
computation
);
}
void
CudaContext
::
addPostComputation
(
ForcePostComputation
*
computation
)
{
postComputations
.
push_back
(
computation
);
}
struct
CudaContext
::
WorkThread
::
ThreadData
{
struct
CudaContext
::
WorkThread
::
ThreadData
{
ThreadData
(
std
::
queue
<
CudaContext
::
WorkTask
*>&
tasks
,
bool
&
waiting
,
bool
&
finished
,
ThreadData
(
std
::
queue
<
CudaContext
::
WorkTask
*>&
tasks
,
bool
&
waiting
,
bool
&
finished
,
pthread_mutex_t
&
queueLock
,
pthread_cond_t
&
waitForTaskCondition
,
pthread_cond_t
&
queueEmptyCondition
)
:
pthread_mutex_t
&
queueLock
,
pthread_cond_t
&
waitForTaskCondition
,
pthread_cond_t
&
queueEmptyCondition
)
:
...
...
platforms/cuda/src/CudaContext.h
View file @
932a61b6
...
@@ -70,6 +70,8 @@ public:
...
@@ -70,6 +70,8 @@ public:
class
WorkTask
;
class
WorkTask
;
class
WorkThread
;
class
WorkThread
;
class
ReorderListener
;
class
ReorderListener
;
class
ForcePreComputation
;
class
ForcePostComputation
;
static
const
int
ThreadBlockSize
;
static
const
int
ThreadBlockSize
;
static
const
int
TileSize
;
static
const
int
TileSize
;
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
...
@@ -454,6 +456,28 @@ public:
...
@@ -454,6 +456,28 @@ public:
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
return
reorderListeners
;
return
reorderListeners
;
}
}
/**
* Add a pre-computation that should be called at the very start of force and energy evalutations.
* The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addPreComputation
(
ForcePreComputation
*
computation
);
/**
* Get the list of ForcePreComputations.
*/
std
::
vector
<
ForcePreComputation
*>&
getPreComputations
()
{
return
preComputations
;
}
/**
* Add a post-computation that should be called at the very end of force and energy evalutations.
* The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addPostComputation
(
ForcePostComputation
*
computation
);
/**
* Get the list of ForcePostComputations.
*/
std
::
vector
<
ForcePostComputation
*>&
getPostComputations
()
{
return
postComputations
;
}
/**
/**
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions
* This should be called whenever force field parameters change. It will cause the definitions
...
@@ -519,6 +543,8 @@ private:
...
@@ -519,6 +543,8 @@ private:
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
std
::
vector
<
int
>
autoclearBufferSizes
;
std
::
vector
<
int
>
autoclearBufferSizes
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
std
::
vector
<
ForcePreComputation
*>
preComputations
;
std
::
vector
<
ForcePostComputation
*>
postComputations
;
CudaIntegrationUtilities
*
integration
;
CudaIntegrationUtilities
*
integration
;
CudaExpressionUtilities
*
expression
;
CudaExpressionUtilities
*
expression
;
CudaBondedUtilities
*
bonded
;
CudaBondedUtilities
*
bonded
;
...
@@ -580,7 +606,7 @@ private:
...
@@ -580,7 +606,7 @@ private:
/**
/**
* This abstract class defines a function to be executed whenever atoms get reordered.
* This abstract class defines a function to be executed whenever atoms get reordered.
* Objects that need to know when reordering happens should create a
r
eorderListener
* Objects that need to know when reordering happens should create a
R
eorderListener
* and register it by calling addReorderListener().
* and register it by calling addReorderListener().
*/
*/
class
CudaContext
::
ReorderListener
{
class
CudaContext
::
ReorderListener
{
...
@@ -590,6 +616,38 @@ public:
...
@@ -590,6 +616,38 @@ public:
}
}
};
};
/**
* This abstract class defines a function to be executed at the very beginning of force and
* energy evaluation, before any other calculation has been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePreComputation, register it by calling addForcePreComputation().
*/
class
CudaContext
::
ForcePreComputation
{
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
*/
virtual
void
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
=
0
;
};
/**
* This abstract class defines a function to be executed at the very end of force and
* energy evaluation, after all other calculations have been done. It is useful for operations
* that need to be performed at a nonstandard point in the process. After creating a
* ForcePostComputation, register it by calling addForcePostComputation().
*/
class
CudaContext
::
ForcePostComputation
{
public:
/**
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
* @return an optional contribution to add to the potential energy. */
virtual
double
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
=
0
;
};
}
// namespace OpenMM
}
// namespace OpenMM
#endif
/*OPENMM_CUDACONTEXT_H_*/
#endif
/*OPENMM_CUDACONTEXT_H_*/
platforms/cuda/src/CudaKernels.cpp
View file @
932a61b6
...
@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
...
@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
void
CudaCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
void
CudaCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
cu
.
setAsCurrent
();
cu
.
setAsCurrent
();
cu
.
clearAutoclearBuffers
();
for
(
vector
<
CudaContext
::
ForcePreComputation
*>::
iterator
iter
=
cu
.
getPreComputations
().
begin
();
iter
!=
cu
.
getPreComputations
().
end
();
++
iter
)
(
*
iter
)
->
computeForceAndEnergy
(
includeForces
,
includeEnergy
,
groups
);
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
bool
includeNonbonded
=
((
groups
&
(
1
<<
nb
.
getForceGroup
()))
!=
0
);
bool
includeNonbonded
=
((
groups
&
(
1
<<
nb
.
getForceGroup
()))
!=
0
);
cu
.
setComputeForceCount
(
cu
.
getComputeForceCount
()
+
1
);
cu
.
setComputeForceCount
(
cu
.
getComputeForceCount
()
+
1
);
cu
.
clearAutoclearBuffers
();
if
(
includeNonbonded
)
if
(
includeNonbonded
)
nb
.
prepareInteractions
();
nb
.
prepareInteractions
();
}
}
...
@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
...
@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu
.
getBondedUtilities
().
computeInteractions
(
groups
);
cu
.
getBondedUtilities
().
computeInteractions
(
groups
);
if
((
groups
&
(
1
<<
cu
.
getNonbondedUtilities
().
getForceGroup
()))
!=
0
)
if
((
groups
&
(
1
<<
cu
.
getNonbondedUtilities
().
getForceGroup
()))
!=
0
)
cu
.
getNonbondedUtilities
().
computeInteractions
();
cu
.
getNonbondedUtilities
().
computeInteractions
();
cu
.
getIntegrationUtilities
().
distributeForcesFromVirtualSites
();
double
sum
=
0.0
;
double
sum
=
0.0
;
for
(
vector
<
CudaContext
::
ForcePostComputation
*>::
iterator
iter
=
cu
.
getPostComputations
().
begin
();
iter
!=
cu
.
getPostComputations
().
end
();
++
iter
)
sum
+=
(
*
iter
)
->
computeForceAndEnergy
(
includeForces
,
includeEnergy
,
groups
);
cu
.
getIntegrationUtilities
().
distributeForcesFromVirtualSites
();
if
(
includeEnergy
)
{
if
(
includeEnergy
)
{
CudaArray
&
energyArray
=
cu
.
getEnergyBuffer
();
CudaArray
&
energyArray
=
cu
.
getEnergyBuffer
();
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
())
{
...
@@ -1330,6 +1334,59 @@ private:
...
@@ -1330,6 +1334,59 @@ private:
const
NonbondedForce
&
force
;
const
NonbondedForce
&
force
;
};
};
class
CudaCalcNonbondedForceKernel
::
PmeIO
:
public
CpuPme
::
IO
{
public:
PmeIO
(
CudaContext
&
cu
,
CUfunction
addForcesKernel
)
:
cu
(
cu
),
addForcesKernel
(
addForcesKernel
),
forceTemp
(
NULL
)
{
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double4
)
:
sizeof
(
float4
));
forceTemp
=
new
CudaArray
(
cu
,
cu
.
getNumAtoms
(),
elementSize
,
"PmeForce"
);
}
~
PmeIO
()
{
if
(
forceTemp
!=
NULL
)
delete
forceTemp
;
}
float
*
getPosq
()
{
cu
.
setAsCurrent
();
cu
.
getPosq
().
download
(
posq
);
return
(
float
*
)
&
posq
[
0
];
}
void
setForce
(
float
*
force
)
{
forceTemp
->
upload
(
force
);
void
*
args
[]
=
{
&
forceTemp
->
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
()};
cu
.
executeKernel
(
addForcesKernel
,
args
,
cu
.
getNumAtoms
());
}
private:
CudaContext
&
cu
;
vector
<
float4
>
posq
;
CudaArray
*
forceTemp
;
CUfunction
addForcesKernel
;
};
class
CudaCalcNonbondedForceKernel
::
PmePreComputation
:
public
CudaContext
::
ForcePreComputation
{
public:
PmePreComputation
(
CudaContext
&
cu
,
CpuPme
&
pme
,
CpuPme
::
IO
&
io
)
:
cu
(
cu
),
pme
(
pme
),
io
(
io
)
{
}
void
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
Vec3
boxSize
(
cu
.
getPeriodicBoxSize
().
x
,
cu
.
getPeriodicBoxSize
().
y
,
cu
.
getPeriodicBoxSize
().
z
);
pme
.
beginComputation
(
io
,
boxSize
,
includeEnergy
);
}
private:
CudaContext
&
cu
;
CpuPme
&
pme
;
CpuPme
::
IO
&
io
;
};
class
CudaCalcNonbondedForceKernel
::
PmePostComputation
:
public
CudaContext
::
ForcePostComputation
{
public:
PmePostComputation
(
CpuPme
&
pme
,
CpuPme
::
IO
&
io
)
:
pme
(
pme
),
io
(
io
)
{
}
double
computeForceAndEnergy
(
bool
includeForces
,
bool
includeEnergy
,
int
groups
)
{
return
pme
.
finishComputation
(
io
);
}
private:
CpuPme
&
pme
;
CpuPme
::
IO
&
io
;
};
CudaCalcNonbondedForceKernel
::~
CudaCalcNonbondedForceKernel
()
{
CudaCalcNonbondedForceKernel
::~
CudaCalcNonbondedForceKernel
()
{
cu
.
setAsCurrent
();
cu
.
setAsCurrent
();
if
(
sigmaEpsilon
!=
NULL
)
if
(
sigmaEpsilon
!=
NULL
)
...
@@ -1354,6 +1411,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
...
@@ -1354,6 +1411,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete
pmeAtomGridIndex
;
delete
pmeAtomGridIndex
;
if
(
sort
!=
NULL
)
if
(
sort
!=
NULL
)
delete
sort
;
delete
sort
;
if
(
cpuPme
!=
NULL
)
delete
cpuPme
;
if
(
pmeio
!=
NULL
)
delete
pmeio
;
if
(
hasInitializedFFT
)
{
if
(
hasInitializedFFT
)
{
cufftDestroy
(
fftForward
);
cufftDestroy
(
fftForward
);
cufftDestroy
(
fftBackward
);
cufftDestroy
(
fftBackward
);
...
@@ -1457,7 +1518,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1457,7 +1518,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
else
else
dispersionCoefficient
=
0.0
;
dispersionCoefficient
=
0.0
;
alpha
=
0
;
alpha
=
0
;
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
Ewald
)
{
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
Ewald
&&
cu
.
getContextIndex
()
==
0
)
{
// Compute the Ewald parameters.
// Compute the Ewald parameters.
int
kmaxx
,
kmaxy
,
kmaxz
;
int
kmaxx
,
kmaxy
,
kmaxz
;
...
@@ -1465,7 +1526,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1465,7 +1526,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"USE_EWALD"
]
=
"1"
;
defines
[
"USE_EWALD"
]
=
"1"
;
ewaldSelfEnergy
=
(
cu
.
getContextIndex
()
==
0
?
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
)
:
0.0
)
;
ewaldSelfEnergy
=
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
);
// Create the reciprocal space kernels.
// Create the reciprocal space kernels.
...
@@ -1484,7 +1545,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1484,7 +1545,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double2
)
:
sizeof
(
float2
));
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double2
)
:
sizeof
(
float2
));
cosSinSums
=
new
CudaArray
(
cu
,
(
2
*
kmaxx
-
1
)
*
(
2
*
kmaxy
-
1
)
*
(
2
*
kmaxz
-
1
),
elementSize
,
"cosSinSums"
);
cosSinSums
=
new
CudaArray
(
cu
,
(
2
*
kmaxx
-
1
)
*
(
2
*
kmaxy
-
1
)
*
(
2
*
kmaxz
-
1
),
elementSize
,
"cosSinSums"
);
}
}
else
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
PME
)
{
else
if
(
force
.
getNonbondedMethod
()
==
NonbondedForce
::
PME
&&
cu
.
getContextIndex
()
==
0
)
{
// Compute the PME parameters.
// Compute the PME parameters.
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
...
@@ -1497,7 +1558,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1497,7 +1558,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"USE_EWALD"
]
=
"1"
;
defines
[
"USE_EWALD"
]
=
"1"
;
ewaldSelfEnergy
=
(
cu
.
getContextIndex
()
==
0
?
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
)
:
0.0
)
;
ewaldSelfEnergy
=
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
);
pmeDefines
[
"PME_ORDER"
]
=
cu
.
intToString
(
PmeOrder
);
pmeDefines
[
"PME_ORDER"
]
=
cu
.
intToString
(
PmeOrder
);
pmeDefines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
numParticles
);
pmeDefines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
numParticles
);
pmeDefines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
pmeDefines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
...
@@ -1510,111 +1571,121 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1510,111 +1571,121 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
pmeDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
pmeDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
bool
useCpuPme
=
false
;
pmeSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"gridSpreadCharge"
);
if
(
useCpuPme
)
{
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
cpuPme
=
new
CpuPme
(
gridSizeX
,
gridSizeY
,
gridSizeZ
,
numParticles
,
alpha
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
CUfunction
addForcesKernel
=
cu
.
getKernel
(
module
,
"addForces"
);
pmeEvalEnergyKernel
=
cu
.
getKernel
(
module
,
"gridEvaluateEnergy"
);
pmeio
=
new
PmeIO
(
cu
,
addForcesKernel
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
cu
.
addPreComputation
(
new
PmePreComputation
(
cu
,
*
cpuPme
,
*
pmeio
));
cuFuncSetCacheConfig
(
pmeSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cu
.
addPostComputation
(
new
PmePostComputation
(
*
cpuPme
,
*
pmeio
));
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
}
else
{
// Create required data structures.
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"gridSpreadCharge"
);
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
directPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
cu
.
getComputeCapability
()
>=
2.0
?
elementSize
:
sizeof
(
long
long
),
"originalPmeGrid"
);
pmeEvalEnergyKernel
=
cu
.
getKernel
(
module
,
"gridEvaluateEnergy"
);
reciprocalPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
(
gridSizeZ
/
2
+
1
),
2
*
elementSize
,
"reciprocalPmeGrid"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
cuFuncSetCacheConfig
(
pmeSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cu
.
addAutoclearBuffer
(
*
directPmeGrid
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
pmeBsplineModuliX
=
new
CudaArray
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
// Create required data structures.
pmeBsplineModuliY
=
new
CudaArray
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
directPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
cu
.
getComputeCapability
()
>=
2.0
?
elementSize
:
sizeof
(
long
long
),
"originalPmeGrid"
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
reciprocalPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
(
gridSizeZ
/
2
+
1
),
2
*
elementSize
,
"reciprocalPmeGrid"
);
cufftResult
result
=
cufftPlan3d
(
&
fftForward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_D2Z
:
CUFFT_R2C
);
cu
.
addAutoclearBuffer
(
*
directPmeGrid
);
if
(
result
!=
CUFFT_SUCCESS
)
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
pmeBsplineModuliX
=
new
CudaArray
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
result
=
cufftPlan3d
(
&
fftBackward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2D
:
CUFFT_C2R
);
pmeBsplineModuliY
=
new
CudaArray
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
if
(
result
!=
CUFFT_SUCCESS
)
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
cufftSetCompatibilityMode
(
fftForward
,
CUFFT_COMPATIBILITY_NATIVE
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
cufftSetCompatibilityMode
(
fftBackward
,
CUFFT_COMPATIBILITY_NATIVE
);
cufftResult
result
=
cufftPlan3d
(
&
fftForward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_D2Z
:
CUFFT_R2C
);
if
(
result
!=
CUFFT_SUCCESS
)
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
result
=
cufftPlan3d
(
&
fftBackward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2D
:
CUFFT_C2R
);
if
(
result
!=
CUFFT_SUCCESS
)
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
cufftSetCompatibilityMode
(
fftForward
,
CUFFT_COMPATIBILITY_NATIVE
);
cufftSetCompatibilityMode
(
fftBackward
,
CUFFT_COMPATIBILITY_NATIVE
);
hasInitializedFFT
=
true
;
// Initialize the b-spline moduli.
int
maxSize
=
max
(
max
(
gridSizeX
,
gridSizeY
),
gridSizeZ
);
vector
<
double
>
data
(
PmeOrder
);
vector
<
double
>
ddata
(
PmeOrder
);
vector
<
double
>
bsplines_data
(
maxSize
);
data
[
PmeOrder
-
1
]
=
0.0
;
data
[
1
]
=
0.0
;
data
[
0
]
=
1.0
;
for
(
int
i
=
3
;
i
<
PmeOrder
;
i
++
)
{
double
div
=
1.0
/
(
i
-
1.0
);
data
[
i
-
1
]
=
0.0
;
for
(
int
j
=
1
;
j
<
(
i
-
1
);
j
++
)
data
[
i
-
j
-
1
]
=
div
*
(
j
*
data
[
i
-
j
-
2
]
+
(
i
-
j
)
*
data
[
i
-
j
-
1
]);
data
[
0
]
=
div
*
data
[
0
];
}
hasInitializedFFT
=
true
;
// Differentiate.
// Initialize the b-spline moduli.
ddata
[
0
]
=
-
data
[
0
];
for
(
int
i
=
1
;
i
<
PmeOrder
;
i
++
)
int
maxSize
=
max
(
max
(
gridSizeX
,
gridSizeY
),
gridSizeZ
);
ddata
[
i
]
=
data
[
i
-
1
]
-
data
[
i
];
vector
<
double
>
data
(
PmeOrder
);
double
div
=
1.0
/
(
PmeOrder
-
1
);
vector
<
double
>
ddata
(
PmeOrder
);
data
[
PmeOrder
-
1
]
=
0.0
;
vector
<
double
>
bsplines_data
(
maxSize
);
for
(
int
i
=
1
;
i
<
(
PmeOrder
-
1
);
i
++
)
data
[
PmeOrder
-
1
]
=
0.0
;
data
[
PmeOrder
-
i
-
1
]
=
div
*
(
i
*
data
[
PmeOrder
-
i
-
2
]
+
(
PmeOrder
-
i
)
*
data
[
PmeOrder
-
i
-
1
]);
data
[
1
]
=
0.0
;
data
[
0
]
=
1.0
;
for
(
int
i
=
3
;
i
<
PmeOrder
;
i
++
)
{
double
div
=
1.0
/
(
i
-
1.0
);
data
[
i
-
1
]
=
0.0
;
for
(
int
j
=
1
;
j
<
(
i
-
1
);
j
++
)
data
[
i
-
j
-
1
]
=
div
*
(
j
*
data
[
i
-
j
-
2
]
+
(
i
-
j
)
*
data
[
i
-
j
-
1
]);
data
[
0
]
=
div
*
data
[
0
];
data
[
0
]
=
div
*
data
[
0
];
}
for
(
int
i
=
0
;
i
<
maxSize
;
i
++
)
bsplines_data
[
i
]
=
0.0
;
// Differentiate.
for
(
int
i
=
1
;
i
<=
PmeOrder
;
i
++
)
bsplines_data
[
i
]
=
data
[
i
-
1
];
ddata
[
0
]
=
-
data
[
0
];
for
(
int
i
=
1
;
i
<
PmeOrder
;
i
++
)
// Evaluate the actual bspline moduli for X/Y/Z.
ddata
[
i
]
=
data
[
i
-
1
]
-
data
[
i
];
double
div
=
1.0
/
(
PmeOrder
-
1
);
for
(
int
dim
=
0
;
dim
<
3
;
dim
++
)
{
data
[
PmeOrder
-
1
]
=
0.0
;
int
ndata
=
(
dim
==
0
?
gridSizeX
:
dim
==
1
?
gridSizeY
:
gridSizeZ
);
for
(
int
i
=
1
;
i
<
(
PmeOrder
-
1
);
i
++
)
vector
<
double
>
moduli
(
ndata
);
data
[
PmeOrder
-
i
-
1
]
=
div
*
(
i
*
data
[
PmeOrder
-
i
-
2
]
+
(
PmeOrder
-
i
)
*
data
[
PmeOrder
-
i
-
1
]);
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
{
data
[
0
]
=
div
*
data
[
0
];
double
sc
=
0.0
;
for
(
int
i
=
0
;
i
<
maxSize
;
i
++
)
double
ss
=
0.0
;
bsplines_data
[
i
]
=
0.0
;
for
(
int
j
=
0
;
j
<
ndata
;
j
++
)
{
for
(
int
i
=
1
;
i
<=
PmeOrder
;
i
++
)
double
arg
=
(
2.0
*
M_PI
*
i
*
j
)
/
ndata
;
bsplines_data
[
i
]
=
data
[
i
-
1
];
sc
+=
bsplines_data
[
j
]
*
cos
(
arg
);
ss
+=
bsplines_data
[
j
]
*
sin
(
arg
);
// Evaluate the actual bspline moduli for X/Y/Z.
}
moduli
[
i
]
=
sc
*
sc
+
ss
*
ss
;
for
(
int
dim
=
0
;
dim
<
3
;
dim
++
)
{
int
ndata
=
(
dim
==
0
?
gridSizeX
:
dim
==
1
?
gridSizeY
:
gridSizeZ
);
vector
<
double
>
moduli
(
ndata
);
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
{
double
sc
=
0.0
;
double
ss
=
0.0
;
for
(
int
j
=
0
;
j
<
ndata
;
j
++
)
{
double
arg
=
(
2.0
*
M_PI
*
i
*
j
)
/
ndata
;
sc
+=
bsplines_data
[
j
]
*
cos
(
arg
);
ss
+=
bsplines_data
[
j
]
*
sin
(
arg
);
}
}
moduli
[
i
]
=
sc
*
sc
+
ss
*
ss
;
}
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
if
(
moduli
[
i
]
<
1.0e-7
)
moduli
[
i
]
=
(
moduli
[
i
-
1
]
+
moduli
[
i
+
1
])
*
0.5
;
if
(
cu
.
getUseDoublePrecision
())
{
if
(
dim
==
0
)
pmeBsplineModuliX
->
upload
(
moduli
);
else
if
(
dim
==
1
)
pmeBsplineModuliY
->
upload
(
moduli
);
else
pmeBsplineModuliZ
->
upload
(
moduli
);
}
else
{
vector
<
float
>
modulif
(
ndata
);
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
modulif
[
i
]
=
(
float
)
moduli
[
i
];
if
(
moduli
[
i
]
<
1.0e-7
)
if
(
dim
==
0
)
moduli
[
i
]
=
(
moduli
[
i
-
1
]
+
moduli
[
i
+
1
])
*
0.5
;
pmeBsplineModuliX
->
upload
(
modulif
);
if
(
cu
.
getUseDoublePrecision
())
{
else
if
(
dim
==
1
)
if
(
dim
==
0
)
pmeBsplineModuliY
->
upload
(
modulif
);
pmeBsplineModuliX
->
upload
(
moduli
);
else
else
if
(
dim
==
1
)
pmeBsplineModuliZ
->
upload
(
modulif
);
pmeBsplineModuliY
->
upload
(
moduli
);
else
pmeBsplineModuliZ
->
upload
(
moduli
);
}
else
{
vector
<
float
>
modulif
(
ndata
);
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
modulif
[
i
]
=
(
float
)
moduli
[
i
];
if
(
dim
==
0
)
pmeBsplineModuliX
->
upload
(
modulif
);
else
if
(
dim
==
1
)
pmeBsplineModuliY
->
upload
(
modulif
);
else
pmeBsplineModuliZ
->
upload
(
modulif
);
}
}
}
}
}
}
}
...
@@ -1654,13 +1725,14 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1654,13 +1725,14 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
}
}
double
CudaCalcNonbondedForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
bool
includeDirect
,
bool
includeReciprocal
)
{
double
CudaCalcNonbondedForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
,
bool
includeDirect
,
bool
includeReciprocal
)
{
if
(
cosSinSums
!=
NULL
&&
cu
.
getContextIndex
()
==
0
&&
includeReciprocal
)
{
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
if
(
cosSinSums
!=
NULL
&&
includeReciprocal
)
{
void
*
sumsArgs
[]
=
{
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cosSinSums
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
()};
void
*
sumsArgs
[]
=
{
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cosSinSums
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
ewaldSumsKernel
,
sumsArgs
,
cosSinSums
->
getSize
());
cu
.
executeKernel
(
ewaldSumsKernel
,
sumsArgs
,
cosSinSums
->
getSize
());
void
*
forcesArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cosSinSums
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
()};
void
*
forcesArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cosSinSums
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
}
}
if
(
directPmeGrid
!=
NULL
&&
cu
.
getContextIndex
()
==
0
&&
includeReciprocal
)
{
if
(
directPmeGrid
!=
NULL
&&
includeReciprocal
)
{
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
());
...
@@ -1699,7 +1771,6 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1699,7 +1771,6 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
(),
128
);
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
(),
128
);
}
}
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
if
(
dispersionCoefficient
!=
0.0
&&
includeDirect
)
{
double4
boxSize
=
cu
.
getPeriodicBoxSize
();
double4
boxSize
=
cu
.
getPeriodicBoxSize
();
energy
+=
dispersionCoefficient
/
(
boxSize
.
x
*
boxSize
.
y
*
boxSize
.
z
);
energy
+=
dispersionCoefficient
/
(
boxSize
.
x
*
boxSize
.
y
*
boxSize
.
z
);
...
...
platforms/cuda/src/CudaKernels.h
View file @
932a61b6
...
@@ -34,6 +34,7 @@
...
@@ -34,6 +34,7 @@
#include "CudaSort.h"
#include "CudaSort.h"
#include "openmm/kernels.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
#include "openmm/System.h"
#include "CpuPme.h"
#include <cufft.h>
#include <cufft.h>
namespace
OpenMM
{
namespace
OpenMM
{
...
@@ -557,7 +558,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
...
@@ -557,7 +558,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
,
cpuPme
(
NULL
),
pmeio
(
NULL
)
{
}
}
~
CudaCalcNonbondedForceKernel
();
~
CudaCalcNonbondedForceKernel
();
/**
/**
...
@@ -596,6 +597,9 @@ private:
...
@@ -596,6 +597,9 @@ private:
const
char
*
getMaxValue
()
const
{
return
"make_int2(INT_MAX, INT_MAX)"
;}
const
char
*
getMaxValue
()
const
{
return
"make_int2(INT_MAX, INT_MAX)"
;}
const
char
*
getSortKey
()
const
{
return
"value.y"
;}
const
char
*
getSortKey
()
const
{
return
"value.y"
;}
};
};
class
PmeIO
;
class
PmePreComputation
;
class
PmePostComputation
;
CudaContext
&
cu
;
CudaContext
&
cu
;
bool
hasInitializedFFT
;
bool
hasInitializedFFT
;
CudaArray
*
sigmaEpsilon
;
CudaArray
*
sigmaEpsilon
;
...
@@ -609,6 +613,8 @@ private:
...
@@ -609,6 +613,8 @@ private:
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
pmeAtomGridIndex
;
CudaSort
*
sort
;
CudaSort
*
sort
;
CpuPme
*
cpuPme
;
PmeIO
*
pmeio
;
cufftHandle
fftForward
;
cufftHandle
fftForward
;
cufftHandle
fftBackward
;
cufftHandle
fftBackward
;
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldSumsKernel
;
...
...
platforms/cuda/src/kernels/pme.cu
View file @
932a61b6
...
@@ -266,8 +266,18 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
...
@@ -266,8 +266,18 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
}
}
}
}
real
q
=
pos
.
w
*
EPSILON_FACTOR
;
real
q
=
pos
.
w
*
EPSILON_FACTOR
;
forceBuffers
[
atom
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
q
*
force
.
x
*
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
0x100000000
));
forceBuffers
[
atom
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
q
*
force
.
x
*
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
0x100000000
));
forceBuffers
[
atom
+
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
q
*
force
.
y
*
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
0x100000000
));
forceBuffers
[
atom
+
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
q
*
force
.
y
*
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
0x100000000
));
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
q
*
force
.
z
*
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
0x100000000
));
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
q
*
force
.
z
*
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
0x100000000
));
}
}
}
}
extern
"C"
__global__
void
addForces
(
const
real4
*
__restrict__
forces
,
unsigned
long
long
*
__restrict__
forceBuffers
)
{
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
f
=
forces
[
atom
];
forceBuffers
[
atom
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
x
*
0x100000000
));
forceBuffers
[
atom
+
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
y
*
0x100000000
));
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
]
+=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
z
*
0x100000000
));
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment