Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
1010df33
Commit
1010df33
authored
Feb 19, 2009
by
Peter Eastman
Browse files
Checking in Cuda implementation of explicit solvent
parent
df4b64cb
Changes
20
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1260 additions
and
1428 deletions
+1260
-1428
platforms/cuda/CMakeLists.txt
platforms/cuda/CMakeLists.txt
+97
-87
platforms/cuda/cudpp/mac/libcudpp.a
platforms/cuda/cudpp/mac/libcudpp.a
+0
-0
platforms/cuda/cudpp/mac/libcutil.a
platforms/cuda/cudpp/mac/libcutil.a
+0
-0
platforms/cuda/cudpp/win/cudpp.lib
platforms/cuda/cudpp/win/cudpp.lib
+0
-0
platforms/cuda/cudpp/win/cudpp32.dll
platforms/cuda/cudpp/win/cudpp32.dll
+0
-0
platforms/cuda/cudpp/win/cutil.lib
platforms/cuda/cudpp/win/cutil.lib
+0
-0
platforms/cuda/src/CudaStreamFactory.cpp
platforms/cuda/src/CudaStreamFactory.cpp
+9
-11
platforms/cuda/src/CudaStreamImpl.h
platforms/cuda/src/CudaStreamImpl.h
+14
-12
platforms/cuda/src/kernels/cudaKernels.h
platforms/cuda/src/kernels/cudaKernels.h
+4
-8
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+27
-2
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+567
-120
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+34
-9
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+93
-308
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+110
-374
platforms/cuda/src/kernels/kCalculateLocalForces.cu
platforms/cuda/src/kernels/kCalculateLocalForces.cu
+147
-41
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+76
-171
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+59
-279
platforms/cuda/src/kernels/kForces.cu
platforms/cuda/src/kernels/kForces.cu
+2
-2
platforms/cuda/src/kernels/kVerletUpdate.cu
platforms/cuda/src/kernels/kVerletUpdate.cu
+0
-3
platforms/reference/src/ReferenceKernels.cpp
platforms/reference/src/ReferenceKernels.cpp
+21
-1
No files found.
platforms/cuda/CMakeLists.txt
View file @
1010df33
...
@@ -84,4 +84,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
...
@@ -84,4 +84,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET
(
FINDCUDA_DIR
${
CMAKE_CURRENT_SOURCE_DIR
}
/cuda-cmake
)
SET
(
FINDCUDA_DIR
${
CMAKE_CURRENT_SOURCE_DIR
}
/cuda-cmake
)
IF
(
APPLE
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/mac
)
ELSE
(
APPLE
)
IF
(
WIN32
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/win
)
INSTALL_FILES
(
/lib FILES
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/win/cudpp32.dll
)
ELSE
(
WIN32
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/linux
)
ENDIF
(
WIN32
)
ENDIF
(
APPLE
)
SUBDIRS
(
sharedTarget staticTarget
)
SUBDIRS
(
sharedTarget staticTarget
)
platforms/cuda/cudpp/mac/libcudpp.a
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/mac/libcutil.a
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cudpp.lib
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cudpp32.dll
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cutil.lib
0 → 100644
View file @
1010df33
File added
platforms/cuda/src/CudaStreamFactory.cpp
View file @
1010df33
...
@@ -39,42 +39,40 @@
...
@@ -39,42 +39,40 @@
using
namespace
OpenMM
;
using
namespace
OpenMM
;
StreamImpl
*
CudaStreamFactory
::
createStreamImpl
(
std
::
string
name
,
int
size
,
Stream
::
DataType
type
,
const
Platform
&
platform
,
OpenMMContextImpl
&
context
)
const
{
StreamImpl
*
CudaStreamFactory
::
createStreamImpl
(
std
::
string
name
,
int
size
,
Stream
::
DataType
type
,
const
Platform
&
platform
,
OpenMMContextImpl
&
context
)
const
{
if
(
name
==
"particlePositions"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
if
(
name
==
"particlePositions"
)
{
float
padding
[]
=
{
100000.0
f
,
100000.0
f
,
100000.0
f
,
0.2
f
};
float
padding
[]
=
{
100000.0
f
,
100000.0
f
,
100000.0
f
,
0.2
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psPosq4
,
4
,
padding
,
data
.
gpu
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psPosq4
,
4
,
padding
,
data
.
gpu
);
}
}
if
(
name
==
"particleVelocities"
)
{
if
(
name
==
"particleVelocities"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psVelm4
,
4
,
padding
,
data
.
gpu
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psVelm4
,
4
,
padding
,
data
.
gpu
);
}
}
if
(
name
==
"particleForces"
)
{
if
(
name
==
"particleForces"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psForce4
,
4
,
padding
,
data
.
gpu
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psForce4
,
4
,
padding
,
data
.
gpu
);
}
}
switch
(
type
)
{
switch
(
type
)
{
case
Stream
::
Float
:
case
Stream
::
Float
:
case
Stream
::
Double
:
case
Stream
::
Double
:
return
new
CudaStreamImpl
<
float1
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float1
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float2
:
case
Stream
::
Float2
:
case
Stream
::
Double2
:
case
Stream
::
Double2
:
return
new
CudaStreamImpl
<
float2
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float2
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float3
:
case
Stream
::
Float3
:
case
Stream
::
Double3
:
case
Stream
::
Double3
:
return
new
CudaStreamImpl
<
float3
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float3
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float4
:
case
Stream
::
Float4
:
case
Stream
::
Double4
:
case
Stream
::
Double4
:
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer
:
case
Stream
::
Integer
:
return
new
CudaStreamImpl
<
int1
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int1
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer2
:
case
Stream
::
Integer2
:
return
new
CudaStreamImpl
<
int2
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int2
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer3
:
case
Stream
::
Integer3
:
return
new
CudaStreamImpl
<
int3
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int3
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer4
:
case
Stream
::
Integer4
:
return
new
CudaStreamImpl
<
int4
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int4
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
}
}
throw
OpenMMException
(
"Tried to create a Stream with an illegal DataType."
);
throw
OpenMMException
(
"Tried to create a Stream with an illegal DataType."
);
}
}
platforms/cuda/src/CudaStreamImpl.h
View file @
1010df33
...
@@ -132,23 +132,24 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
...
@@ -132,23 +132,24 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
template
<
class
T
>
template
<
class
T
>
void
CudaStreamImpl
<
T
>::
loadFromArray
(
const
void
*
array
)
{
void
CudaStreamImpl
<
T
>::
loadFromArray
(
const
void
*
array
)
{
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
int
*
order
=
gpu
->
psAtomIndex
->
_pSysData
;
if
(
baseType
==
Stream
::
Float
)
{
if
(
baseType
==
Stream
::
Float
)
{
float
*
arrayData
=
(
float
*
)
array
;
float
*
arrayData
=
(
float
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
arrayData
[
order
[
i
]
*
width
+
j
];
}
}
else
if
(
baseType
==
Stream
::
Double
)
{
else
if
(
baseType
==
Stream
::
Double
)
{
double
*
arrayData
=
(
double
*
)
array
;
double
*
arrayData
=
(
double
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
order
[
i
]
*
width
+
j
];
}
}
else
{
else
{
int
*
arrayData
=
(
int
*
)
array
;
int
*
arrayData
=
(
int
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
order
[
i
]
*
width
+
j
];
}
}
for
(
int
i
=
getSize
();
i
<
(
int
)
stream
->
_length
;
++
i
)
for
(
int
i
=
getSize
();
i
<
(
int
)
stream
->
_length
;
++
i
)
for
(
int
j
=
0
;
j
<
rowOffset
;
++
j
)
for
(
int
j
=
0
;
j
<
rowOffset
;
++
j
)
...
@@ -167,23 +168,24 @@ template <class T>
...
@@ -167,23 +168,24 @@ template <class T>
void
CudaStreamImpl
<
T
>::
saveToArray
(
void
*
array
)
{
void
CudaStreamImpl
<
T
>::
saveToArray
(
void
*
array
)
{
stream
->
Download
();
stream
->
Download
();
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
int
*
order
=
gpu
->
psAtomIndex
->
_pSysData
;
if
(
baseType
==
Stream
::
Float
)
{
if
(
baseType
==
Stream
::
Float
)
{
float
*
arrayData
=
(
float
*
)
array
;
float
*
arrayData
=
(
float
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
}
}
else
if
(
baseType
==
Stream
::
Double
)
{
else
if
(
baseType
==
Stream
::
Double
)
{
double
*
arrayData
=
(
double
*
)
array
;
double
*
arrayData
=
(
double
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
}
}
else
{
else
{
int
*
arrayData
=
(
int
*
)
array
;
int
*
arrayData
=
(
int
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
(
int
)
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
(
int
)
data
[
i
*
rowOffset
+
j
];
}
}
}
}
...
...
platforms/cuda/src/kernels/cudaKernels.h
View file @
1010df33
...
@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
...
@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
extern
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateCDLJObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kCalculateCDLJObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces_12
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kReduceObcGbsaBornForces
(
gpuContext
gpu
);
extern
void
kReduceObcGbsaBornForces
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2_12
(
gpuContext
gpu
);
extern
void
kCalculateLocalForces
(
gpuContext
gpu
);
extern
void
kCalculateLocalForces
(
gpuContext
gpu
);
extern
void
kCalculateAndersenThermostat
(
gpuContext
gpu
);
extern
void
kCalculateAndersenThermostat
(
gpuContext
gpu
);
extern
void
kReduceBornSumAndForces
(
gpuContext
gpu
);
extern
void
kReduceBornSumAndForces
(
gpuContext
gpu
);
extern
void
kUpdatePart1
(
gpuContext
gpu
);
extern
void
kUpdatePart1
(
gpuContext
gpu
);
extern
void
kApplyFirstShake
(
gpuContext
gpu
);
extern
void
kApplyFirstShake
(
gpuContext
gpu
);
extern
void
kApplyFirstSettle
(
gpuContext
gpu
);
extern
void
kUpdatePart2
(
gpuContext
gpu
);
extern
void
kUpdatePart2
(
gpuContext
gpu
);
extern
void
kApplySecondShake
(
gpuContext
gpu
);
extern
void
kApplySecondShake
(
gpuContext
gpu
);
extern
void
kApplySecondSettle
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart1
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart1
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart2
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart2
(
gpuContext
gpu
);
extern
void
kBrownianUpdatePart1
(
gpuContext
gpu
);
extern
void
kBrownianUpdatePart1
(
gpuContext
gpu
);
...
@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
...
@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
// Initializers
// Initializers
extern
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForces_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForces_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
);
...
@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
...
@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern
void
GetCalculateObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
SetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
GetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
GetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
SetForcesSim
(
gpuContext
gpu
);
extern
void
SetForcesSim
(
gpuContext
gpu
);
extern
void
GetForcesSim
(
gpuContext
gpu
);
extern
void
GetForcesSim
(
gpuContext
gpu
);
extern
void
SetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
SetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
GetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
GetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
SetSettleSim
(
gpuContext
gpu
);
extern
void
GetSettleSim
(
gpuContext
gpu
);
extern
void
SetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
SetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
GetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
GetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
SetBrownianUpdateSim
(
gpuContext
gpu
);
extern
void
SetBrownianUpdateSim
(
gpuContext
gpu
);
...
...
platforms/cuda/src/kernels/cudatypes.h
View file @
1010df33
...
@@ -36,11 +36,12 @@
...
@@ -36,11 +36,12 @@
#include <limits>
#include <limits>
#include <iostream>
#include <iostream>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <builtin_types.h>
#include <vector_functions.h>
#include <vector_functions.h>
using
namespace
std
;
#define RTERROR(status, s) \
#define RTERROR(status, s) \
if
(
status
!=
cudaSuccess
)
{
\
if
(
status
!=
cudaSuccess
)
{
\
...
@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
...
@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static
const
int
G8X_NONBOND_WORKUNITS_PER_SM
=
220
;
static
const
int
G8X_NONBOND_WORKUNITS_PER_SM
=
220
;
static
const
int
GT2XX_NONBOND_WORKUNITS_PER_SM
=
256
;
static
const
int
GT2XX_NONBOND_WORKUNITS_PER_SM
=
256
;
enum
CudaNonbondedMethod
{
NO_CUTOFF
,
CUTOFF
,
PERIODIC
};
struct
cudaGmxSimulation
{
struct
cudaGmxSimulation
{
// Constants
// Constants
...
@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
...
@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
unsigned
int
blocks
;
// Number of blocks to launch across linear kernels
unsigned
int
blocks
;
// Number of blocks to launch across linear kernels
unsigned
int
nonbond_blocks
;
// Number of blocks to launch across CDLJ and Born Force Part1
unsigned
int
nonbond_blocks
;
// Number of blocks to launch across CDLJ and Born Force Part1
unsigned
int
bornForce2_blocks
;
// Number of blocks to launch across Born Force 2
unsigned
int
bornForce2_blocks
;
// Number of blocks to launch across Born Force 2
unsigned
int
interaction_blocks
;
// Number of blocks to launch when identifying interacting tiles
unsigned
int
threads_per_block
;
// Threads per block to launch
unsigned
int
threads_per_block
;
// Threads per block to launch
unsigned
int
nonbond_threads_per_block
;
// Threads per block in nonbond kernel calls
unsigned
int
nonbond_threads_per_block
;
// Threads per block in nonbond kernel calls
unsigned
int
bornForce2_threads_per_block
;
// Threads per block in nonbond kernel calls
unsigned
int
bornForce2_threads_per_block
;
// Threads per block in nonbond kernel calls
...
@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
...
@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
unsigned
int
bsf_reduce_threads_per_block
;
// Threads per block in Born Sum And Forces reduction calls
unsigned
int
bsf_reduce_threads_per_block
;
// Threads per block in Born Sum And Forces reduction calls
unsigned
int
max_shake_threads_per_block
;
// Maximum threads per block in shake kernel calls
unsigned
int
max_shake_threads_per_block
;
// Maximum threads per block in shake kernel calls
unsigned
int
shake_threads_per_block
;
// Threads per block in shake kernel calls
unsigned
int
shake_threads_per_block
;
// Threads per block in shake kernel calls
unsigned
int
settle_threads_per_block
;
// Threads per block in SETTLE kernel calls
unsigned
int
nonshake_threads_per_block
;
// Threads per block in nonshaking kernel call
unsigned
int
nonshake_threads_per_block
;
// Threads per block in nonshaking kernel call
unsigned
int
max_localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
max_localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
random_threads_per_block
;
// Threads per block in RNG kernel calls
unsigned
int
random_threads_per_block
;
// Threads per block in RNG kernel calls
unsigned
int
interaction_threads_per_block
;
// Threads per block when identifying interacting tiles
unsigned
int
workUnits
;
// Number of work units
unsigned
int
workUnits
;
// Number of work units
unsigned
int
*
pWorkUnit
;
// Pointer to work units
unsigned
int
*
pWorkUnit
;
// Pointer to work units
unsigned
int
*
pInteractingWorkUnit
;
// Pointer to work units that have interactions
unsigned
int
*
pInteractionFlag
;
// Flags for which work units have interactions
size_t
*
pInteractionCount
;
// A count of the number of work units which have interactions
unsigned
int
nonbond_workBlock
;
// Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned
int
nonbond_workBlock
;
// Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned
int
bornForce2_workBlock
;
// Number of work units running second half of Born Forces calculation
unsigned
int
bornForce2_workBlock
;
// Number of work units running second half of Born Forces calculation
unsigned
int
workUnitsPerSM
;
// Number of workblocks per SM
unsigned
int
workUnitsPerSM
;
// Number of workblocks per SM
...
@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
...
@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
unsigned
int
outputBuffers
;
// Number of output buffers
unsigned
int
outputBuffers
;
// Number of output buffers
float
bigFloat
;
// Floating point value used as a flag for Shaken atoms
float
bigFloat
;
// Floating point value used as a flag for Shaken atoms
float
epsfac
;
// Epsilon factor for CDLJ calculations
float
epsfac
;
// Epsilon factor for CDLJ calculations
CudaNonbondedMethod
nonbondedMethod
;
// How to handle nonbonded interactions
float
nonbondedCutoffSqr
;
// Cutoff distance for CDLJ calculations
float
periodicBoxSizeX
;
// The X dimension of the periodic box
float
periodicBoxSizeY
;
// The Y dimension of the periodic box
float
periodicBoxSizeZ
;
// The Z dimension of the periodic box
float
reactionFieldK
;
// Constant for reaction field correction
float
probeRadius
;
// SASA probe radius
float
probeRadius
;
// SASA probe radius
float
surfaceAreaFactor
;
// ACE approximation surface area factor
float
surfaceAreaFactor
;
// ACE approximation surface area factor
float
electricConstant
;
// ACE approximation electric constant
float
electricConstant
;
// ACE approximation electric constant
...
@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
...
@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
float4
*
pLJ14Parameter
;
// Lennard Jones 1-4 parameters
float4
*
pLJ14Parameter
;
// Lennard Jones 1-4 parameters
float
inverseTotalMass
;
// Used in linear momentum removal
float
inverseTotalMass
;
// Used in linear momentum removal
unsigned
int
ShakeConstraints
;
// Total number of Shake constraints
unsigned
int
ShakeConstraints
;
// Total number of Shake constraints
unsigned
int
settleConstraints
;
// Total number of Settle constraints
unsigned
int
NonShakeConstraints
;
// Total number of NonShake atoms
unsigned
int
NonShakeConstraints
;
// Total number of NonShake atoms
unsigned
int
maxShakeIterations
;
// Maximum shake iterations
unsigned
int
maxShakeIterations
;
// Maximum shake iterations
unsigned
int
degreesOfFreedom
;
// Number of degrees of freedom in system
unsigned
int
degreesOfFreedom
;
// Number of degrees of freedom in system
...
@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
...
@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
int
*
pNonShakeID
;
// Not Shaking atoms
int
*
pNonShakeID
;
// Not Shaking atoms
int4
*
pShakeID
;
// Shake atoms and phase
int4
*
pShakeID
;
// Shake atoms and phase
float4
*
pShakeParameter
;
// Shake parameters
float4
*
pShakeParameter
;
// Shake parameters
int4
*
pSettleID
;
// Settle atoms
float2
*
pSettleParameter
;
// Settle parameters
unsigned
int
*
pExclusion
;
// Nonbond exclusion data
unsigned
int
*
pExclusion
;
// Nonbond exclusion data
unsigned
int
bond_offset
;
// Offset to end of bonds
unsigned
int
bond_offset
;
// Offset to end of bonds
unsigned
int
bond_angle_offset
;
// Offset to end of bond angles
unsigned
int
bond_angle_offset
;
// Offset to end of bond angles
unsigned
int
dihedral_offset
;
// Offset to end of dihedrals
unsigned
int
dihedral_offset
;
// Offset to end of dihedrals
unsigned
int
rb_dihedral_offset
;
// Offset to end of Ryckaert Bellemans dihedrals
unsigned
int
rb_dihedral_offset
;
// Offset to end of Ryckaert Bellemans dihedrals
unsigned
int
LJ14_offset
;
// Offset to end of Lennard Jones 1-4 parameters
unsigned
int
LJ14_offset
;
// Offset to end of Lennard Jones 1-4 parameters
int
*
pAtomIndex
;
// The original index of each atom
float4
*
pGridBoundingBox
;
// The size of each grid cell
float4
*
pGridCenter
;
// The center of each grid cell
// Mutable stuff
// Mutable stuff
float4
*
pPosq
;
// Pointer to atom positions and charges
float4
*
pPosq
;
// Pointer to atom positions and charges
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
1010df33
...
@@ -30,6 +30,7 @@
...
@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda.h>
#include <vector_functions.h>
#include <vector_functions.h>
#include <cstdlib>
#include <cstdlib>
...
@@ -40,6 +41,7 @@
...
@@ -40,6 +41,7 @@
#include <ctime>
#include <ctime>
#include <cmath>
#include <cmath>
#include <map>
#include <map>
#include <algorithm>
#ifdef WIN32
#ifdef WIN32
#include <windows.h>
#include <windows.h>
#else
#else
...
@@ -148,6 +150,23 @@ struct ShakeCluster {
...
@@ -148,6 +150,23 @@ struct ShakeCluster {
}
}
};
};
struct
Constraint
{
Constraint
(
int
atom1
,
int
atom2
,
float
distance2
)
:
atom1
(
atom1
),
atom2
(
atom2
),
distance2
(
distance2
)
{
}
int
atom1
,
atom2
;
float
distance2
;
};
struct
Molecule
{
vector
<
int
>
atoms
;
vector
<
int
>
bonds
;
vector
<
int
>
angles
;
vector
<
int
>
periodicTorsions
;
vector
<
int
>
rbTorsions
;
vector
<
int
>
constraints
;
};
static
const
float
dielectricOffset
=
0.009
f
;
static
const
float
dielectricOffset
=
0.009
f
;
static
const
float
PI
=
3.1415926535
f
;
static
const
float
PI
=
3.1415926535
f
;
static
const
float
probeRadius
=
0.14
f
;
static
const
float
probeRadius
=
0.14
f
;
...
@@ -793,7 +812,7 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
...
@@ -793,7 +812,7 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
}
}
}
}
cout
<<
total_exclusions
<<
" total exclusions.
\n
"
;
cout
<<
total_exclusions
<<
" total exclusions.
\n
"
;
gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions);
gpuSetCoulombParameters
(
gpu
,
epsfac
,
atom
,
c6
,
c12
,
q
,
symbol
,
exclusions
,
NO_CUTOFF
);
gpuSetObcParameters
(
gpu
,
defaultInnerDielectric
,
defaultSolventDielectric
,
atom
,
radius
,
scale
);
gpuSetObcParameters
(
gpu
,
defaultInnerDielectric
,
defaultSolventDielectric
,
atom
,
radius
,
scale
);
return
coulombs
;
return
coulombs
;
}
}
...
@@ -807,11 +826,12 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
...
@@ -807,11 +826,12 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
extern
"C"
extern
"C"
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
vector
<
int
>&
atom
,
const
vector
<
float
>&
c6
,
const
vector
<
float
>&
c12
,
const
vector
<
float
>&
q
,
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
vector
<
int
>&
atom
,
const
vector
<
float
>&
c6
,
const
vector
<
float
>&
c12
,
const
vector
<
float
>&
q
,
const vector<char>& symbol, const vector<vector<int> >& exclusions)
const
vector
<
char
>&
symbol
,
const
vector
<
vector
<
int
>
>&
exclusions
,
CudaNonbondedMethod
method
)
{
{
unsigned
int
coulombs
=
atom
.
size
();
unsigned
int
coulombs
=
atom
.
size
();
gpu
->
sim
.
epsfac
=
epsfac
;
gpu
->
sim
.
epsfac
=
epsfac
;
unsigned int total_exclusions = 0;
gpu
->
sim
.
nonbondedMethod
=
method
;
gpu
->
exclusions
=
exclusions
;
for
(
unsigned
int
i
=
0
;
i
<
coulombs
;
i
++
)
for
(
unsigned
int
i
=
0
;
i
<
coulombs
;
i
++
)
{
{
...
@@ -827,34 +847,6 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
...
@@ -827,34 +847,6 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu
->
psPosq4
->
_pSysStream
[
0
][
i
].
w
=
p0
;
gpu
->
psPosq4
->
_pSysStream
[
0
][
i
].
w
=
p0
;
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
x
=
p1
;
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
x
=
p1
;
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
y
=
p2
;
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
y
=
p2
;
#if (DUMP_PARAMETERS == 1)
cout <<
i << " " <<
gpu->psPosq4->_pSysStream[0][i].w << " " <<
gpu->psSigEps2->_pSysStream[0][i].x << " " <<
gpu->psSigEps2->_pSysStream[0][i].y << " " <<
p0 << " " <<
p1 << " " <<
p2 << " " <<
exclusions;
#endif
for (int j = 0; j < (int) exclusions[i].size(); j++)
{
#if (DUMP_PARAMETERS == 1)
cout << " " << exclusions[i][j];
#endif
gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
if (i >= (int) exclusions[i][j])
{
total_exclusions++;
}
}
#if (DUMP_PARAMETERS == 1)
cout << endl;
#endif
}
}
// Dummy out extra atom data
// Dummy out extra atom data
...
@@ -868,29 +860,23 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
...
@@ -868,29 +860,23 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
y
=
0.0
f
;
gpu
->
psSigEps2
->
_pSysStream
[
0
][
i
].
y
=
0.0
f
;
}
}
// Add in remaining exclusions
for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
{
for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
{
gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
}
}
gpu
->
psPosq4
->
Upload
();
gpu
->
psPosq4
->
Upload
();
gpu
->
psSigEps2
->
Upload
();
gpu
->
psSigEps2
->
Upload
();
}
// Check for exclusion consistency
extern
"C"
for (unsigned int i = 0; i < coulombs; i++)
void
gpuSetNonbondedCutoff
(
gpuContext
gpu
,
float
cutoffDistance
,
float
solventDielectric
)
{
{
for (unsigned int j = i; j < coulombs; j++)
gpu
->
sim
.
nonbondedCutoffSqr
=
cutoffDistance
*
cutoffDistance
;
{
gpu
->
sim
.
reactionFieldK
=
pow
(
cutoffDistance
,
-
3.0
f
)
*
(
solventDielectric
-
1.0
f
)
/
(
2.0
f
*
solventDielectric
+
1.0
f
);
}
if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i])
extern
"C"
cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl;
void
gpuSetPeriodicBoxSize
(
gpuContext
gpu
,
float
xsize
,
float
ysize
,
float
zsize
)
}
{
}
gpu
->
sim
.
periodicBoxSizeX
=
xsize
;
gpu
->
sim
.
periodicBoxSizeY
=
ysize
;
gpu
->
sim
.
periodicBoxSizeZ
=
zsize
;
}
}
extern
"C"
extern
"C"
...
@@ -898,6 +884,7 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
...
@@ -898,6 +884,7 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
{
{
unsigned
int
atoms
=
atom
.
size
();
unsigned
int
atoms
=
atom
.
size
();
gpu
->
bIncludeGBSA
=
true
;
for
(
unsigned
int
i
=
0
;
i
<
atoms
;
i
++
)
for
(
unsigned
int
i
=
0
;
i
<
atoms
;
i
++
)
{
{
gpu
->
psObcData
->
_pSysStream
[
0
][
i
].
x
=
radius
[
i
]
-
dielectricOffset
;
gpu
->
psObcData
->
_pSysStream
[
0
][
i
].
x
=
radius
[
i
]
-
dielectricOffset
;
...
@@ -974,10 +961,90 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
...
@@ -974,10 +961,90 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
constraintCount
[
atom2
[
i
]]
++
;
constraintCount
[
atom2
[
i
]]
++
;
}
}
// Identify clusters of three atoms that can be treated with SETTLE. First, for every
// atom that might be part of such a cluster, make a list of the two other atoms it is
// connected to.
vector
<
map
<
int
,
float
>
>
settleConstraints
(
gpu
->
natoms
);
for
(
int
i
=
0
;
i
<
atom1
.
size
();
i
++
)
{
if
(
constraintCount
[
atom1
[
i
]]
==
2
&&
constraintCount
[
atom2
[
i
]]
==
2
)
{
settleConstraints
[
atom1
[
i
]][
atom2
[
i
]]
=
distance
[
i
];
settleConstraints
[
atom2
[
i
]][
atom1
[
i
]]
=
distance
[
i
];
}
}
// Now remove the ones that don't actually form closed loops of three atoms.
vector
<
int
>
settleClusters
;
for
(
int
i
=
0
;
i
<
settleConstraints
.
size
();
i
++
)
{
if
(
settleConstraints
[
i
].
size
()
==
2
)
{
int
partner1
=
settleConstraints
[
i
].
begin
()
->
first
;
int
partner2
=
(
++
settleConstraints
[
i
].
begin
())
->
first
;
if
(
settleConstraints
[
partner1
].
size
()
!=
2
||
settleConstraints
[
partner2
].
size
()
!=
2
||
settleConstraints
[
partner1
].
find
(
partner2
)
==
settleConstraints
[
partner1
].
end
())
settleConstraints
[
i
].
clear
();
else
if
(
i
<
partner1
&&
i
<
partner2
)
settleClusters
.
push_back
(
i
);
}
else
settleConstraints
[
i
].
clear
();
}
// Record the actual SETTLE clusters.
CUDAStream
<
int4
>*
psSettleID
=
new
CUDAStream
<
int4
>
((
int
)
settleClusters
.
size
(),
1
);
gpu
->
psSettleID
=
psSettleID
;
gpu
->
sim
.
pSettleID
=
psSettleID
->
_pDevStream
[
0
];
CUDAStream
<
float2
>*
psSettleParameter
=
new
CUDAStream
<
float2
>
((
int
)
settleClusters
.
size
(),
1
);
gpu
->
psSettleParameter
=
psSettleParameter
;
gpu
->
sim
.
pSettleParameter
=
psSettleParameter
->
_pDevStream
[
0
];
gpu
->
sim
.
settleConstraints
=
settleClusters
.
size
();
for
(
int
i
=
0
;
i
<
settleClusters
.
size
();
i
++
)
{
int
atom1
=
settleClusters
[
i
];
int
atom2
=
settleConstraints
[
atom1
].
begin
()
->
first
;
int
atom3
=
(
++
settleConstraints
[
atom1
].
begin
())
->
first
;
float
dist12
=
settleConstraints
[
atom1
].
find
(
atom2
)
->
second
;
float
dist13
=
settleConstraints
[
atom1
].
find
(
atom3
)
->
second
;
float
dist23
=
settleConstraints
[
atom2
].
find
(
atom3
)
->
second
;
if
(
dist12
==
dist13
)
{
// atom1 is the central atom
psSettleID
->
_pSysData
[
i
].
x
=
atom1
;
psSettleID
->
_pSysData
[
i
].
y
=
atom2
;
psSettleID
->
_pSysData
[
i
].
z
=
atom3
;
psSettleParameter
->
_pSysData
[
i
].
x
=
dist12
;
psSettleParameter
->
_pSysData
[
i
].
y
=
dist23
;
}
else
if
(
dist12
==
dist23
)
{
// atom2 is the central atom
psSettleID
->
_pSysData
[
i
].
x
=
atom2
;
psSettleID
->
_pSysData
[
i
].
y
=
atom1
;
psSettleID
->
_pSysData
[
i
].
z
=
atom3
;
psSettleParameter
->
_pSysData
[
i
].
x
=
dist12
;
psSettleParameter
->
_pSysData
[
i
].
y
=
dist13
;
}
else
if
(
dist13
==
dist23
)
{
// atom3 is the central atom
psSettleID
->
_pSysData
[
i
].
x
=
atom3
;
psSettleID
->
_pSysData
[
i
].
y
=
atom1
;
psSettleID
->
_pSysData
[
i
].
z
=
atom2
;
psSettleParameter
->
_pSysData
[
i
].
x
=
dist13
;
psSettleParameter
->
_pSysData
[
i
].
y
=
dist12
;
}
else
throw
OpenMMException
(
"Two of the three distances constrained with SETTLE must be the same."
);
}
psSettleID
->
Upload
();
psSettleParameter
->
Upload
();
gpu
->
sim
.
settle_threads_per_block
=
(
gpu
->
sim
.
settleConstraints
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
if
(
gpu
->
sim
.
settle_threads_per_block
>
gpu
->
sim
.
max_shake_threads_per_block
)
gpu
->
sim
.
settle_threads_per_block
=
gpu
->
sim
.
max_shake_threads_per_block
;
if
(
gpu
->
sim
.
settle_threads_per_block
<
1
)
gpu
->
sim
.
settle_threads_per_block
=
1
;
// Find clusters consisting of a central atom with up to three peripheral atoms.
// Find clusters consisting of a central atom with up to three peripheral atoms.
map
<
int
,
ShakeCluster
>
clusters
;
map
<
int
,
ShakeCluster
>
clusters
;
for
(
int
i
=
0
;
i
<
atom1
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
atom1
.
size
();
i
++
)
{
if
(
settleConstraints
[
atom1
[
i
]].
size
()
==
2
)
continue
;
// This is being taken care of with SETTLE.
// Determine which is the central atom.
// Determine which is the central atom.
bool
firstIsCentral
;
bool
firstIsCentral
;
...
@@ -1096,9 +1163,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
...
@@ -1096,9 +1163,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu
->
sim
.
degreesOfFreedom
=
3
*
gpu
->
sim
.
atoms
-
6
;
gpu
->
sim
.
degreesOfFreedom
=
3
*
gpu
->
sim
.
atoms
-
6
;
gpu
->
gpAtomTable
=
NULL
;
gpu
->
gpAtomTable
=
NULL
;
gpu
->
gAtomTypes
=
0
;
gpu
->
gAtomTypes
=
0
;
gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
gpu->sim.totalNonbondOutputBuffers = 2 * gpu->sim.nonbondOutputBuffers;
gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
gpu
->
psPosq4
=
new
CUDAStream
<
float4
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
psPosq4
=
new
CUDAStream
<
float4
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
sim
.
stride
=
gpu
->
psPosq4
->
_stride
;
gpu
->
sim
.
stride
=
gpu
->
psPosq4
->
_stride
;
gpu
->
sim
.
stride2
=
gpu
->
sim
.
stride
*
2
;
gpu
->
sim
.
stride2
=
gpu
->
sim
.
stride
*
2
;
...
@@ -1129,10 +1193,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
...
@@ -1129,10 +1193,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu
->
psObcData
=
new
CUDAStream
<
float2
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
psObcData
=
new
CUDAStream
<
float2
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
sim
.
pObcData
=
gpu
->
psObcData
->
_pDevStream
[
0
];
gpu
->
sim
.
pObcData
=
gpu
->
psObcData
->
_pDevStream
[
0
];
gpu
->
pAtomSymbol
=
new
unsigned
char
[
gpu
->
natoms
];
gpu
->
pAtomSymbol
=
new
unsigned
char
[
gpu
->
natoms
];
gpu
->
psAtomIndex
=
new
CUDAStream
<
int
>
(
gpu
->
sim
.
paddedNumberOfAtoms
,
1
);
gpu
->
sim
.
pAtomIndex
=
gpu
->
psAtomIndex
->
_pDevStream
[
0
];
for
(
int
i
=
0
;
i
<
(
int
)
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
gpu
->
psAtomIndex
->
_pSysStream
[
0
][
i
]
=
i
;
gpu
->
psAtomIndex
->
Upload
();
// Determine randoms
// Determine randoms
gpu
->
seed
=
(
unsigned
long
)
time
(
NULL
)
&
0x000fffff
;
gpu
->
seed
=
(
unsigned
long
)
time
(
NULL
)
&
0x000fffff
;
gpu->sim.randomFrames =
9
95;
gpu
->
sim
.
randomFrames
=
95
;
gpu
->
sim
.
randomIterations
=
gpu
->
sim
.
randomFrames
;
gpu
->
sim
.
randomIterations
=
gpu
->
sim
.
randomFrames
;
gpu
->
sim
.
randoms
=
gpu
->
sim
.
randomFrames
*
gpu
->
sim
.
paddedNumberOfAtoms
-
5
*
GRID
;
gpu
->
sim
.
randoms
=
gpu
->
sim
.
randomFrames
*
gpu
->
sim
.
paddedNumberOfAtoms
-
5
*
GRID
;
gpu
->
sim
.
totalRandoms
=
gpu
->
sim
.
randoms
+
gpu
->
sim
.
paddedNumberOfAtoms
;
gpu
->
sim
.
totalRandoms
=
gpu
->
sim
.
randoms
+
gpu
->
sim
.
paddedNumberOfAtoms
;
...
@@ -1351,8 +1419,8 @@ void* gpuInit(int numAtoms)
...
@@ -1351,8 +1419,8 @@ void* gpuInit(int numAtoms)
{
{
sscanf
(
pAdapter
,
"%d"
,
&
device
);
sscanf
(
pAdapter
,
"%d"
,
&
device
);
}
}
cudaError_t status = cudaSetDevice(device);
//
cudaError_t status = cudaSetDevice(device);
RTERROR(status, "Error setting CUDA device")
//
RTERROR(status, "Error setting CUDA device")
// Determine which core to run on
// Determine which core to run on
#if 0
#if 0
...
@@ -1445,6 +1513,8 @@ void* gpuInit(int numAtoms)
...
@@ -1445,6 +1513,8 @@ void* gpuInit(int numAtoms)
gpu
->
sim
.
probeRadius
=
probeRadius
;
gpu
->
sim
.
probeRadius
=
probeRadius
;
gpu
->
sim
.
surfaceAreaFactor
=
surfaceAreaFactor
;
gpu
->
sim
.
surfaceAreaFactor
=
surfaceAreaFactor
;
gpu
->
sim
.
electricConstant
=
electricConstant
;
gpu
->
sim
.
electricConstant
=
electricConstant
;
gpu
->
sim
.
nonbondedMethod
=
NO_CUTOFF
;
gpu
->
sim
.
nonbondedCutoffSqr
=
0.0
f
;
gpu
->
sim
.
bigFloat
=
99999999.0
f
;
gpu
->
sim
.
bigFloat
=
99999999.0
f
;
gpu
->
sim
.
forceConversionFactor
=
forceConversionFactor
;
gpu
->
sim
.
forceConversionFactor
=
forceConversionFactor
;
...
@@ -1461,6 +1531,7 @@ void* gpuInit(int numAtoms)
...
@@ -1461,6 +1531,7 @@ void* gpuInit(int numAtoms)
gpu
->
bCalculateCM
=
false
;
gpu
->
bCalculateCM
=
false
;
gpu
->
bRemoveCM
=
false
;
gpu
->
bRemoveCM
=
false
;
gpu
->
bRecalculateBornRadii
=
true
;
gpu
->
bRecalculateBornRadii
=
true
;
gpu
->
bIncludeGBSA
=
false
;
gpuInitializeRandoms
(
gpu
);
gpuInitializeRandoms
(
gpu
);
// To be determined later
// To be determined later
...
@@ -1489,19 +1560,21 @@ void* gpuInit(int numAtoms)
...
@@ -1489,19 +1560,21 @@ void* gpuInit(int numAtoms)
gpu
->
psLJ14Parameter
=
NULL
;
gpu
->
psLJ14Parameter
=
NULL
;
gpu
->
psShakeID
=
NULL
;
gpu
->
psShakeID
=
NULL
;
gpu
->
psShakeParameter
=
NULL
;
gpu
->
psShakeParameter
=
NULL
;
gpu
->
psSettleID
=
NULL
;
gpu
->
psSettleParameter
=
NULL
;
gpu
->
psExclusion
=
NULL
;
gpu
->
psExclusion
=
NULL
;
gpu
->
psWorkUnit
=
NULL
;
gpu
->
psWorkUnit
=
NULL
;
gpu
->
psInteractingWorkUnit
=
NULL
;
gpu
->
psInteractionFlag
=
NULL
;
gpu
->
psInteractionCount
=
NULL
;
gpu
->
psGridBoundingBox
=
NULL
;
gpu
->
psGridCenter
=
NULL
;
// Initialize output buffer before reading parameters
// Initialize output buffer before reading parameters
gpu
->
pOutputBufferCounter
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
];
gpu
->
pOutputBufferCounter
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
];
memset
(
gpu
->
pOutputBufferCounter
,
0
,
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
unsigned
int
));
memset
(
gpu
->
pOutputBufferCounter
,
0
,
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
unsigned
int
));
// Initialize exclusion array
gpu->pExclusion = new unsigned int[gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms];
for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms; i++)
gpu->pExclusion[i] = 1;
return
(
void
*
)
gpu
;
return
(
void
*
)
gpu
;
}
}
...
@@ -1586,7 +1659,6 @@ void gpuShutDown(gpuContext gpu)
...
@@ -1586,7 +1659,6 @@ void gpuShutDown(gpuContext gpu)
{
{
// Delete sysmem pointers
// Delete sysmem pointers
delete
[]
gpu
->
pOutputBufferCounter
;
delete
[]
gpu
->
pOutputBufferCounter
;
delete[] gpu->pExclusion;
delete
[]
gpu
->
gpAtomTable
;
delete
[]
gpu
->
gpAtomTable
;
delete
[]
gpu
->
pAtomSymbol
;
delete
[]
gpu
->
pAtomSymbol
;
...
@@ -1620,13 +1692,23 @@ void gpuShutDown(gpuContext gpu)
...
@@ -1620,13 +1692,23 @@ void gpuShutDown(gpuContext gpu)
delete
gpu
->
psLJ14Parameter
;
delete
gpu
->
psLJ14Parameter
;
delete
gpu
->
psShakeID
;
delete
gpu
->
psShakeID
;
delete
gpu
->
psShakeParameter
;
delete
gpu
->
psShakeParameter
;
delete
gpu
->
psSettleID
;
delete
gpu
->
psSettleParameter
;
delete
gpu
->
psExclusion
;
delete
gpu
->
psExclusion
;
delete
gpu
->
psWorkUnit
;
delete
gpu
->
psWorkUnit
;
delete
gpu
->
psInteractingWorkUnit
;
delete
gpu
->
psInteractionFlag
;
delete
gpu
->
psInteractionCount
;
delete
gpu
->
psRandom4
;
delete
gpu
->
psRandom4
;
delete
gpu
->
psRandom2
;
delete
gpu
->
psRandom2
;
delete
gpu
->
psRandomPosition
;
delete
gpu
->
psRandomPosition
;
delete
gpu
->
psRandomSeed
;
delete
gpu
->
psRandomSeed
;
delete
gpu
->
psLinearMomentum
;
delete
gpu
->
psLinearMomentum
;
delete
gpu
->
psAtomIndex
;
delete
gpu
->
psGridBoundingBox
;
delete
gpu
->
psGridCenter
;
if
(
gpu
->
cudpp
!=
0
)
cudppDestroyPlan
(
gpu
->
cudpp
);
// Wrap up
// Wrap up
delete
gpu
;
delete
gpu
;
...
@@ -1636,6 +1718,19 @@ void gpuShutDown(gpuContext gpu)
...
@@ -1636,6 +1718,19 @@ void gpuShutDown(gpuContext gpu)
extern
"C"
extern
"C"
int
gpuBuildOutputBuffers
(
gpuContext
gpu
)
int
gpuBuildOutputBuffers
(
gpuContext
gpu
)
{
{
// Select the number of output buffer to use.
gpu
->
bOutputBufferPerWarp
=
true
;
gpu
->
sim
.
nonbondOutputBuffers
=
gpu
->
sim
.
nonbond_blocks
*
gpu
->
sim
.
nonbond_threads_per_block
/
GRID
;
if
(
gpu
->
sim
.
nonbondOutputBuffers
>=
gpu
->
sim
.
paddedNumberOfAtoms
/
GRID
)
{
// For small systems, it is more efficient to have one output buffer per block of 32 atoms instead of one per warp.
gpu
->
bOutputBufferPerWarp
=
false
;
gpu
->
sim
.
nonbondOutputBuffers
=
gpu
->
sim
.
paddedNumberOfAtoms
/
GRID
;
}
gpu
->
sim
.
totalNonbondOutputBuffers
=
(
gpu
->
bIncludeGBSA
?
2
*
gpu
->
sim
.
nonbondOutputBuffers
:
gpu
->
sim
.
nonbondOutputBuffers
);
gpu
->
sim
.
outputBuffers
=
gpu
->
sim
.
totalNonbondOutputBuffers
;
unsigned
int
outputBuffers
=
gpu
->
sim
.
totalNonbondOutputBuffers
;
unsigned
int
outputBuffers
=
gpu
->
sim
.
totalNonbondOutputBuffers
;
for
(
unsigned
int
i
=
0
;
i
<
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
for
(
unsigned
int
i
=
0
;
i
<
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
{
{
...
@@ -1715,30 +1810,59 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
...
@@ -1715,30 +1810,59 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
const
unsigned
int
grid
=
gpu
->
grid
;
const
unsigned
int
grid
=
gpu
->
grid
;
const
unsigned
int
dim
=
(
atoms
+
(
grid
-
1
))
/
grid
;
const
unsigned
int
dim
=
(
atoms
+
(
grid
-
1
))
/
grid
;
const
unsigned
int
cells
=
dim
*
(
dim
+
1
)
/
2
;
const
unsigned
int
cells
=
dim
*
(
dim
+
1
)
/
2
;
const unsigned int* pExclusion = gpu->pExclusion;
CUDAStream
<
unsigned
int
>*
psWorkUnit
=
new
CUDAStream
<
unsigned
int
>
(
cells
,
1u
);
CUDAStream
<
unsigned
int
>*
psWorkUnit
=
new
CUDAStream
<
unsigned
int
>
(
cells
,
1u
);
unsigned
int
*
pWorkList
=
psWorkUnit
->
_pSysStream
[
0
];
unsigned
int
*
pWorkList
=
psWorkUnit
->
_pSysStream
[
0
];
gpu
->
psWorkUnit
=
psWorkUnit
;
gpu
->
psWorkUnit
=
psWorkUnit
;
gpu
->
sim
.
pWorkUnit
=
psWorkUnit
->
_pDevStream
[
0
];
gpu
->
sim
.
pWorkUnit
=
psWorkUnit
->
_pDevStream
[
0
];
CUDAStream
<
unsigned
int
>*
psInteractingWorkUnit
=
new
CUDAStream
<
unsigned
int
>
(
cells
,
1u
);
gpu
->
psInteractingWorkUnit
=
psInteractingWorkUnit
;
gpu
->
sim
.
pInteractingWorkUnit
=
psInteractingWorkUnit
->
_pDevStream
[
0
];
CUDAStream
<
unsigned
int
>*
psInteractionFlag
=
new
CUDAStream
<
unsigned
int
>
(
cells
,
1u
);
gpu
->
psInteractionFlag
=
psInteractionFlag
;
gpu
->
sim
.
pInteractionFlag
=
psInteractionFlag
->
_pDevStream
[
0
];
CUDAStream
<
size_t
>*
psInteractionCount
=
new
CUDAStream
<
size_t
>
(
1
,
1u
);
gpu
->
psInteractionCount
=
psInteractionCount
;
gpu
->
sim
.
pInteractionCount
=
psInteractionCount
->
_pDevStream
[
0
];
CUDAStream
<
float4
>*
psGridBoundingBox
=
new
CUDAStream
<
float4
>
(
dim
,
1u
);
gpu
->
psGridBoundingBox
=
psGridBoundingBox
;
gpu
->
sim
.
pGridBoundingBox
=
psGridBoundingBox
->
_pDevStream
[
0
];
CUDAStream
<
float4
>*
psGridCenter
=
new
CUDAStream
<
float4
>
(
dim
,
1u
);
gpu
->
psGridCenter
=
psGridCenter
;
gpu
->
sim
.
pGridCenter
=
psGridCenter
->
_pDevStream
[
0
];
gpu
->
sim
.
nonbond_workBlock
=
gpu
->
sim
.
nonbond_threads_per_block
/
GRID
;
gpu
->
sim
.
nonbond_workBlock
=
gpu
->
sim
.
nonbond_threads_per_block
/
GRID
;
gpu
->
sim
.
bornForce2_workBlock
=
gpu
->
sim
.
bornForce2_threads_per_block
/
GRID
;
gpu
->
sim
.
bornForce2_workBlock
=
gpu
->
sim
.
bornForce2_threads_per_block
/
GRID
;
gpu
->
sim
.
workUnits
=
cells
;
gpu
->
sim
.
workUnits
=
cells
;
// In
crease block count if necessary for extra large molecules that would
// In
itialize the CUDPP workspace.
// otherwise overflow the SM workunit buffers
gpu
->
cudpp
=
0
;
int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM
;
CUDPPConfiguration
config
;
if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
config
.
datatype
=
CUDPP_UINT
;
{
config
.
algorithm
=
CUDPP_COMPACT
;
gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks)
;
config
.
options
=
CUDPP_OPTION_FORWARD
;
}
CUDPPResult
result
=
cudppPlan
(
&
gpu
->
cudpp
,
config
,
cells
,
1
,
0
);
if (
(int) gpu->sim.bornForce2_blocks < minimumBlocks
)
if
(
CUDPP_SUCCESS
!=
result
)
{
{
gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
printf
(
"Error initializing CUDPP: %d
\n
"
,
result
);
exit
(
-
1
);
}
}
// Increase block count if necessary for extra large molecules that would
// otherwise overflow the SM workunit buffers
// int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
// if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
// {
// gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
// }
// if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
// {
// gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
// }
gpu
->
sim
.
nbWorkUnitsPerBlock
=
cells
/
gpu
->
sim
.
nonbond_blocks
;
gpu
->
sim
.
nbWorkUnitsPerBlock
=
cells
/
gpu
->
sim
.
nonbond_blocks
;
gpu
->
sim
.
nbWorkUnitsPerBlockRemainder
=
cells
-
gpu
->
sim
.
nonbond_blocks
*
gpu
->
sim
.
nbWorkUnitsPerBlock
;
gpu
->
sim
.
nbWorkUnitsPerBlockRemainder
=
cells
-
gpu
->
sim
.
nonbond_blocks
*
gpu
->
sim
.
nbWorkUnitsPerBlock
;
gpu
->
sim
.
bf2WorkUnitsPerBlock
=
cells
/
gpu
->
sim
.
bornForce2_blocks
;
gpu
->
sim
.
bf2WorkUnitsPerBlock
=
cells
/
gpu
->
sim
.
bornForce2_blocks
;
gpu
->
sim
.
bf2WorkUnitsPerBlockRemainder
=
cells
-
gpu
->
sim
.
bornForce2_blocks
*
gpu
->
sim
.
bf2WorkUnitsPerBlock
;
gpu
->
sim
.
bf2WorkUnitsPerBlockRemainder
=
cells
-
gpu
->
sim
.
bornForce2_blocks
*
gpu
->
sim
.
bf2WorkUnitsPerBlock
;
gpu
->
sim
.
interaction_threads_per_block
=
64
;
gpu
->
sim
.
interaction_blocks
=
(
gpu
->
sim
.
workUnits
+
gpu
->
sim
.
interaction_threads_per_block
-
1
)
/
gpu
->
sim
.
interaction_threads_per_block
;
// Decrease thread count for extra small molecules to spread computation
// Decrease thread count for extra small molecules to spread computation
// across entire chip
// across entire chip
...
@@ -1763,23 +1887,6 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
...
@@ -1763,23 +1887,6 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
for
(
unsigned
int
x
=
y
;
x
<
dim
;
x
++
)
for
(
unsigned
int
x
=
y
;
x
<
dim
;
x
++
)
{
{
pWorkList
[
count
]
=
(
x
<<
17
)
|
(
y
<<
2
);
pWorkList
[
count
]
=
(
x
<<
17
)
|
(
y
<<
2
);
// Check for exclusions
int exclusions = 0;
for (unsigned int i = y * grid; i < y * grid + grid; i++)
{
for (unsigned int j = x * grid; j < x * grid + grid; j++)
{
if (!pExclusion[i * atoms + j])
{
exclusions++;
}
}
}
// Signal exclusions if they exist
if (exclusions > 0)
pWorkList[count] |= 0x1;
count
++
;
count
++
;
}
}
}
}
...
@@ -1790,42 +1897,58 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
...
@@ -1790,42 +1897,58 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
}
}
extern
"C"
extern
"C"
int
gpuBuildExclusionList(gpuContext gpu)
void
gpuBuildExclusionList
(
gpuContext
gpu
)
{
{
unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
const
unsigned
int
atoms
=
gpu
->
sim
.
paddedNumberOfAtoms
;
CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u);
const
unsigned
int
grid
=
gpu
->
grid
;
const
unsigned
int
dim
=
(
atoms
+
(
grid
-
1
))
/
grid
;
CUDAStream
<
unsigned
int
>*
psExclusion
=
new
CUDAStream
<
unsigned
int
>
((
atoms
*
atoms
+
grid
-
1
)
/
grid
,
1u
);
gpu
->
psExclusion
=
psExclusion
;
gpu
->
psExclusion
=
psExclusion
;
gpu
->
sim
.
pExclusion
=
psExclusion
->
_pDevStream
[
0
];
gpu
->
sim
.
pExclusion
=
psExclusion
->
_pDevStream
[
0
];
unsigned
int
*
pExList
=
psExclusion
->
_pSysStream
[
0
];
unsigned
int
*
pExList
=
psExclusion
->
_pSysStream
[
0
];
int exclusions = 0;
unsigned
int
*
pWorkList
=
gpu
->
psWorkUnit
->
_pSysStream
[
0
];
unsigned int pos = 0;
for
(
int
i
=
0
;
i
<
psExclusion
->
_length
;
++
i
)
pExList
[
i
]
=
0xFFFFFFFF
;
for (unsigned int x = 0; x < atoms; x += gpu->grid)
// Fill in the exclusions.
{
for (unsigned int y = 0; y < atoms; y += gpu->grid)
for
(
int
atom1
=
0
;
atom1
<
gpu
->
exclusions
.
size
();
++
atom1
)
{
for (unsigned x1 = x; x1 < x + gpu->grid; x1++)
{
unsigned int mask = 0;
for (unsigned int y1 = y ; y1 < y + gpu->grid; y1++)
{
{
mask >>= 1;
int
x
=
atom1
/
grid
;
if (gpu->pExclusion[x1 * atoms + y1] == 0)
int
offset
=
atom1
-
x
*
grid
;
for
(
int
j
=
0
;
j
<
gpu
->
exclusions
[
atom1
].
size
();
++
j
)
{
{
if (x1 >= y1)
int
atom2
=
gpu
->
exclusions
[
atom1
][
j
];
exclusions++;
int
y
=
atom2
/
grid
;
}
int
index
=
x
*
atoms
+
y
*
grid
+
offset
;
else
pExList
[
index
]
&=
0xFFFFFFFF
-
(
1
<<
(
atom2
-
y
*
grid
));
mask |= 0x80000000;
int
cell
=
(
x
>
y
?
x
+
y
*
dim
-
y
*
(
y
+
1
)
/
2
:
y
+
x
*
dim
-
x
*
(
x
+
1
)
/
2
);
pWorkList
[
cell
]
|=
1
;
}
}
pExList[pos++] = mask;
}
}
// Mark all interactions that involve a padding atom as being excluded.
for
(
int
atom1
=
gpu
->
natoms
;
atom1
<
atoms
;
++
atom1
)
{
int
x
=
atom1
/
grid
;
int
offset1
=
atom1
-
x
*
grid
;
for
(
int
atom2
=
0
;
atom2
<
atoms
;
++
atom2
)
{
int
y
=
atom2
/
grid
;
int
index
=
x
*
atoms
+
y
*
grid
+
offset1
;
pExList
[
index
]
&=
0xFFFFFFFF
-
(
1
<<
(
atom2
-
y
*
grid
));
int
offset2
=
atom2
-
y
*
grid
;
index
=
y
*
atoms
+
x
*
grid
+
offset2
;
pExList
[
index
]
&=
0xFFFFFFFF
-
(
1
<<
(
atom1
-
x
*
grid
));
int
cell
=
(
x
>
y
?
x
+
y
*
dim
-
y
*
(
y
+
1
)
/
2
:
y
+
x
*
dim
-
x
*
(
x
+
1
)
/
2
);
pWorkList
[
cell
]
|=
1
;
}
}
}
}
psExclusion
->
Upload
();
psExclusion
->
Upload
();
gpu
->
psWorkUnit
->
Upload
();
gpuSetConstants
(
gpu
);
gpuSetConstants
(
gpu
);
return exclusions;
}
}
extern
"C"
extern
"C"
...
@@ -1842,14 +1965,12 @@ int gpuSetConstants(gpuContext gpu)
...
@@ -1842,14 +1965,12 @@ int gpuSetConstants(gpuContext gpu)
SetUpdateShakeHSim
(
gpu
);
SetUpdateShakeHSim
(
gpu
);
SetVerletUpdateSim
(
gpu
);
SetVerletUpdateSim
(
gpu
);
SetBrownianUpdateSim
(
gpu
);
SetBrownianUpdateSim
(
gpu
);
SetSettleSim
(
gpu
);
SetRandomSim
(
gpu
);
SetRandomSim
(
gpu
);
if
(
gpu
->
sm_version
>=
SM_12
)
if
(
gpu
->
sm_version
>=
SM_12
)
{
{
SetCalculateCDLJForces_12Sim(gpu);
SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
SetCalculateObcGbsaForces1_12Sim
(
gpu
);
SetCalculateObcGbsaForces1_12Sim
(
gpu
);
SetCalculateObcGbsaForces2_12Sim(gpu);
}
}
return
1
;
return
1
;
...
@@ -2705,3 +2826,329 @@ void gpuDumpObcLoop1(gpuContext gpu)
...
@@ -2705,3 +2826,329 @@ void gpuDumpObcLoop1(gpuContext gpu)
);
);
}
}
}
}
static
void
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
vector
<
int
>&
atomMolecule
,
vector
<
vector
<
int
>
>&
atomBonds
)
{
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule
[
atom
]
=
molecule
;
for
(
int
i
=
0
;
i
<
atomBonds
[
atom
].
size
();
i
++
)
if
(
atomMolecule
[
atomBonds
[
atom
][
i
]]
==
-
1
)
tagAtomsInMolecule
(
atomBonds
[
atom
][
i
],
molecule
,
atomMolecule
,
atomBonds
);
}
static
void
findMoleculeGroups
(
gpuContext
gpu
)
{
// First make a list of constraints for future use.
vector
<
Constraint
>
constraints
;
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
ShakeConstraints
;
i
++
)
{
int
atom1
=
gpu
->
psShakeID
->
_pSysData
[
i
].
x
;
int
atom2
=
gpu
->
psShakeID
->
_pSysData
[
i
].
y
;
int
atom3
=
gpu
->
psShakeID
->
_pSysData
[
i
].
z
;
int
atom4
=
gpu
->
psShakeID
->
_pSysData
[
i
].
w
;
float
distance2
=
gpu
->
psShakeParameter
->
_pSysData
[
i
].
z
;
constraints
.
push_back
(
Constraint
(
atom1
,
atom2
,
distance2
));
if
(
atom3
!=
-
1
)
constraints
.
push_back
(
Constraint
(
atom1
,
atom3
,
distance2
));
if
(
atom4
!=
-
1
)
constraints
.
push_back
(
Constraint
(
atom1
,
atom3
,
distance2
));
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
settleConstraints
;
i
++
)
{
int
atom1
=
gpu
->
psSettleID
->
_pSysData
[
i
].
x
;
int
atom2
=
gpu
->
psSettleID
->
_pSysData
[
i
].
y
;
int
atom3
=
gpu
->
psSettleID
->
_pSysData
[
i
].
z
;
float
distance12
=
gpu
->
psSettleParameter
->
_pSysData
[
i
].
x
;
float
distance23
=
gpu
->
psSettleParameter
->
_pSysData
[
i
].
y
;
constraints
.
push_back
(
Constraint
(
atom1
,
atom2
,
distance12
*
distance12
));
constraints
.
push_back
(
Constraint
(
atom1
,
atom3
,
distance12
*
distance12
));
constraints
.
push_back
(
Constraint
(
atom2
,
atom3
,
distance23
*
distance23
));
}
// First make a list of every other atom to which each atom is connect by a bond or constraint.
int
numAtoms
=
gpu
->
natoms
;
vector
<
vector
<
int
>
>
atomBonds
(
numAtoms
);
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
bonds
;
i
++
)
{
int
atom1
=
gpu
->
psBondID
->
_pSysData
[
i
].
x
;
int
atom2
=
gpu
->
psBondID
->
_pSysData
[
i
].
y
;
atomBonds
[
atom1
].
push_back
(
atom2
);
atomBonds
[
atom2
].
push_back
(
atom1
);
}
for
(
int
i
=
0
;
i
<
constraints
.
size
();
i
++
)
{
int
atom1
=
constraints
[
i
].
atom1
;
int
atom2
=
constraints
[
i
].
atom2
;
atomBonds
[
atom1
].
push_back
(
atom2
);
atomBonds
[
atom2
].
push_back
(
atom1
);
}
// Now tag atoms by which molecule they belong to.
vector
<
int
>
atomMolecule
(
numAtoms
,
-
1
);
int
numMolecules
=
0
;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
if
(
atomMolecule
[
i
]
==
-
1
)
tagAtomsInMolecule
(
i
,
numMolecules
++
,
atomMolecule
,
atomBonds
);
vector
<
vector
<
int
>
>
atomIndices
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
atomIndices
[
atomMolecule
[
i
]].
push_back
(
i
);
// Construct a description of each molecule.
vector
<
Molecule
>
molecules
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
molecules
[
i
].
atoms
=
atomIndices
[
i
];
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
bonds
;
i
++
)
{
int
atom1
=
gpu
->
psBondID
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
bonds
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
bond_angles
;
i
++
)
{
int
atom1
=
gpu
->
psBondAngleID1
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
angles
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
dihedrals
;
i
++
)
{
int
atom1
=
gpu
->
psDihedralID1
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
periodicTorsions
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
gpu
->
sim
.
rb_dihedrals
;
i
++
)
{
int
atom1
=
gpu
->
psRbDihedralID1
->
_pSysData
[
i
].
x
;
molecules
[
atomMolecule
[
atom1
]].
rbTorsions
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
constraints
.
size
();
i
++
)
{
molecules
[
atomMolecule
[
constraints
[
i
].
atom1
]].
constraints
.
push_back
(
i
);
}
// Sort them into groups of identical molecules.
vector
<
Molecule
>
uniqueMolecules
;
vector
<
vector
<
int
>
>
moleculeInstances
;
for
(
int
molIndex
=
0
;
molIndex
<
molecules
.
size
();
molIndex
++
)
{
Molecule
&
mol
=
molecules
[
molIndex
];
// See if it is identical to another molecule.
bool
isNew
=
true
;
for
(
int
j
=
0
;
j
<
uniqueMolecules
.
size
()
&&
isNew
;
j
++
)
{
Molecule
&
mol2
=
uniqueMolecules
[
j
];
bool
identical
=
true
;
if
(
mol
.
atoms
.
size
()
!=
mol2
.
atoms
.
size
()
||
mol
.
bonds
.
size
()
!=
mol2
.
bonds
.
size
()
||
mol
.
angles
.
size
()
!=
mol2
.
angles
.
size
()
||
mol
.
periodicTorsions
.
size
()
!=
mol2
.
periodicTorsions
.
size
()
||
mol
.
rbTorsions
.
size
()
!=
mol2
.
rbTorsions
.
size
()
||
mol
.
constraints
.
size
()
!=
mol2
.
constraints
.
size
())
identical
=
false
;
int
atomOffset
=
mol2
.
atoms
[
0
]
-
mol
.
atoms
[
0
];
float4
*
posq
=
gpu
->
psPosq4
->
_pSysData
;
float4
*
velm
=
gpu
->
psVelm4
->
_pSysData
;
float2
*
sigeps
=
gpu
->
psSigEps2
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
atoms
.
size
()
&&
identical
;
i
++
)
if
(
mol
.
atoms
[
i
]
!=
mol2
.
atoms
[
i
]
-
atomOffset
||
posq
[
mol
.
atoms
[
i
]].
w
!=
posq
[
mol2
.
atoms
[
i
]].
w
||
velm
[
mol
.
atoms
[
i
]].
w
!=
velm
[
mol2
.
atoms
[
i
]].
w
||
sigeps
[
mol
.
atoms
[
i
]].
x
!=
sigeps
[
mol2
.
atoms
[
i
]].
x
||
sigeps
[
mol
.
atoms
[
i
]].
y
!=
sigeps
[
mol2
.
atoms
[
i
]].
y
)
identical
=
false
;
int4
*
bondID
=
gpu
->
psBondID
->
_pSysData
;
float2
*
bondParam
=
gpu
->
psBondParameter
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
bonds
.
size
()
&&
identical
;
i
++
)
if
(
bondID
[
mol
.
bonds
[
i
]].
x
!=
bondID
[
mol2
.
bonds
[
i
]].
x
-
atomOffset
||
bondID
[
mol
.
bonds
[
i
]].
y
!=
bondID
[
mol2
.
bonds
[
i
]].
y
-
atomOffset
||
bondParam
[
mol
.
bonds
[
i
]].
x
!=
bondParam
[
mol2
.
bonds
[
i
]].
x
||
bondParam
[
mol
.
bonds
[
i
]].
y
!=
bondParam
[
mol2
.
bonds
[
i
]].
y
)
identical
=
false
;
int4
*
angleID
=
gpu
->
psBondAngleID1
->
_pSysData
;
float2
*
angleParam
=
gpu
->
psBondAngleParameter
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
angles
.
size
()
&&
identical
;
i
++
)
if
(
angleID
[
mol
.
angles
[
i
]].
x
!=
angleID
[
mol2
.
angles
[
i
]].
x
-
atomOffset
||
angleID
[
mol
.
angles
[
i
]].
y
!=
angleID
[
mol2
.
angles
[
i
]].
y
-
atomOffset
||
angleID
[
mol
.
angles
[
i
]].
z
!=
angleID
[
mol2
.
angles
[
i
]].
z
-
atomOffset
||
angleParam
[
mol
.
angles
[
i
]].
x
!=
angleParam
[
mol2
.
angles
[
i
]].
x
||
angleParam
[
mol
.
angles
[
i
]].
y
!=
angleParam
[
mol2
.
angles
[
i
]].
y
)
identical
=
false
;
int4
*
periodicID
=
gpu
->
psDihedralID1
->
_pSysData
;
float4
*
periodicParam
=
gpu
->
psDihedralParameter
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
periodicTorsions
.
size
()
&&
identical
;
i
++
)
if
(
periodicID
[
mol
.
periodicTorsions
[
i
]].
x
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
x
-
atomOffset
||
periodicID
[
mol
.
periodicTorsions
[
i
]].
y
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
y
-
atomOffset
||
periodicID
[
mol
.
periodicTorsions
[
i
]].
z
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
z
-
atomOffset
||
periodicID
[
mol
.
periodicTorsions
[
i
]].
w
!=
periodicID
[
mol2
.
periodicTorsions
[
i
]].
w
-
atomOffset
||
periodicParam
[
mol
.
periodicTorsions
[
i
]].
x
!=
periodicParam
[
mol2
.
periodicTorsions
[
i
]].
x
||
periodicParam
[
mol
.
periodicTorsions
[
i
]].
y
!=
periodicParam
[
mol2
.
periodicTorsions
[
i
]].
y
||
periodicParam
[
mol
.
periodicTorsions
[
i
]].
z
!=
periodicParam
[
mol2
.
periodicTorsions
[
i
]].
z
)
identical
=
false
;
int4
*
rbID
=
gpu
->
psRbDihedralID1
->
_pSysData
;
float4
*
rbParam1
=
gpu
->
psRbDihedralParameter1
->
_pSysData
;
float2
*
rbParam2
=
gpu
->
psRbDihedralParameter2
->
_pSysData
;
for
(
int
i
=
0
;
i
<
mol
.
rbTorsions
.
size
()
&&
identical
;
i
++
)
if
(
rbID
[
mol
.
rbTorsions
[
i
]].
x
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
x
-
atomOffset
||
rbID
[
mol
.
rbTorsions
[
i
]].
y
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
y
-
atomOffset
||
rbID
[
mol
.
rbTorsions
[
i
]].
z
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
z
-
atomOffset
||
rbID
[
mol
.
rbTorsions
[
i
]].
w
!=
rbID
[
mol2
.
rbTorsions
[
i
]].
w
-
atomOffset
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
x
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
x
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
y
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
y
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
z
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
z
||
rbParam1
[
mol
.
rbTorsions
[
i
]].
w
!=
rbParam1
[
mol2
.
rbTorsions
[
i
]].
w
||
rbParam2
[
mol
.
rbTorsions
[
i
]].
x
!=
rbParam2
[
mol2
.
rbTorsions
[
i
]].
x
||
rbParam2
[
mol
.
rbTorsions
[
i
]].
y
!=
rbParam2
[
mol2
.
rbTorsions
[
i
]].
y
)
identical
=
false
;
for
(
int
i
=
0
;
i
<
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
if
(
constraints
[
mol
.
constraints
[
i
]].
atom1
!=
constraints
[
mol2
.
constraints
[
i
]].
atom1
-
atomOffset
||
constraints
[
mol
.
constraints
[
i
]].
atom2
!=
constraints
[
mol2
.
constraints
[
i
]].
atom2
-
atomOffset
||
constraints
[
mol
.
constraints
[
i
]].
distance2
!=
constraints
[
mol2
.
constraints
[
i
]].
distance2
)
identical
=
false
;
if
(
identical
)
{
moleculeInstances
[
j
].
push_back
(
mol
.
atoms
[
0
]);
isNew
=
false
;
}
}
if
(
isNew
)
{
uniqueMolecules
.
push_back
(
mol
);
moleculeInstances
.
push_back
(
vector
<
int
>
());
moleculeInstances
[
moleculeInstances
.
size
()
-
1
].
push_back
(
mol
.
atoms
[
0
]);
}
}
gpu
->
moleculeGroups
.
resize
(
moleculeInstances
.
size
());
for
(
int
i
=
0
;
i
<
moleculeInstances
.
size
();
i
++
)
{
gpu
->
moleculeGroups
[
i
].
instances
=
moleculeInstances
[
i
];
vector
<
int
>&
atoms
=
uniqueMolecules
[
i
].
atoms
;
gpu
->
moleculeGroups
[
i
].
atoms
.
resize
(
atoms
.
size
());
for
(
int
j
=
0
;
j
<
atoms
.
size
();
j
++
)
gpu
->
moleculeGroups
[
i
].
atoms
[
j
]
=
atoms
[
j
]
-
atoms
[
0
];
}
}
extern
"C"
void
gpuReorderAtoms
(
gpuContext
gpu
)
{
if
(
gpu
->
natoms
==
0
||
gpu
->
sim
.
nonbondedCutoffSqr
==
0.0
)
return
;
if
(
gpu
->
moleculeGroups
.
size
()
==
0
)
findMoleculeGroups
(
gpu
);
// Find the range of positions and the number of bins along each axis.
int
numAtoms
=
gpu
->
natoms
;
gpu
->
psPosq4
->
Download
();
gpu
->
psVelm4
->
Download
();
float4
*
posq
=
gpu
->
psPosq4
->
_pSysData
;
float4
*
velm
=
gpu
->
psVelm4
->
_pSysData
;
float
minx
=
posq
[
0
].
x
,
maxx
=
posq
[
0
].
x
;
float
miny
=
posq
[
0
].
y
,
maxy
=
posq
[
0
].
y
;
float
minz
=
posq
[
0
].
z
,
maxz
=
posq
[
0
].
z
;
if
(
gpu
->
sim
.
nonbondedMethod
==
PERIODIC
)
{
minx
=
miny
=
minz
=
0.0
;
maxx
=
gpu
->
sim
.
periodicBoxSizeX
;
maxy
=
gpu
->
sim
.
periodicBoxSizeY
;
maxz
=
gpu
->
sim
.
periodicBoxSizeZ
;
}
else
{
for
(
int
i
=
1
;
i
<
numAtoms
;
i
++
)
{
minx
=
min
(
minx
,
posq
[
i
].
x
);
maxx
=
max
(
maxx
,
posq
[
i
].
x
);
miny
=
min
(
miny
,
posq
[
i
].
y
);
maxy
=
max
(
maxy
,
posq
[
i
].
y
);
minz
=
min
(
minz
,
posq
[
i
].
z
);
maxz
=
max
(
maxz
,
posq
[
i
].
z
);
}
}
float
binWidth
=
0.2
*
sqrt
(
gpu
->
sim
.
nonbondedCutoffSqr
);
int
xbins
=
1
+
(
int
)
((
maxx
-
minx
)
/
binWidth
);
int
ybins
=
1
+
(
int
)
((
maxy
-
miny
)
/
binWidth
);
int
zbins
=
1
+
(
int
)
((
maxz
-
minz
)
/
binWidth
);
// Loop over each group of identical molecules and reorder them.
vector
<
int
>
originalIndex
(
numAtoms
);
vector
<
float4
>
newPosq
(
numAtoms
);
vector
<
float4
>
newVelm
(
numAtoms
);
for
(
int
group
=
0
;
group
<
gpu
->
moleculeGroups
.
size
();
group
++
)
{
// Find the center of each molecule.
gpuMoleculeGroup
&
mol
=
gpu
->
moleculeGroups
[
group
];
int
numMolecules
=
mol
.
instances
.
size
();
vector
<
int
>&
atoms
=
mol
.
atoms
;
vector
<
float3
>
molPos
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
molPos
[
i
].
x
=
0.0
f
;
molPos
[
i
].
y
=
0.0
f
;
molPos
[
i
].
z
=
0.0
f
;
for
(
int
j
=
0
;
j
<
atoms
.
size
();
j
++
)
{
int
atom
=
atoms
[
j
]
+
mol
.
instances
[
i
];
molPos
[
i
].
x
+=
posq
[
atom
].
x
;
molPos
[
i
].
y
+=
posq
[
atom
].
y
;
molPos
[
i
].
z
+=
posq
[
atom
].
z
;
}
molPos
[
i
].
x
/=
atoms
.
size
();
molPos
[
i
].
y
/=
atoms
.
size
();
molPos
[
i
].
z
/=
atoms
.
size
();
}
if
(
gpu
->
sim
.
nonbondedMethod
==
PERIODIC
)
{
// Move each molecule position into the same box.
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
molPos
[
i
].
x
-=
floor
(
molPos
[
i
].
x
/
gpu
->
sim
.
periodicBoxSizeX
)
*
gpu
->
sim
.
periodicBoxSizeX
;
molPos
[
i
].
y
-=
floor
(
molPos
[
i
].
y
/
gpu
->
sim
.
periodicBoxSizeY
)
*
gpu
->
sim
.
periodicBoxSizeY
;
molPos
[
i
].
z
-=
floor
(
molPos
[
i
].
z
/
gpu
->
sim
.
periodicBoxSizeZ
)
*
gpu
->
sim
.
periodicBoxSizeZ
;
}
}
// Select a bin for each molecule, then sort them by bin.
vector
<
pair
<
int
,
int
>
>
molBins
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
int
x
=
(
int
)
((
molPos
[
i
].
x
-
minx
)
/
binWidth
);
int
y
=
(
int
)
((
molPos
[
i
].
y
-
miny
)
/
binWidth
);
int
z
=
(
int
)
((
molPos
[
i
].
z
-
minz
)
/
binWidth
);
int
yodd
=
y
&
1
;
int
zodd
=
z
&
1
;
int
bin
=
z
*
xbins
*
ybins
;
bin
+=
(
zodd
?
ybins
-
y
:
y
)
*
xbins
;
bin
+=
(
yodd
?
xbins
-
x
:
x
);
molBins
[
i
]
=
pair
<
int
,
int
>
(
bin
,
i
);
}
sort
(
molBins
.
begin
(),
molBins
.
end
());
// Reorder the atoms.
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
for
(
int
j
=
0
;
j
<
atoms
.
size
();
j
++
)
{
int
oldIndex
=
mol
.
instances
[
molBins
[
i
].
second
]
+
atoms
[
j
];
int
newIndex
=
mol
.
instances
[
i
]
+
atoms
[
j
];
originalIndex
[
newIndex
]
=
gpu
->
psAtomIndex
->
_pSysStream
[
0
][
oldIndex
];
newPosq
[
newIndex
]
=
posq
[
oldIndex
];
newVelm
[
newIndex
]
=
velm
[
oldIndex
];
}
}
}
// Update the streams.
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
posq
[
i
]
=
newPosq
[
i
];
gpu
->
psPosq4
->
Upload
();
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
velm
[
i
]
=
newVelm
[
i
];
gpu
->
psVelm4
->
Upload
();
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
gpu
->
psAtomIndex
->
_pSysData
[
i
]
=
originalIndex
[
i
];
gpu
->
psAtomIndex
->
Upload
();
}
platforms/cuda/src/kernels/gputypes.h
View file @
1010df33
...
@@ -33,14 +33,20 @@
...
@@ -33,14 +33,20 @@
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include "cudatypes.h"
#include "cudpp.h"
#include <vector>
#include <vector>
struct
gpuAtomType
{
struct
gpuAtomType
{
string
name
;
std
::
string
name
;
char
symbol
;
char
symbol
;
float
r
;
float
r
;
};
};
struct
gpuMoleculeGroup
{
std
::
vector
<
int
>
atoms
;
std
::
vector
<
int
>
instances
;
};
enum
SM_VERSION
enum
SM_VERSION
{
{
SM_10
,
SM_10
,
...
@@ -61,8 +67,9 @@ struct _gpuContext {
...
@@ -61,8 +67,9 @@ struct _gpuContext {
int
gAtomTypes
;
int
gAtomTypes
;
cudaGmxSimulation
sim
;
cudaGmxSimulation
sim
;
unsigned
int
*
pOutputBufferCounter
;
unsigned
int
*
pOutputBufferCounter
;
unsigned
int
*
pE
xclusion
;
std
::
vector
<
std
::
vector
<
int
>
>
e
xclusion
s
;
unsigned
char
*
pAtomSymbol
;
unsigned
char
*
pAtomSymbol
;
std
::
vector
<
gpuMoleculeGroup
>
moleculeGroups
;
float
iterations
;
float
iterations
;
float
epsfac
;
float
epsfac
;
float
solventDielectric
;
float
solventDielectric
;
...
@@ -71,8 +78,11 @@ struct _gpuContext {
...
@@ -71,8 +78,11 @@ struct _gpuContext {
bool
bCalculateCM
;
bool
bCalculateCM
;
bool
bRemoveCM
;
bool
bRemoveCM
;
bool
bRecalculateBornRadii
;
bool
bRecalculateBornRadii
;
bool
bOutputBufferPerWarp
;
bool
bIncludeGBSA
;
unsigned
long
seed
;
unsigned
long
seed
;
SM_VERSION
sm_version
;
SM_VERSION
sm_version
;
CUDPPHandle
cudpp
;
CUDAStream
<
float4
>*
psPosq4
;
CUDAStream
<
float4
>*
psPosq4
;
CUDAStream
<
float4
>*
psPosqP4
;
CUDAStream
<
float4
>*
psPosqP4
;
CUDAStream
<
float4
>*
psOldPosq4
;
CUDAStream
<
float4
>*
psOldPosq4
;
...
@@ -103,15 +113,21 @@ struct _gpuContext {
...
@@ -103,15 +113,21 @@ struct _gpuContext {
CUDAStream
<
int
>*
psNonShakeID
;
CUDAStream
<
int
>*
psNonShakeID
;
CUDAStream
<
int4
>*
psShakeID
;
CUDAStream
<
int4
>*
psShakeID
;
CUDAStream
<
float4
>*
psShakeParameter
;
CUDAStream
<
float4
>*
psShakeParameter
;
CUDAStream
<
int4
>*
psSettleID
;
CUDAStream
<
float2
>*
psSettleParameter
;
CUDAStream
<
unsigned
int
>*
psExclusion
;
CUDAStream
<
unsigned
int
>*
psExclusion
;
CUDAStream
<
unsigned
int
>*
psWorkUnit
;
CUDAStream
<
unsigned
int
>*
psWorkUnit
;
CUDAStream
<
unsigned
int
>*
psInteractingWorkUnit
;
CUDAStream
<
unsigned
int
>*
psInteractionFlag
;
CUDAStream
<
size_t
>*
psInteractionCount
;
CUDAStream
<
float4
>*
psRandom4
;
// Pointer to sets of 4 random numbers for MD integration
CUDAStream
<
float4
>*
psRandom4
;
// Pointer to sets of 4 random numbers for MD integration
CUDAStream
<
float2
>*
psRandom2
;
// Pointer to sets of 2 random numbers for MD integration
CUDAStream
<
float2
>*
psRandom2
;
// Pointer to sets of 2 random numbers for MD integration
CUDAStream
<
uint4
>*
psRandomSeed
;
// Pointer to each random seed
CUDAStream
<
uint4
>*
psRandomSeed
;
// Pointer to each random seed
CUDAStream
<
int
>*
psRandomPosition
;
// Pointer to random number positions
CUDAStream
<
int
>*
psRandomPosition
;
// Pointer to random number positions
CUDAStream
<
float4
>*
psLinearMomentum
;
// Pointer to total linear momentum per CTA
CUDAStream
<
float4
>*
psLinearMomentum
;
// Pointer to total linear momentum per CTA
CUDAStream
<
int
>*
psAtomIndex
;
// The original index of each atom
CUDAStream
<
float4
>*
psGridBoundingBox
;
// The size of each grid cell
CUDAStream
<
float4
>*
psGridCenter
;
// The center and radius for each grid cell
};
};
typedef
struct
_gpuContext
*
gpuContext
;
typedef
struct
_gpuContext
*
gpuContext
;
...
@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
...
@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q1
,
const
std
::
vector
<
float
>&
q2
);
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q1
,
const
std
::
vector
<
float
>&
q2
);
extern
"C"
extern
"C"
float
gpuGetAtomicRadius
(
gpuContext
gpu
,
string
s
);
float
gpuGetAtomicRadius
(
gpuContext
gpu
,
std
::
string
s
);
extern
"C"
extern
"C"
unsigned
char
gpuGetAtomicSymbol
(
gpuContext
gpu
,
string
s
);
unsigned
char
gpuGetAtomicSymbol
(
gpuContext
gpu
,
std
::
string
s
);
extern
"C"
extern
"C"
int
gpuReadAtomicParameters
(
gpuContext
gpu
,
char
*
fname
);
int
gpuReadAtomicParameters
(
gpuContext
gpu
,
char
*
fname
);
...
@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
...
@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern
"C"
extern
"C"
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q
,
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q
,
const
std
::
vector
<
char
>&
symbol
,
const
std
::
vector
<
vector
<
int
>
>&
exclusions
);
const
std
::
vector
<
char
>&
symbol
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusions
,
CudaNonbondedMethod
method
);
extern
"C"
void
gpuSetNonbondedCutoff
(
gpuContext
gpu
,
float
cutoffDistance
,
float
solventDielectric
);
extern
"C"
void
gpuSetPeriodicBoxSize
(
gpuContext
gpu
,
float
xsize
,
float
ysize
,
float
zsize
);
extern
"C"
extern
"C"
void
gpuSetObcParameters
(
gpuContext
gpu
,
float
innerDielectric
,
float
solventDielectric
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
radius
,
const
std
::
vector
<
float
>&
scale
);
void
gpuSetObcParameters
(
gpuContext
gpu
,
float
innerDielectric
,
float
solventDielectric
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
radius
,
const
std
::
vector
<
float
>&
scale
);
...
@@ -227,7 +249,7 @@ extern "C"
...
@@ -227,7 +249,7 @@ extern "C"
int
gpuBuildThreadBlockWorkList
(
gpuContext
gpu
);
int
gpuBuildThreadBlockWorkList
(
gpuContext
gpu
);
extern
"C"
extern
"C"
int
gpuBuildExclusionList
(
gpuContext
gpu
);
void
gpuBuildExclusionList
(
gpuContext
gpu
);
extern
"C"
extern
"C"
int
gpuSetConstants
(
gpuContext
gpu
);
int
gpuSetConstants
(
gpuContext
gpu
);
...
@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
...
@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
extern
"C"
extern
"C"
void
gpuDumpObcLoop1
(
gpuContext
gpu
);
void
gpuDumpObcLoop1
(
gpuContext
gpu
);
extern
"C"
void
gpuReorderAtoms
(
gpuContext
gpu
);
#endif //__GPUTYPES_H__
#endif //__GPUTYPES_H__
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
View file @
1010df33
...
@@ -54,15 +54,8 @@ struct Atom {
...
@@ -54,15 +54,8 @@ struct Atom {
float
fx
;
float
fx
;
float
fy
;
float
fy
;
float
fz
;
float
fz
;
float
eps2
;
float
sig2
;
};
};
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
)
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
)
...
@@ -79,310 +72,102 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
...
@@ -79,310 +72,102 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
__global__
void
kCalculateCDLJForces_kernel
()
// Include versions of the kernels for N^2 calculations.
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
#define METHOD_NAME(a, b) a##N2##b
// into shared memory
#include "kCalculateCDLJForces.h"
end
=
end
-
pos
;
#define USE_OUTPUT_BUFFER_PER_WARP
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
while
(
pos
>=
0
)
// Include versions of the kernels with cutoffs.
{
// Extract cell coordinates from appropriate work unit
#undef METHOD_NAME
unsigned
int
x
=
sWorkUnit
[
pos
];
#undef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
#define USE_CUTOFF
bool
bExclusionFlag
=
(
x
&
0x1
);
#define METHOD_NAME(a, b) a##Cutoff##b
x
=
(
x
>>
17
)
<<
GRIDBITS
;
#include "kCalculateCDLJForces.h"
float4
apos
;
// Local atom x, y, z, q
#include "kFindInteractingBlocks.h"
float3
af
;
// Local atom fx, fy, fz
#define USE_OUTPUT_BUFFER_PER_WARP
float
dx
;
#undef METHOD_NAME
float
dy
;
#define METHOD_NAME(a, b) a##CutoffByWarp##b
float
dz
;
#include "kCalculateCDLJForces.h"
float
r2
;
float
invR
;
float
sig
;
float
sig2
;
float
sig6
;
float
eps
;
float
dEdR
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
a
.
x
+
psA
[
j
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
j
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
// Include versions of the kernels with periodic boundary conditions.
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
#undef METHOD_NAME
{
#undef USE_OUTPUT_BUFFER_PER_WARP
dx
=
psA
[
tj
].
x
-
apos
.
x
;
#define USE_PERIODIC
dy
=
psA
[
tj
].
y
-
apos
.
y
;
#define METHOD_NAME(a, b) a##Periodic##b
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#include "kCalculateCDLJForces.h"
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#include "kFindInteractingBlocks.h"
invR
=
1.0
f
/
sqrt
(
r2
);
#define USE_OUTPUT_BUFFER_PER_WARP
sig
=
a
.
x
+
psA
[
tj
].
sig
;
#undef METHOD_NAME
sig2
=
invR
*
sig
;
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
sig2
*=
sig2
;
#include "kCalculateCDLJForces.h"
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
__global__
extern
void
kCalculateCDLJCutoffForces_12_kernel
();
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
// printf("kCalculateCDLJCutoffForces\n");
CUDPPResult
result
;
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
case
NO_CUTOFF
:
dy
=
psA
[
j
].
y
-
apos
.
y
;
if
(
gpu
->
bOutputBufferPerWarp
)
dz
=
psA
[
j
].
z
-
apos
.
z
;
kCalculateCDLJN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
invR
=
1.0
f
/
sqrt
(
r2
);
else
sig
=
psA
[
tgx
].
sig2
+
psA
[
j
].
sig
;
kCalculateCDLJN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sig2
=
invR
*
sig
;
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
sig2
*=
sig2
;
LAUNCHERROR
(
"kCalculateCDLJN2Forces"
);
sig6
=
sig2
*
sig2
*
sig2
;
break
;
eps
=
psA
[
tgx
].
eps2
*
psA
[
j
].
eps
;
case
CUTOFF
:
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
dEdR
*=
invR
*
invR
;
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
if
(
!
(
excl
&
0x1
))
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
{
dEdR
=
0.0
f
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
}
exit
(
-
1
);
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
}
else
// 100% utilization
gpu
->
psInteractionCount
->
Download
();
{
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
// Read fixed atom data into registers and GRF
if
(
gpu
->
bOutputBufferPerWarp
)
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
kCalculateCDLJCutoffByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
int
j
=
y
+
tgx
;
else
unsigned
int
i
=
x
+
tgx
;
kCalculateCDLJCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
float4
temp
=
cSim
.
pPosq
[
j
];
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
float2
temp1
=
cSim
.
pAttr
[
j
];
LAUNCHERROR
(
"kCalculateCDLJCutoffForces"
);
apos
=
cSim
.
pPosq
[
i
];
break
;
float2
a
=
cSim
.
pAttr
[
i
];
case
PERIODIC
:
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
if
(
result
!=
CUDPP_SUCCESS
)
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
psA
[
tgx
].
sig2
+
psA
[
tj
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
psA
[
tgx
].
eps2
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
{
dEdR
=
0.0
f
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
}
exit
(
-
1
);
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
}
gpu
->
psInteractionCount
->
Download
();
// Write results
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
float4
of
;
if
(
gpu
->
bOutputBufferPerWarp
)
of
.
x
=
af
.
x
;
kCalculateCDLJPeriodicByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
of
.
y
=
af
.
y
;
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
__global__
extern
void
kCalculateCDLJForces_12_kernel
();
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
// printf("kCalculateCDLJForces\n");
if
(
gpu
->
sm_version
<
SM_12
)
kCalculateCDLJForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
else
else
kCalculateCDLJForces_12_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
kCalculateCDLJPeriodicForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
LAUNCHERROR
(
"kCalculateCDLJForces"
);
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
LAUNCHERROR
(
"kCalculateCDLJPeriodicForces"
);
}
}
}
\ No newline at end of file
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
View file @
1010df33
...
@@ -33,9 +33,6 @@
...
@@ -33,9 +33,6 @@
#include <cuda.h>
#include <cuda.h>
#include <vector_functions.h>
#include <vector_functions.h>
#include <cstdlib>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
using
namespace
std
;
#include "gputypes.h"
#include "gputypes.h"
...
@@ -54,15 +51,8 @@ struct Atom {
...
@@ -54,15 +51,8 @@ struct Atom {
float
fy
;
float
fy
;
float
fz
;
float
fz
;
float
fb
;
float
fb
;
float
q2
;
float
junk
;
};
};
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
)
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
)
...
@@ -79,376 +69,122 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
...
@@ -79,376 +69,122 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
__global__
void
kCalculateCDLJObcGbsaForces1_kernel
()
// Include versions of the kernel for N^2 calculations.
{
// Read queue of work blocks once so the remainder of
#define METHOD_NAME(a, b) a##N2##b
// kernel can run asynchronously
#include "kCalculateCDLJObcGbsaForces1.h"
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
#define USE_OUTPUT_BUFFER_PER_WARP
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
#undef METHOD_NAME
if
(
threadIdx
.
x
<
end
-
pos
)
#define METHOD_NAME(a, b) a##N2ByWarp##b
{
#include "kCalculateCDLJObcGbsaForces1.h"
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
// Include versions of the kernel with cutoffs.
if
(
threadIdx
.
x
<
GRID
)
{
#undef METHOD_NAME
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
#undef USE_OUTPUT_BUFFER_PER_WARP
}
#define USE_CUTOFF
__syncthreads
();
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
extern
__global__
void
kFindBlockBoundsCutoff_kernel
();
extern
__global__
void
kFindBlockBoundsPeriodic_kernel
();
extern
__global__
void
kFindBlocksWithInteractionsCutoff_kernel
();
extern
__global__
void
kFindBlocksWithInteractionsPeriodic_kernel
();
// Now change pos and end to reflect work queue just read
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
)
// into shared memory
{
end
=
end
-
pos
;
// printf("kCalculateCDLJObcGbsaForces1\n");
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
// check if Born radii need to be calculated
{
// Extract cell coordinates from appropriate work unit
kClearBornForces
(
gpu
);
unsigned
int
x
=
sWorkUnit
[
pos
];
CUDPPResult
result
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
size_t
numWithInteractions
;
bool
bExclusionFlag
=
(
x
&
0x1
);
switch
(
gpu
->
sim
.
nonbondedMethod
)
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
float
br
=
cSim
.
pBornRadii
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
{
// Read fixed atom data into registers and GRF
case
NO_CUTOFF
:
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
if
(
gpu
->
bRecalculateBornRadii
)
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
kCalculateObcGbsaBornSum
(
gpu
);
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
kReduceObcGbsaBornSum
(
gpu
);
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
}
else
// 100% utilization
if
(
gpu
->
bOutputBufferPerWarp
)
{
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
// Read fixed atom data into registers and GRF
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
int
j
=
y
+
tgx
;
else
float4
temp
=
cSim
.
pPosq
[
j
];
kCalculateCDLJObcGbsaN2Forces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
float2
temp1
=
cSim
.
pAttr
[
j
];
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
LAUNCHERROR
(
"kCalculateCDLJObcGbsaN2Forces1"
);
float4
af
;
break
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
case
CUTOFF
:
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
if
(
result
!=
CUDPP_SUCCESS
)
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
exit
(
-
1
);
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
}
gpu
->
psInteractionCount
->
Download
();
else
// bExclusion
if
(
gpu
->
bRecalculateBornRadii
)
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
{
dEdR
=
0.0
f
;
kCalculateObcGbsaBornSum
(
gpu
);
}
kReduceObcGbsaBornSum
(
gpu
);
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
}
else
// 100% utilization
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
{
if
(
gpu
->
bOutputBufferPerWarp
)
// Read fixed atom data into registers and GRF
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
float4
af
;
else
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
kCalculateCDLJObcGbsaCutoffForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
LAUNCHERROR
(
"kCalculateCDLJCutoffForces"
);
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
break
;
int
j
=
y
+
tgx
;
case
PERIODIC
:
float4
temp
=
cSim
.
pPosq
[
j
];
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
float2
temp1
=
cSim
.
pAttr
[
j
];
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
if
(
result
!=
CUDPP_SUCCESS
)
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
{
dEdR
=
0.0
f
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
}
exit
(
-
1
);
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
}
gpu
->
psInteractionCount
->
Download
();
// Write results
if
(
gpu
->
bRecalculateBornRadii
)
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
{
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
__global__
extern
void
kCalculateCDLJObcGbsaForces1_12_kernel
();
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
)
{
//printf("In kCalculateCDLJObcGbsaForces1 QQQ\n");
// check if Born radii need to be calculated
if
(
gpu
->
bRecalculateBornRadii
){
kCalculateObcGbsaBornSum
(
gpu
);
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
}
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
sm_version
<
SM_12
)
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJObcGbsaForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
else
kCalculateCDLJObcGbsaForces1_12_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
kCalculateCDLJObcGbsaPeriodicForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
if
(
0
){
LAUNCHERROR
(
"kCalculateCDLJPeriodicForces"
);
static
int
step
=
0
;
break
;
// int numPrint = -1;
}
step
++
;
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
//gpuDumpCoordinates( gpu );
kReduceBornSumAndForces
(
gpu
);
gpuDumpObcLoop1
(
gpu
);
}
LAUNCHERROR
(
"kCalculateCDLJObcGbsaForces1"
);
}
}
platforms/cuda/src/kernels/kCalculateLocalForces.cu
View file @
1010df33
...
@@ -440,6 +440,8 @@ __global__ void kCalculateLocalForces_kernel()
...
@@ -440,6 +440,8 @@ __global__ void kCalculateLocalForces_kernel()
pos
+=
blockDim
.
x
*
gridDim
.
x
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
if
(
cSim
.
nonbondedMethod
==
NO_CUTOFF
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
while
(
pos
<
cSim
.
LJ14_offset
)
{
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
...
@@ -483,6 +485,110 @@ __global__ void kCalculateLocalForces_kernel()
...
@@ -483,6 +485,110 @@ __global__ void kCalculateLocalForces_kernel()
}
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
}
else
if
(
cSim
.
nonbondedMethod
==
CUTOFF
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
(
inverseR
-
2.0
f
*
cSim
.
reactionFieldK
*
r2
);
dEdR
*=
inverseR
*
inverseR
;
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0.0
f
;
}
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
else
if
(
cSim
.
nonbondedMethod
==
PERIODIC
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
d
.
x
-=
floor
(
d
.
x
/
cSim
.
periodicBoxSizeX
+
0.5
f
)
*
cSim
.
periodicBoxSizeX
;
d
.
y
-=
floor
(
d
.
x
/
cSim
.
periodicBoxSizeY
+
0.5
f
)
*
cSim
.
periodicBoxSizeY
;
d
.
z
-=
floor
(
d
.
x
/
cSim
.
periodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
(
inverseR
-
2.0
f
*
cSim
.
reactionFieldK
*
r2
);
dEdR
*=
inverseR
*
inverseR
;
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0.0
f
;
}
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
}
}
...
...
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
View file @
1010df33
...
@@ -53,10 +53,6 @@ struct Atom {
...
@@ -53,10 +53,6 @@ struct Atom {
float
junk
;
float
junk
;
};
};
__shared__
Atom
sA
[
GT2XX_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
GT2XX_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
)
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
)
...
@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
...
@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
__global__
void
kClearObcGbsaBornSum_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
stride
*
cSim
.
nonbondOutputBuffers
)
{
((
float
*
)
cSim
.
pBornSum
)[
pos
]
=
0.0
f
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
__global__
void
kReduceObcGbsaBornSum_kernel
()
__global__
void
kReduceObcGbsaBornSum_kernel
()
{
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
...
@@ -127,175 +167,40 @@ if( 0 ){
...
@@ -127,175 +167,40 @@ if( 0 ){
LAUNCHERROR
(
"kReduceObcGbsaBornSum"
);
LAUNCHERROR
(
"kReduceObcGbsaBornSum"
);
}
}
__global__
void
kCalculateObcGbsaBornSum_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
(
blockIdx
.
x
*
cSim
.
workUnits
)
/
gridDim
.
x
;
int
end
=
((
blockIdx
.
x
+
1
)
*
cSim
.
workUnits
)
/
gridDim
.
x
;
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
-
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float
dx
;
float
dy
;
float
dz
;
float
r2
;
float
r
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
r
=
ar
.
x
;
sA
[
threadIdx
.
x
].
sr
=
ar
.
y
;
apos
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r
=
sqrt
(
r2
);
float
rInverse
=
1.0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
if
((
j
!=
tgx
)
&&
(
ar
.
x
<
rScaledRadiusJ
))
{
float
l_ij
=
1.0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
apos
.
w
+=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
psA
[
j
].
sr
*
psA
[
j
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
ar
.
x
<
(
psA
[
j
].
r
-
r
))
{
apos
.
w
+=
2.0
f
*
((
1.0
f
/
ar
.
x
)
-
l_ij
);
}
}
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sum
=
apos
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r
=
sqrt
(
r2
);
float
rInverse
=
1.0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
if
(
ar
.
x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
psA
[
tj
].
sr
*
psA
[
tj
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
ar
.
x
<
(
psA
[
tj
].
sr
-
r
))
{
term
+=
2.0
f
*
((
1.0
f
/
ar
.
x
)
-
l_ij
);
}
apos
.
w
+=
term
;
}
float
rScaledRadiusI
=
r
+
ar
.
y
;
if
(
psA
[
tj
].
r
<
rScaledRadiusI
)
{
float
l_ij
=
1.0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
ar
.
y
));
float
u_ij
=
1.0
f
/
rScaledRadiusI
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
ar
.
y
*
ar
.
y
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
psA
[
tj
].
r
<
(
ar
.
y
-
r
))
{
term
+=
2.0
f
*
((
1.0
f
/
psA
[
tj
].
r
)
-
l_ij
);
}
psA
[
tj
].
sum
+=
term
;
}
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
sA
[
threadIdx
.
x
].
sum
;
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
void
kCalculateObcGbsaBornSum
(
gpuContext
gpu
)
void
kCalculateObcGbsaBornSum
(
gpuContext
gpu
)
{
{
// printf("kCalculateObcgbsaBornSum\n");
// printf("kCalculateObcgbsaBornSum\n");
kCalculateObcGbsaBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
kClearObcGbsaBornSum_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
LAUNCHERROR
(
"kClearBornSum"
);
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
kCalculateObcGbsaN2BornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
break
;
case
CUTOFF
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaCutoffByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaCutoffBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
case
PERIODIC
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaPeriodicBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
}
LAUNCHERROR
(
"kCalculateBornSum"
);
LAUNCHERROR
(
"kCalculateBornSum"
);
}
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
View file @
1010df33
...
@@ -52,18 +52,9 @@ struct Atom {
...
@@ -52,18 +52,9 @@ struct Atom {
float
fy
;
float
fy
;
float
fz
;
float
fz
;
float
fb
;
float
fb
;
// float sum;
// float oneOverR;
int
pos
;
int
wx
;
int
wy
;
};
};
__shared__
Atom
sA
[
G8X_BORNFORCE2_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
)
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
)
...
@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
...
@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
__global__
void
kCalculateObcGbsaForces2_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
bf2WorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
bf2WorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
bf2WorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
bf2WorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
sA
[
threadIdx
.
x
].
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
sA
[
threadIdx
.
x
].
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
sA
[
threadIdx
.
x
].
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pObcData
[
i
];
float
fb
=
cSim
.
pBornForce
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
sA
[
threadIdx
.
x
].
wx
=
x
;
sA
[
threadIdx
.
x
].
wy
=
y
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
// float sum = 0.0f;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
// float oneOverR = 1.0f / a.x;
sA
[
threadIdx
.
x
].
r
=
a
.
x
;
sA
[
threadIdx
.
x
].
sr
=
a
.
y
;
sA
[
threadIdx
.
x
].
sr2
=
a
.
y
*
a
.
y
;
sA
[
threadIdx
.
x
].
fb
=
fb
;
for
(
unsigned
int
j
=
sNext
[
tgx
];
j
!=
tgx
;
j
=
sNext
[
j
])
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
// Atom I Born forces and sum
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
float
l_ij
=
1.0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
rInverse
=
1.0
f
/
r
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
r2Inverse
=
rInverse
*
rInverse
;
float
t1
=
log
(
u_ij
/
l_ij
);
float
t2
=
(
l_ij2
-
u_ij2
);
float
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
float
term
=
0.125
f
*
(
1.000
f
+
psA
[
j
].
sr2
*
r2Inverse
)
*
t3
+
0.250
f
*
t1
*
r2Inverse
;
float
dE
=
fb
*
term
;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if
(
a
.
x
>=
rScaledRadiusJ
)
{
dE
=
/*term =*/
0.0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
j
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
j
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
j
].
fz
+=
d
;
// sum += term;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
float4
of
;
of
.
x
=
af
.
x
+
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
af
.
y
+
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
af
.
z
+
sA
[
threadIdx
.
x
].
fz
;
of
.
w
=
0.0
f
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
sA
[
threadIdx
.
x
].
fb
=
cSim
.
pBornForce
[
j
];
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float
sr2
=
a
.
y
*
a
.
y
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sr2
=
temp1
.
y
*
temp1
.
y
;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
// Atom I Born Forces and sum
float
r2Inverse
=
1.0
f
/
r2
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
float
rInverse
=
1.0
f
/
r
;
float
l_ij
=
1.0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
t1
=
log
(
u_ij
/
l_ij
);
float
t2
=
(
l_ij2
-
u_ij2
);
float
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
float
term
=
0.125
f
*
(
1.000
f
+
psA
[
tj
].
sr2
*
r2Inverse
)
*
t3
+
0.250
f
*
t1
*
r2Inverse
;
float
dE
=
fb
*
term
;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
if
(
a
.
x
>=
rScaledRadiusJ
)
{
dE
=
/*term =*/
0.0
f
;
}
// Include versions of the kernels for N^2 calculations.
float
d
=
dx
*
dE
;
#define METHOD_NAME(a, b) a##N2##b
af
.
x
-=
d
;
#include "kCalculateObcGbsaForces2.h"
psA
[
tj
].
fx
+=
d
;
#define USE_OUTPUT_BUFFER_PER_WARP
d
=
dy
*
dE
;
#undef METHOD_NAME
af
.
y
-=
d
;
#define METHOD_NAME(a, b) a##N2ByWarp##b
psA
[
tj
].
fy
+=
d
;
#include "kCalculateObcGbsaForces2.h"
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
tj
].
fz
+=
d
;
// sum += term;
// Atom J Born Forces and sum
// Include versions of the kernels with cutoffs.
float
rScaledRadiusI
=
r
+
a
.
y
;
l_ij
=
1.0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
a
.
y
));
u_ij
=
1.0
f
/
rScaledRadiusI
;
l_ij2
=
l_ij
*
l_ij
;
u_ij2
=
u_ij
*
u_ij
;
t1
=
log
(
u_ij
/
l_ij
);
t2
=
(
l_ij2
-
u_ij2
);
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
#undef METHOD_NAME
term
=
0.125
f
*
#undef USE_OUTPUT_BUFFER_PER_WARP
(
1.000
f
+
sr2
*
r2Inverse
)
*
t3
+
#define USE_CUTOFF
0.250
f
*
t1
*
r2Inverse
;
#define METHOD_NAME(a, b) a##Cutoff##b
dE
=
psA
[
tj
].
fb
*
term
;
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Born sum term
// Include versions of the kernels with periodic boundary conditions.
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
if
(
psA
[
tj
].
r
>=
rScaledRadiusI
)
{
dE
=
/*term =*/
0.0
f
;
}
dx
*=
dE
;
dy
*=
dE
;
dz
*=
dE
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
// psA[tj].sum += term;
tj
=
sNext
[
tj
];
#undef METHOD_NAME
}
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
// Write results
#define METHOD_NAME(a, b) a##Periodic##b
int
offset
=
sA
[
threadIdx
.
x
].
wx
+
tgx
+
(
sA
[
threadIdx
.
x
].
wy
>>
GRIDBITS
)
*
cSim
.
stride
;
#include "kCalculateObcGbsaForces2.h"
float4
of
;
#define USE_OUTPUT_BUFFER_PER_WARP
of
.
x
=
af
.
x
;
#undef METHOD_NAME
of
.
y
=
af
.
y
;
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
of
.
z
=
af
.
z
;
#include "kCalculateObcGbsaForces2.h"
of
.
w
=
0.0
f
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sum;
offset
=
sA
[
threadIdx
.
x
].
wy
+
tgx
+
(
sA
[
threadIdx
.
x
].
wx
>>
GRIDBITS
)
*
cSim
.
stride
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
sA
[
threadIdx
.
x
].
pos
-=
cSim
.
bornForce2_workBlock
;
}
}
__global__
extern
void
kCalculateObcGbsaForces2_12_kernel
();
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
)
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
)
{
{
//printf("kCalculateObcGbsaForces2\n");
//printf("kCalculateObcGbsaForces2\n");
if
(
gpu
->
sm_version
<
SM_12
)
size_t
numWithInteractions
;
kCalculateObcGbsaForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
>>>
();
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
else
kCalculateObcGbsaForces2_12_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
>>>
();
kCalculateObcGbsaN2Forces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
if
(
0
){
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
static
int
step
=
0
;
break
;
//int numPrint = -1;
case
CUTOFF
:
step
++
;
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
if
(
gpu
->
bOutputBufferPerWarp
)
//gpuDumpCoordinates( gpu );
kCalculateObcGbsaCutoffByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
kReduceBornSumAndForces
(
gpu
);
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
gpuDumpObcLoop1
(
gpu
);
else
}
kCalculateObcGbsaCutoffForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
case
PERIODIC
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaPeriodicForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
}
LAUNCHERROR
(
"kCalculateObcGbsaForces2"
);
LAUNCHERROR
(
"kCalculateObcGbsaForces2"
);
}
}
platforms/cuda/src/kernels/kForces.cu
View file @
1010df33
...
@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
...
@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
__global__
void
kClearForces_kernel
()
__global__
void
kClearForces_kernel
()
{
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
stride
4
*
cSim
.
outputBuffers
)
while
(
pos
<
cSim
.
stride
*
cSim
.
outputBuffers
)
{
{
((
float
*
)
cSim
.
pForce4
)
[
pos
]
=
0.0
f
;
cSim
.
pForce4
[
pos
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
)
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
}
}
...
...
platforms/cuda/src/kernels/kVerletUpdate.cu
View file @
1010df33
...
@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
...
@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
__global__
void
kVerletUpdatePart1_kernel
()
__global__
void
kVerletUpdatePart1_kernel
()
{
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
while
(
pos
<
cSim
.
atoms
)
{
{
...
@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
...
@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
__global__
void
kVerletUpdatePart2_kernel
()
__global__
void
kVerletUpdatePart2_kernel
()
{
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
while
(
pos
<
cSim
.
atoms
)
{
{
...
@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
...
@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
extern
__shared__
float3
sCM
[];
extern
__shared__
float3
sCM
[];
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
float3
CM
=
{
0.0
f
,
0.0
f
,
0.0
f
};
float3
CM
=
{
0.0
f
,
0.0
f
,
0.0
f
};
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
while
(
pos
<
cSim
.
atoms
)
{
{
...
...
platforms/reference/src/ReferenceKernels.cpp
View file @
1010df33
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2008 Stanford University and the Authors.
*
* Portions copyright (c) 2008
-2009
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
...
@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
obcParameters
->
setScaledRadiusFactors
(
scaleFactors
);
obcParameters
->
setScaledRadiusFactors
(
scaleFactors
);
obcParameters
->
setSolventDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSolventDielectric
())
);
obcParameters
->
setSolventDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSolventDielectric
())
);
obcParameters
->
setSoluteDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSoluteDielectric
())
);
obcParameters
->
setSoluteDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSoluteDielectric
())
);
// If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
for
(
int
i
=
0
;
i
<
system
.
getNumForces
();
i
++
)
{
const
NonbondedForce
*
nonbonded
=
dynamic_cast
<
const
NonbondedForce
*>
(
&
system
.
getForce
(
i
));
if
(
nonbonded
!=
NULL
)
{
if
(
nonbonded
->
getNonbondedMethod
()
!=
NonbondedForce
::
NoCutoff
)
obcParameters
->
setUseCutoff
(
nonbonded
->
getCutoffDistance
());
if
(
nonbonded
->
getNonbondedMethod
()
==
NonbondedForce
::
CutoffPeriodic
)
{
Vec3
boxVectors
[
3
];
nonbonded
->
getPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
RealOpenMM
periodicBoxSize
[
3
];
periodicBoxSize
[
0
]
=
(
RealOpenMM
)
boxVectors
[
0
][
0
];
periodicBoxSize
[
1
]
=
(
RealOpenMM
)
boxVectors
[
1
][
1
];
periodicBoxSize
[
2
]
=
(
RealOpenMM
)
boxVectors
[
2
][
2
];
obcParameters
->
setPeriodic
(
periodicBoxSize
);
}
break
;
}
}
obc
=
new
CpuObc
(
obcParameters
);
obc
=
new
CpuObc
(
obcParameters
);
obc
->
setIncludeAceApproximation
(
true
);
obc
->
setIncludeAceApproximation
(
true
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment