Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
1010df33
Commit
1010df33
authored
Feb 19, 2009
by
Peter Eastman
Browse files
Checking in Cuda implementation of explicit solvent
parent
df4b64cb
Changes
20
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1260 additions
and
1428 deletions
+1260
-1428
platforms/cuda/CMakeLists.txt
platforms/cuda/CMakeLists.txt
+97
-87
platforms/cuda/cudpp/mac/libcudpp.a
platforms/cuda/cudpp/mac/libcudpp.a
+0
-0
platforms/cuda/cudpp/mac/libcutil.a
platforms/cuda/cudpp/mac/libcutil.a
+0
-0
platforms/cuda/cudpp/win/cudpp.lib
platforms/cuda/cudpp/win/cudpp.lib
+0
-0
platforms/cuda/cudpp/win/cudpp32.dll
platforms/cuda/cudpp/win/cudpp32.dll
+0
-0
platforms/cuda/cudpp/win/cutil.lib
platforms/cuda/cudpp/win/cutil.lib
+0
-0
platforms/cuda/src/CudaStreamFactory.cpp
platforms/cuda/src/CudaStreamFactory.cpp
+9
-11
platforms/cuda/src/CudaStreamImpl.h
platforms/cuda/src/CudaStreamImpl.h
+14
-12
platforms/cuda/src/kernels/cudaKernels.h
platforms/cuda/src/kernels/cudaKernels.h
+4
-8
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+27
-2
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+567
-120
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+34
-9
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+93
-308
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+110
-374
platforms/cuda/src/kernels/kCalculateLocalForces.cu
platforms/cuda/src/kernels/kCalculateLocalForces.cu
+147
-41
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+76
-171
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+59
-279
platforms/cuda/src/kernels/kForces.cu
platforms/cuda/src/kernels/kForces.cu
+2
-2
platforms/cuda/src/kernels/kVerletUpdate.cu
platforms/cuda/src/kernels/kVerletUpdate.cu
+0
-3
platforms/reference/src/ReferenceKernels.cpp
platforms/reference/src/ReferenceKernels.cpp
+21
-1
No files found.
platforms/cuda/CMakeLists.txt
View file @
1010df33
...
@@ -84,4 +84,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
...
@@ -84,4 +84,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET
(
FINDCUDA_DIR
${
CMAKE_CURRENT_SOURCE_DIR
}
/cuda-cmake
)
SET
(
FINDCUDA_DIR
${
CMAKE_CURRENT_SOURCE_DIR
}
/cuda-cmake
)
IF
(
APPLE
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/mac
)
ELSE
(
APPLE
)
IF
(
WIN32
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/win
)
INSTALL_FILES
(
/lib FILES
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/win/cudpp32.dll
)
ELSE
(
WIN32
)
LINK_DIRECTORIES
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cudpp/linux
)
ENDIF
(
WIN32
)
ENDIF
(
APPLE
)
SUBDIRS
(
sharedTarget staticTarget
)
SUBDIRS
(
sharedTarget staticTarget
)
platforms/cuda/cudpp/mac/libcudpp.a
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/mac/libcutil.a
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cudpp.lib
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cudpp32.dll
0 → 100644
View file @
1010df33
File added
platforms/cuda/cudpp/win/cutil.lib
0 → 100644
View file @
1010df33
File added
platforms/cuda/src/CudaStreamFactory.cpp
View file @
1010df33
...
@@ -39,42 +39,40 @@
...
@@ -39,42 +39,40 @@
using
namespace
OpenMM
;
using
namespace
OpenMM
;
StreamImpl
*
CudaStreamFactory
::
createStreamImpl
(
std
::
string
name
,
int
size
,
Stream
::
DataType
type
,
const
Platform
&
platform
,
OpenMMContextImpl
&
context
)
const
{
StreamImpl
*
CudaStreamFactory
::
createStreamImpl
(
std
::
string
name
,
int
size
,
Stream
::
DataType
type
,
const
Platform
&
platform
,
OpenMMContextImpl
&
context
)
const
{
if
(
name
==
"particlePositions"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
if
(
name
==
"particlePositions"
)
{
float
padding
[]
=
{
100000.0
f
,
100000.0
f
,
100000.0
f
,
0.2
f
};
float
padding
[]
=
{
100000.0
f
,
100000.0
f
,
100000.0
f
,
0.2
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psPosq4
,
4
,
padding
,
data
.
gpu
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psPosq4
,
4
,
padding
,
data
.
gpu
);
}
}
if
(
name
==
"particleVelocities"
)
{
if
(
name
==
"particleVelocities"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psVelm4
,
4
,
padding
,
data
.
gpu
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psVelm4
,
4
,
padding
,
data
.
gpu
);
}
}
if
(
name
==
"particleForces"
)
{
if
(
name
==
"particleForces"
)
{
CudaPlatform
::
PlatformData
&
data
=
*
static_cast
<
CudaPlatform
::
PlatformData
*>
(
context
.
getPlatformData
());
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
float
padding
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psForce4
,
4
,
padding
,
data
.
gpu
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
data
.
gpu
->
psForce4
,
4
,
padding
,
data
.
gpu
);
}
}
switch
(
type
)
{
switch
(
type
)
{
case
Stream
::
Float
:
case
Stream
::
Float
:
case
Stream
::
Double
:
case
Stream
::
Double
:
return
new
CudaStreamImpl
<
float1
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float1
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float2
:
case
Stream
::
Float2
:
case
Stream
::
Double2
:
case
Stream
::
Double2
:
return
new
CudaStreamImpl
<
float2
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float2
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float3
:
case
Stream
::
Float3
:
case
Stream
::
Double3
:
case
Stream
::
Double3
:
return
new
CudaStreamImpl
<
float3
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float3
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Float4
:
case
Stream
::
Float4
:
case
Stream
::
Double4
:
case
Stream
::
Double4
:
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
float4
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer
:
case
Stream
::
Integer
:
return
new
CudaStreamImpl
<
int1
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int1
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer2
:
case
Stream
::
Integer2
:
return
new
CudaStreamImpl
<
int2
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int2
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer3
:
case
Stream
::
Integer3
:
return
new
CudaStreamImpl
<
int3
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int3
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
case
Stream
::
Integer4
:
case
Stream
::
Integer4
:
return
new
CudaStreamImpl
<
int4
>
(
name
,
size
,
type
,
platform
,
1
,
NULL
);
return
new
CudaStreamImpl
<
int4
>
(
name
,
size
,
type
,
platform
,
1
,
data
.
gpu
);
}
}
throw
OpenMMException
(
"Tried to create a Stream with an illegal DataType."
);
throw
OpenMMException
(
"Tried to create a Stream with an illegal DataType."
);
}
}
platforms/cuda/src/CudaStreamImpl.h
View file @
1010df33
...
@@ -132,23 +132,24 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
...
@@ -132,23 +132,24 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
template
<
class
T
>
template
<
class
T
>
void
CudaStreamImpl
<
T
>::
loadFromArray
(
const
void
*
array
)
{
void
CudaStreamImpl
<
T
>::
loadFromArray
(
const
void
*
array
)
{
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
int
*
order
=
gpu
->
psAtomIndex
->
_pSysData
;
if
(
baseType
==
Stream
::
Float
)
{
if
(
baseType
==
Stream
::
Float
)
{
float
*
arrayData
=
(
float
*
)
array
;
float
*
arrayData
=
(
float
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
arrayData
[
order
[
i
]
*
width
+
j
];
}
}
else
if
(
baseType
==
Stream
::
Double
)
{
else
if
(
baseType
==
Stream
::
Double
)
{
double
*
arrayData
=
(
double
*
)
array
;
double
*
arrayData
=
(
double
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
order
[
i
]
*
width
+
j
];
}
}
else
{
else
{
int
*
arrayData
=
(
int
*
)
array
;
int
*
arrayData
=
(
int
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
i
*
width
+
j
];
data
[
i
*
rowOffset
+
j
]
=
(
float
)
arrayData
[
order
[
i
]
*
width
+
j
];
}
}
for
(
int
i
=
getSize
();
i
<
(
int
)
stream
->
_length
;
++
i
)
for
(
int
i
=
getSize
();
i
<
(
int
)
stream
->
_length
;
++
i
)
for
(
int
j
=
0
;
j
<
rowOffset
;
++
j
)
for
(
int
j
=
0
;
j
<
rowOffset
;
++
j
)
...
@@ -167,23 +168,24 @@ template <class T>
...
@@ -167,23 +168,24 @@ template <class T>
void
CudaStreamImpl
<
T
>::
saveToArray
(
void
*
array
)
{
void
CudaStreamImpl
<
T
>::
saveToArray
(
void
*
array
)
{
stream
->
Download
();
stream
->
Download
();
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
float
*
data
=
reinterpret_cast
<
float
*>
(
stream
->
_pSysData
);
int
*
order
=
gpu
->
psAtomIndex
->
_pSysData
;
if
(
baseType
==
Stream
::
Float
)
{
if
(
baseType
==
Stream
::
Float
)
{
float
*
arrayData
=
(
float
*
)
array
;
float
*
arrayData
=
(
float
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
}
}
else
if
(
baseType
==
Stream
::
Double
)
{
else
if
(
baseType
==
Stream
::
Double
)
{
double
*
arrayData
=
(
double
*
)
array
;
double
*
arrayData
=
(
double
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
data
[
i
*
rowOffset
+
j
];
}
}
else
{
else
{
int
*
arrayData
=
(
int
*
)
array
;
int
*
arrayData
=
(
int
*
)
array
;
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
getSize
();
++
i
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
for
(
int
j
=
0
;
j
<
width
;
++
j
)
arrayData
[
i
*
width
+
j
]
=
(
int
)
data
[
i
*
rowOffset
+
j
];
arrayData
[
order
[
i
]
*
width
+
j
]
=
(
int
)
data
[
i
*
rowOffset
+
j
];
}
}
}
}
...
...
platforms/cuda/src/kernels/cudaKernels.h
View file @
1010df33
...
@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
...
@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
extern
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateCDLJObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kCalculateCDLJObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces
(
gpuContext
gpu
);
extern
void
kCalculateCDLJForces_12
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces1_12
(
gpuContext
gpu
);
extern
void
kReduceObcGbsaBornForces
(
gpuContext
gpu
);
extern
void
kReduceObcGbsaBornForces
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
);
extern
void
kCalculateObcGbsaForces2_12
(
gpuContext
gpu
);
extern
void
kCalculateLocalForces
(
gpuContext
gpu
);
extern
void
kCalculateLocalForces
(
gpuContext
gpu
);
extern
void
kCalculateAndersenThermostat
(
gpuContext
gpu
);
extern
void
kCalculateAndersenThermostat
(
gpuContext
gpu
);
extern
void
kReduceBornSumAndForces
(
gpuContext
gpu
);
extern
void
kReduceBornSumAndForces
(
gpuContext
gpu
);
extern
void
kUpdatePart1
(
gpuContext
gpu
);
extern
void
kUpdatePart1
(
gpuContext
gpu
);
extern
void
kApplyFirstShake
(
gpuContext
gpu
);
extern
void
kApplyFirstShake
(
gpuContext
gpu
);
extern
void
kApplyFirstSettle
(
gpuContext
gpu
);
extern
void
kUpdatePart2
(
gpuContext
gpu
);
extern
void
kUpdatePart2
(
gpuContext
gpu
);
extern
void
kApplySecondShake
(
gpuContext
gpu
);
extern
void
kApplySecondShake
(
gpuContext
gpu
);
extern
void
kApplySecondSettle
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart1
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart1
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart2
(
gpuContext
gpu
);
extern
void
kVerletUpdatePart2
(
gpuContext
gpu
);
extern
void
kBrownianUpdatePart1
(
gpuContext
gpu
);
extern
void
kBrownianUpdatePart1
(
gpuContext
gpu
);
...
@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
...
@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
// Initializers
// Initializers
extern
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateCDLJForces_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateCDLJForces_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
GetCalculateLocalForcesSim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
);
...
@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
...
@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern
void
GetCalculateObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces1_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
SetCalculateObcGbsaForces2_12Sim
(
gpuContext
gpu
);
extern
void
GetCalculateObcGbsaForces2_12Sim
(
gpuContext
gpu
);
extern
void
SetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
SetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
GetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
GetCalculateAndersenThermostatSim
(
gpuContext
gpu
);
extern
void
SetForcesSim
(
gpuContext
gpu
);
extern
void
SetForcesSim
(
gpuContext
gpu
);
extern
void
GetForcesSim
(
gpuContext
gpu
);
extern
void
GetForcesSim
(
gpuContext
gpu
);
extern
void
SetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
SetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
GetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
GetUpdateShakeHSim
(
gpuContext
gpu
);
extern
void
SetSettleSim
(
gpuContext
gpu
);
extern
void
GetSettleSim
(
gpuContext
gpu
);
extern
void
SetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
SetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
GetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
GetVerletUpdateSim
(
gpuContext
gpu
);
extern
void
SetBrownianUpdateSim
(
gpuContext
gpu
);
extern
void
SetBrownianUpdateSim
(
gpuContext
gpu
);
...
...
platforms/cuda/src/kernels/cudatypes.h
View file @
1010df33
...
@@ -36,11 +36,12 @@
...
@@ -36,11 +36,12 @@
#include <limits>
#include <limits>
#include <iostream>
#include <iostream>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <builtin_types.h>
#include <vector_functions.h>
#include <vector_functions.h>
using
namespace
std
;
#define RTERROR(status, s) \
#define RTERROR(status, s) \
if
(
status
!=
cudaSuccess
)
{
\
if
(
status
!=
cudaSuccess
)
{
\
...
@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
...
@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static
const
int
G8X_NONBOND_WORKUNITS_PER_SM
=
220
;
static
const
int
G8X_NONBOND_WORKUNITS_PER_SM
=
220
;
static
const
int
GT2XX_NONBOND_WORKUNITS_PER_SM
=
256
;
static
const
int
GT2XX_NONBOND_WORKUNITS_PER_SM
=
256
;
enum
CudaNonbondedMethod
{
NO_CUTOFF
,
CUTOFF
,
PERIODIC
};
struct
cudaGmxSimulation
{
struct
cudaGmxSimulation
{
// Constants
// Constants
...
@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
...
@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
unsigned
int
blocks
;
// Number of blocks to launch across linear kernels
unsigned
int
blocks
;
// Number of blocks to launch across linear kernels
unsigned
int
nonbond_blocks
;
// Number of blocks to launch across CDLJ and Born Force Part1
unsigned
int
nonbond_blocks
;
// Number of blocks to launch across CDLJ and Born Force Part1
unsigned
int
bornForce2_blocks
;
// Number of blocks to launch across Born Force 2
unsigned
int
bornForce2_blocks
;
// Number of blocks to launch across Born Force 2
unsigned
int
interaction_blocks
;
// Number of blocks to launch when identifying interacting tiles
unsigned
int
threads_per_block
;
// Threads per block to launch
unsigned
int
threads_per_block
;
// Threads per block to launch
unsigned
int
nonbond_threads_per_block
;
// Threads per block in nonbond kernel calls
unsigned
int
nonbond_threads_per_block
;
// Threads per block in nonbond kernel calls
unsigned
int
bornForce2_threads_per_block
;
// Threads per block in nonbond kernel calls
unsigned
int
bornForce2_threads_per_block
;
// Threads per block in nonbond kernel calls
...
@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
...
@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
unsigned
int
bsf_reduce_threads_per_block
;
// Threads per block in Born Sum And Forces reduction calls
unsigned
int
bsf_reduce_threads_per_block
;
// Threads per block in Born Sum And Forces reduction calls
unsigned
int
max_shake_threads_per_block
;
// Maximum threads per block in shake kernel calls
unsigned
int
max_shake_threads_per_block
;
// Maximum threads per block in shake kernel calls
unsigned
int
shake_threads_per_block
;
// Threads per block in shake kernel calls
unsigned
int
shake_threads_per_block
;
// Threads per block in shake kernel calls
unsigned
int
settle_threads_per_block
;
// Threads per block in SETTLE kernel calls
unsigned
int
nonshake_threads_per_block
;
// Threads per block in nonshaking kernel call
unsigned
int
nonshake_threads_per_block
;
// Threads per block in nonshaking kernel call
unsigned
int
max_localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
max_localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
localForces_threads_per_block
;
// Threads per block in local forces kernel calls
unsigned
int
random_threads_per_block
;
// Threads per block in RNG kernel calls
unsigned
int
random_threads_per_block
;
// Threads per block in RNG kernel calls
unsigned
int
interaction_threads_per_block
;
// Threads per block when identifying interacting tiles
unsigned
int
workUnits
;
// Number of work units
unsigned
int
workUnits
;
// Number of work units
unsigned
int
*
pWorkUnit
;
// Pointer to work units
unsigned
int
*
pWorkUnit
;
// Pointer to work units
unsigned
int
*
pInteractingWorkUnit
;
// Pointer to work units that have interactions
unsigned
int
*
pInteractionFlag
;
// Flags for which work units have interactions
size_t
*
pInteractionCount
;
// A count of the number of work units which have interactions
unsigned
int
nonbond_workBlock
;
// Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned
int
nonbond_workBlock
;
// Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned
int
bornForce2_workBlock
;
// Number of work units running second half of Born Forces calculation
unsigned
int
bornForce2_workBlock
;
// Number of work units running second half of Born Forces calculation
unsigned
int
workUnitsPerSM
;
// Number of workblocks per SM
unsigned
int
workUnitsPerSM
;
// Number of workblocks per SM
...
@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
...
@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
unsigned
int
outputBuffers
;
// Number of output buffers
unsigned
int
outputBuffers
;
// Number of output buffers
float
bigFloat
;
// Floating point value used as a flag for Shaken atoms
float
bigFloat
;
// Floating point value used as a flag for Shaken atoms
float
epsfac
;
// Epsilon factor for CDLJ calculations
float
epsfac
;
// Epsilon factor for CDLJ calculations
CudaNonbondedMethod
nonbondedMethod
;
// How to handle nonbonded interactions
float
nonbondedCutoffSqr
;
// Cutoff distance for CDLJ calculations
float
periodicBoxSizeX
;
// The X dimension of the periodic box
float
periodicBoxSizeY
;
// The Y dimension of the periodic box
float
periodicBoxSizeZ
;
// The Z dimension of the periodic box
float
reactionFieldK
;
// Constant for reaction field correction
float
probeRadius
;
// SASA probe radius
float
probeRadius
;
// SASA probe radius
float
surfaceAreaFactor
;
// ACE approximation surface area factor
float
surfaceAreaFactor
;
// ACE approximation surface area factor
float
electricConstant
;
// ACE approximation electric constant
float
electricConstant
;
// ACE approximation electric constant
...
@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
...
@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
float4
*
pLJ14Parameter
;
// Lennard Jones 1-4 parameters
float4
*
pLJ14Parameter
;
// Lennard Jones 1-4 parameters
float
inverseTotalMass
;
// Used in linear momentum removal
float
inverseTotalMass
;
// Used in linear momentum removal
unsigned
int
ShakeConstraints
;
// Total number of Shake constraints
unsigned
int
ShakeConstraints
;
// Total number of Shake constraints
unsigned
int
settleConstraints
;
// Total number of Settle constraints
unsigned
int
NonShakeConstraints
;
// Total number of NonShake atoms
unsigned
int
NonShakeConstraints
;
// Total number of NonShake atoms
unsigned
int
maxShakeIterations
;
// Maximum shake iterations
unsigned
int
maxShakeIterations
;
// Maximum shake iterations
unsigned
int
degreesOfFreedom
;
// Number of degrees of freedom in system
unsigned
int
degreesOfFreedom
;
// Number of degrees of freedom in system
...
@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
...
@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
int
*
pNonShakeID
;
// Not Shaking atoms
int
*
pNonShakeID
;
// Not Shaking atoms
int4
*
pShakeID
;
// Shake atoms and phase
int4
*
pShakeID
;
// Shake atoms and phase
float4
*
pShakeParameter
;
// Shake parameters
float4
*
pShakeParameter
;
// Shake parameters
int4
*
pSettleID
;
// Settle atoms
float2
*
pSettleParameter
;
// Settle parameters
unsigned
int
*
pExclusion
;
// Nonbond exclusion data
unsigned
int
*
pExclusion
;
// Nonbond exclusion data
unsigned
int
bond_offset
;
// Offset to end of bonds
unsigned
int
bond_offset
;
// Offset to end of bonds
unsigned
int
bond_angle_offset
;
// Offset to end of bond angles
unsigned
int
bond_angle_offset
;
// Offset to end of bond angles
unsigned
int
dihedral_offset
;
// Offset to end of dihedrals
unsigned
int
dihedral_offset
;
// Offset to end of dihedrals
unsigned
int
rb_dihedral_offset
;
// Offset to end of Ryckaert Bellemans dihedrals
unsigned
int
rb_dihedral_offset
;
// Offset to end of Ryckaert Bellemans dihedrals
unsigned
int
LJ14_offset
;
// Offset to end of Lennard Jones 1-4 parameters
unsigned
int
LJ14_offset
;
// Offset to end of Lennard Jones 1-4 parameters
int
*
pAtomIndex
;
// The original index of each atom
float4
*
pGridBoundingBox
;
// The size of each grid cell
float4
*
pGridCenter
;
// The center of each grid cell
// Mutable stuff
// Mutable stuff
float4
*
pPosq
;
// Pointer to atom positions and charges
float4
*
pPosq
;
// Pointer to atom positions and charges
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
1010df33
This diff is collapsed.
Click to expand it.
platforms/cuda/src/kernels/gputypes.h
View file @
1010df33
...
@@ -33,14 +33,20 @@
...
@@ -33,14 +33,20 @@
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include "cudatypes.h"
#include "cudpp.h"
#include <vector>
#include <vector>
struct
gpuAtomType
{
struct
gpuAtomType
{
string
name
;
std
::
string
name
;
char
symbol
;
char
symbol
;
float
r
;
float
r
;
};
};
struct
gpuMoleculeGroup
{
std
::
vector
<
int
>
atoms
;
std
::
vector
<
int
>
instances
;
};
enum
SM_VERSION
enum
SM_VERSION
{
{
SM_10
,
SM_10
,
...
@@ -61,8 +67,9 @@ struct _gpuContext {
...
@@ -61,8 +67,9 @@ struct _gpuContext {
int
gAtomTypes
;
int
gAtomTypes
;
cudaGmxSimulation
sim
;
cudaGmxSimulation
sim
;
unsigned
int
*
pOutputBufferCounter
;
unsigned
int
*
pOutputBufferCounter
;
unsigned
int
*
pE
xclusion
;
std
::
vector
<
std
::
vector
<
int
>
>
e
xclusion
s
;
unsigned
char
*
pAtomSymbol
;
unsigned
char
*
pAtomSymbol
;
std
::
vector
<
gpuMoleculeGroup
>
moleculeGroups
;
float
iterations
;
float
iterations
;
float
epsfac
;
float
epsfac
;
float
solventDielectric
;
float
solventDielectric
;
...
@@ -71,8 +78,11 @@ struct _gpuContext {
...
@@ -71,8 +78,11 @@ struct _gpuContext {
bool
bCalculateCM
;
bool
bCalculateCM
;
bool
bRemoveCM
;
bool
bRemoveCM
;
bool
bRecalculateBornRadii
;
bool
bRecalculateBornRadii
;
bool
bOutputBufferPerWarp
;
bool
bIncludeGBSA
;
unsigned
long
seed
;
unsigned
long
seed
;
SM_VERSION
sm_version
;
SM_VERSION
sm_version
;
CUDPPHandle
cudpp
;
CUDAStream
<
float4
>*
psPosq4
;
CUDAStream
<
float4
>*
psPosq4
;
CUDAStream
<
float4
>*
psPosqP4
;
CUDAStream
<
float4
>*
psPosqP4
;
CUDAStream
<
float4
>*
psOldPosq4
;
CUDAStream
<
float4
>*
psOldPosq4
;
...
@@ -103,15 +113,21 @@ struct _gpuContext {
...
@@ -103,15 +113,21 @@ struct _gpuContext {
CUDAStream
<
int
>*
psNonShakeID
;
CUDAStream
<
int
>*
psNonShakeID
;
CUDAStream
<
int4
>*
psShakeID
;
CUDAStream
<
int4
>*
psShakeID
;
CUDAStream
<
float4
>*
psShakeParameter
;
CUDAStream
<
float4
>*
psShakeParameter
;
CUDAStream
<
int4
>*
psSettleID
;
CUDAStream
<
float2
>*
psSettleParameter
;
CUDAStream
<
unsigned
int
>*
psExclusion
;
CUDAStream
<
unsigned
int
>*
psExclusion
;
CUDAStream
<
unsigned
int
>*
psWorkUnit
;
CUDAStream
<
unsigned
int
>*
psWorkUnit
;
CUDAStream
<
unsigned
int
>*
psInteractingWorkUnit
;
CUDAStream
<
unsigned
int
>*
psInteractionFlag
;
CUDAStream
<
size_t
>*
psInteractionCount
;
CUDAStream
<
float4
>*
psRandom4
;
// Pointer to sets of 4 random numbers for MD integration
CUDAStream
<
float4
>*
psRandom4
;
// Pointer to sets of 4 random numbers for MD integration
CUDAStream
<
float2
>*
psRandom2
;
// Pointer to sets of 2 random numbers for MD integration
CUDAStream
<
float2
>*
psRandom2
;
// Pointer to sets of 2 random numbers for MD integration
CUDAStream
<
uint4
>*
psRandomSeed
;
// Pointer to each random seed
CUDAStream
<
uint4
>*
psRandomSeed
;
// Pointer to each random seed
CUDAStream
<
int
>*
psRandomPosition
;
// Pointer to random number positions
CUDAStream
<
int
>*
psRandomPosition
;
// Pointer to random number positions
CUDAStream
<
float4
>*
psLinearMomentum
;
// Pointer to total linear momentum per CTA
CUDAStream
<
float4
>*
psLinearMomentum
;
// Pointer to total linear momentum per CTA
CUDAStream
<
int
>*
psAtomIndex
;
// The original index of each atom
CUDAStream
<
float4
>*
psGridBoundingBox
;
// The size of each grid cell
CUDAStream
<
float4
>*
psGridCenter
;
// The center and radius for each grid cell
};
};
typedef
struct
_gpuContext
*
gpuContext
;
typedef
struct
_gpuContext
*
gpuContext
;
...
@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
...
@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q1
,
const
std
::
vector
<
float
>&
q2
);
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q1
,
const
std
::
vector
<
float
>&
q2
);
extern
"C"
extern
"C"
float
gpuGetAtomicRadius
(
gpuContext
gpu
,
string
s
);
float
gpuGetAtomicRadius
(
gpuContext
gpu
,
std
::
string
s
);
extern
"C"
extern
"C"
unsigned
char
gpuGetAtomicSymbol
(
gpuContext
gpu
,
string
s
);
unsigned
char
gpuGetAtomicSymbol
(
gpuContext
gpu
,
std
::
string
s
);
extern
"C"
extern
"C"
int
gpuReadAtomicParameters
(
gpuContext
gpu
,
char
*
fname
);
int
gpuReadAtomicParameters
(
gpuContext
gpu
,
char
*
fname
);
...
@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
...
@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern
"C"
extern
"C"
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q
,
void
gpuSetCoulombParameters
(
gpuContext
gpu
,
float
epsfac
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
c6
,
const
std
::
vector
<
float
>&
c12
,
const
std
::
vector
<
float
>&
q
,
const
std
::
vector
<
char
>&
symbol
,
const
std
::
vector
<
vector
<
int
>
>&
exclusions
);
const
std
::
vector
<
char
>&
symbol
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusions
,
CudaNonbondedMethod
method
);
extern
"C"
void
gpuSetNonbondedCutoff
(
gpuContext
gpu
,
float
cutoffDistance
,
float
solventDielectric
);
extern
"C"
void
gpuSetPeriodicBoxSize
(
gpuContext
gpu
,
float
xsize
,
float
ysize
,
float
zsize
);
extern
"C"
extern
"C"
void
gpuSetObcParameters
(
gpuContext
gpu
,
float
innerDielectric
,
float
solventDielectric
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
radius
,
const
std
::
vector
<
float
>&
scale
);
void
gpuSetObcParameters
(
gpuContext
gpu
,
float
innerDielectric
,
float
solventDielectric
,
const
std
::
vector
<
int
>&
atom
,
const
std
::
vector
<
float
>&
radius
,
const
std
::
vector
<
float
>&
scale
);
...
@@ -227,7 +249,7 @@ extern "C"
...
@@ -227,7 +249,7 @@ extern "C"
int
gpuBuildThreadBlockWorkList
(
gpuContext
gpu
);
int
gpuBuildThreadBlockWorkList
(
gpuContext
gpu
);
extern
"C"
extern
"C"
int
gpuBuildExclusionList
(
gpuContext
gpu
);
void
gpuBuildExclusionList
(
gpuContext
gpu
);
extern
"C"
extern
"C"
int
gpuSetConstants
(
gpuContext
gpu
);
int
gpuSetConstants
(
gpuContext
gpu
);
...
@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
...
@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
extern
"C"
extern
"C"
void
gpuDumpObcLoop1
(
gpuContext
gpu
);
void
gpuDumpObcLoop1
(
gpuContext
gpu
);
extern
"C"
void
gpuReorderAtoms
(
gpuContext
gpu
);
#endif //__GPUTYPES_H__
#endif //__GPUTYPES_H__
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
View file @
1010df33
...
@@ -54,15 +54,8 @@ struct Atom {
...
@@ -54,15 +54,8 @@ struct Atom {
float
fx
;
float
fx
;
float
fy
;
float
fy
;
float
fz
;
float
fz
;
float
eps2
;
float
sig2
;
};
};
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
)
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
)
...
@@ -79,310 +72,102 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
...
@@ -79,310 +72,102 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
__global__
void
kCalculateCDLJForces_kernel
()
// Include versions of the kernels for N^2 calculations.
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
#define METHOD_NAME(a, b) a##N2##b
// into shared memory
#include "kCalculateCDLJForces.h"
end
=
end
-
pos
;
#define USE_OUTPUT_BUFFER_PER_WARP
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
while
(
pos
>=
0
)
// Include versions of the kernels with cutoffs.
{
// Extract cell coordinates from appropriate work unit
#undef METHOD_NAME
unsigned
int
x
=
sWorkUnit
[
pos
];
#undef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
#define USE_CUTOFF
bool
bExclusionFlag
=
(
x
&
0x1
);
#define METHOD_NAME(a, b) a##Cutoff##b
x
=
(
x
>>
17
)
<<
GRIDBITS
;
#include "kCalculateCDLJForces.h"
float4
apos
;
// Local atom x, y, z, q
#include "kFindInteractingBlocks.h"
float3
af
;
// Local atom fx, fy, fz
#define USE_OUTPUT_BUFFER_PER_WARP
float
dx
;
#undef METHOD_NAME
float
dy
;
#define METHOD_NAME(a, b) a##CutoffByWarp##b
float
dz
;
#include "kCalculateCDLJForces.h"
float
r2
;
float
invR
;
float
sig
;
float
sig2
;
float
sig6
;
float
eps
;
float
dEdR
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
a
.
x
+
psA
[
j
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
j
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
// Include versions of the kernels with periodic boundary conditions.
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
#undef METHOD_NAME
{
#undef USE_OUTPUT_BUFFER_PER_WARP
dx
=
psA
[
tj
].
x
-
apos
.
x
;
#define USE_PERIODIC
dy
=
psA
[
tj
].
y
-
apos
.
y
;
#define METHOD_NAME(a, b) a##Periodic##b
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#include "kCalculateCDLJForces.h"
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#include "kFindInteractingBlocks.h"
invR
=
1.0
f
/
sqrt
(
r2
);
#define USE_OUTPUT_BUFFER_PER_WARP
sig
=
a
.
x
+
psA
[
tj
].
sig
;
#undef METHOD_NAME
sig2
=
invR
*
sig
;
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
sig2
*=
sig2
;
#include "kCalculateCDLJForces.h"
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
__global__
extern
void
kCalculateCDLJCutoffForces_12_kernel
();
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
// printf("kCalculateCDLJCutoffForces\n");
CUDPPResult
result
;
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
case
NO_CUTOFF
:
dy
=
psA
[
j
].
y
-
apos
.
y
;
if
(
gpu
->
bOutputBufferPerWarp
)
dz
=
psA
[
j
].
z
-
apos
.
z
;
kCalculateCDLJN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
invR
=
1.0
f
/
sqrt
(
r2
);
else
sig
=
psA
[
tgx
].
sig2
+
psA
[
j
].
sig
;
kCalculateCDLJN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sig2
=
invR
*
sig
;
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
sig2
*=
sig2
;
LAUNCHERROR
(
"kCalculateCDLJN2Forces"
);
sig6
=
sig2
*
sig2
*
sig2
;
break
;
eps
=
psA
[
tgx
].
eps2
*
psA
[
j
].
eps
;
case
CUTOFF
:
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
dEdR
*=
invR
*
invR
;
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
if
(
!
(
excl
&
0x1
))
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
{
dEdR
=
0.0
f
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
}
exit
(
-
1
);
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
float4
of
;
of
.
x
=
af
.
x
;
of
.
y
=
af
.
y
;
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
}
else
// 100% utilization
gpu
->
psInteractionCount
->
Download
();
{
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
// Read fixed atom data into registers and GRF
if
(
gpu
->
bOutputBufferPerWarp
)
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
kCalculateCDLJCutoffByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
int
j
=
y
+
tgx
;
else
unsigned
int
i
=
x
+
tgx
;
kCalculateCDLJCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
float4
temp
=
cSim
.
pPosq
[
j
];
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
float2
temp1
=
cSim
.
pAttr
[
j
];
LAUNCHERROR
(
"kCalculateCDLJCutoffForces"
);
apos
=
cSim
.
pPosq
[
i
];
break
;
float2
a
=
cSim
.
pAttr
[
i
];
case
PERIODIC
:
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
if
(
result
!=
CUDPP_SUCCESS
)
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
sig2
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps2
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1.0
f
/
sqrt
(
r2
);
sig
=
psA
[
tgx
].
sig2
+
psA
[
tj
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
psA
[
tgx
].
eps2
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
{
dEdR
=
0.0
f
;
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
}
exit
(
-
1
);
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
}
gpu
->
psInteractionCount
->
Download
();
// Write results
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
float4
of
;
if
(
gpu
->
bOutputBufferPerWarp
)
of
.
x
=
af
.
x
;
kCalculateCDLJPeriodicByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
of
.
y
=
af
.
y
;
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
of
.
z
=
af
.
z
;
of
.
w
=
0.0
f
;
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
of
;
}
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
__global__
extern
void
kCalculateCDLJForces_12_kernel
();
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
// printf("kCalculateCDLJForces\n");
if
(
gpu
->
sm_version
<
SM_12
)
kCalculateCDLJForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
else
else
kCalculateCDLJForces_12_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
kCalculateCDLJPeriodicForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
LAUNCHERROR
(
"kCalculateCDLJForces"
);
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
LAUNCHERROR
(
"kCalculateCDLJPeriodicForces"
);
}
}
}
\ No newline at end of file
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
View file @
1010df33
This diff is collapsed.
Click to expand it.
platforms/cuda/src/kernels/kCalculateLocalForces.cu
View file @
1010df33
...
@@ -440,6 +440,8 @@ __global__ void kCalculateLocalForces_kernel()
...
@@ -440,6 +440,8 @@ __global__ void kCalculateLocalForces_kernel()
pos
+=
blockDim
.
x
*
gridDim
.
x
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
if
(
cSim
.
nonbondedMethod
==
NO_CUTOFF
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
while
(
pos
<
cSim
.
LJ14_offset
)
{
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
...
@@ -483,6 +485,110 @@ __global__ void kCalculateLocalForces_kernel()
...
@@ -483,6 +485,110 @@ __global__ void kCalculateLocalForces_kernel()
}
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
}
else
if
(
cSim
.
nonbondedMethod
==
CUTOFF
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
(
inverseR
-
2.0
f
*
cSim
.
reactionFieldK
*
r2
);
dEdR
*=
inverseR
*
inverseR
;
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0.0
f
;
}
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
else
if
(
cSim
.
nonbondedMethod
==
PERIODIC
)
{
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
// Minimum-image convention for the periodic 1-4 interaction: wrap every
// component of the separation vector by its OWN box edge so that the
// nearest periodic image of the pair is used. The y and z shifts must be
// computed from d.y and d.z respectively; deriving them from d.x would
// shift those components by the x displacement instead.
d.x -= floor(d.x / cSim.periodicBoxSizeX + 0.5f) * cSim.periodicBoxSizeX;
d.y -= floor(d.y / cSim.periodicBoxSizeY + 0.5f) * cSim.periodicBoxSizeY;
d.z -= floor(d.z / cSim.periodicBoxSizeZ + 0.5f) * cSim.periodicBoxSizeZ;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
(
inverseR
-
2.0
f
*
cSim
.
reactionFieldK
*
r2
);
dEdR
*=
inverseR
*
inverseR
;
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0.0
f
;
}
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
}
}
...
...
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
View file @
1010df33
...
@@ -53,10 +53,6 @@ struct Atom {
...
@@ -53,10 +53,6 @@ struct Atom {
float
junk
;
float
junk
;
};
};
__shared__
Atom
sA
[
GT2XX_NONBOND_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
GT2XX_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
)
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
)
...
@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
...
@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Zeroes the Born-sum accumulation buffer. Each thread starts at its global
// thread index and advances by the total number of launched threads, writing
// 0.0f to every float slot below cSim.stride * cSim.nonbondOutputBuffers.
__global__ void kClearObcGbsaBornSum_kernel()
{
    float* const bornSum = (float*) cSim.pBornSum;
    const unsigned int threadsInGrid = gridDim.x * blockDim.x;
    for (unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;
         slot < cSim.stride * cSim.nonbondOutputBuffers;
         slot += threadsInGrid)
    {
        bornSum[slot] = 0.0f;
    }
}
__global__
void
kReduceObcGbsaBornSum_kernel
()
__global__
void
kReduceObcGbsaBornSum_kernel
()
{
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
...
@@ -127,175 +167,40 @@ if( 0 ){
...
@@ -127,175 +167,40 @@ if( 0 ){
LAUNCHERROR
(
"kReduceObcGbsaBornSum"
);
LAUNCHERROR
(
"kReduceObcGbsaBornSum"
);
}
}
__global__
void
kCalculateObcGbsaBornSum_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
(
blockIdx
.
x
*
cSim
.
workUnits
)
/
gridDim
.
x
;
int
end
=
((
blockIdx
.
x
+
1
)
*
cSim
.
workUnits
)
/
gridDim
.
x
;
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
-
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float
dx
;
float
dy
;
float
dz
;
float
r2
;
float
r
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
r
=
ar
.
x
;
sA
[
threadIdx
.
x
].
sr
=
ar
.
y
;
apos
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r
=
sqrt
(
r2
);
float
rInverse
=
1.0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
if
((
j
!=
tgx
)
&&
(
ar
.
x
<
rScaledRadiusJ
))
{
float
l_ij
=
1.0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
apos
.
w
+=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
psA
[
j
].
sr
*
psA
[
j
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
ar
.
x
<
(
psA
[
j
].
r
-
r
))
{
apos
.
w
+=
2.0
f
*
((
1.0
f
/
ar
.
x
)
-
l_ij
);
}
}
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sum
=
apos
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r
=
sqrt
(
r2
);
float
rInverse
=
1.0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
if
(
ar
.
x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
psA
[
tj
].
sr
*
psA
[
tj
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
ar
.
x
<
(
psA
[
tj
].
sr
-
r
))
{
term
+=
2.0
f
*
((
1.0
f
/
ar
.
x
)
-
l_ij
);
}
apos
.
w
+=
term
;
}
float
rScaledRadiusI
=
r
+
ar
.
y
;
if
(
psA
[
tj
].
r
<
rScaledRadiusI
)
{
float
l_ij
=
1.0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
ar
.
y
));
float
u_ij
=
1.0
f
/
rScaledRadiusI
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0.25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0.50
f
*
rInverse
*
ratio
)
+
(
0.25
f
*
ar
.
y
*
ar
.
y
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
if
(
psA
[
tj
].
r
<
(
ar
.
y
-
r
))
{
term
+=
2.0
f
*
((
1.0
f
/
psA
[
tj
].
r
)
-
l_ij
);
}
psA
[
tj
].
sum
+=
term
;
}
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
sA
[
threadIdx
.
x
].
sum
;
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
void
kCalculateObcGbsaBornSum
(
gpuContext
gpu
)
void
kCalculateObcGbsaBornSum
(
gpuContext
gpu
)
{
{
// printf("kCalculateObcgbsaBornSum\n");
// printf("kCalculateObcgbsaBornSum\n");
kCalculateObcGbsaBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
kClearObcGbsaBornSum_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
LAUNCHERROR
(
"kClearBornSum"
);
size_t
numWithInteractions
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
kCalculateObcGbsaN2BornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
break
;
case
CUTOFF
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaCutoffByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaCutoffBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
case
PERIODIC
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaPeriodicBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
}
LAUNCHERROR
(
"kCalculateBornSum"
);
LAUNCHERROR
(
"kCalculateBornSum"
);
}
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
View file @
1010df33
...
@@ -52,18 +52,9 @@ struct Atom {
...
@@ -52,18 +52,9 @@ struct Atom {
float
fy
;
float
fy
;
float
fz
;
float
fz
;
float
fb
;
float
fb
;
// float sum;
// float oneOverR;
int
pos
;
int
wx
;
int
wy
;
};
};
__shared__
Atom
sA
[
G8X_BORNFORCE2_THREADS_PER_BLOCK
];
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
__shared__
unsigned
int
sNext
[
GRID
];
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
)
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
)
...
@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
...
@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
}
__global__
void
kCalculateObcGbsaForces2_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
bf2WorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
bf2WorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
bf2WorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
bf2WorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
sA
[
threadIdx
.
x
].
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
sA
[
threadIdx
.
x
].
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
sA
[
threadIdx
.
x
].
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pObcData
[
i
];
float
fb
=
cSim
.
pBornForce
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
sA
[
threadIdx
.
x
].
wx
=
x
;
sA
[
threadIdx
.
x
].
wy
=
y
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
// float sum = 0.0f;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
// float oneOverR = 1.0f / a.x;
sA
[
threadIdx
.
x
].
r
=
a
.
x
;
sA
[
threadIdx
.
x
].
sr
=
a
.
y
;
sA
[
threadIdx
.
x
].
sr2
=
a
.
y
*
a
.
y
;
sA
[
threadIdx
.
x
].
fb
=
fb
;
for
(
unsigned
int
j
=
sNext
[
tgx
];
j
!=
tgx
;
j
=
sNext
[
j
])
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
// Atom I Born forces and sum
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
float
l_ij
=
1.0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
rInverse
=
1.0
f
/
r
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
r2Inverse
=
rInverse
*
rInverse
;
float
t1
=
log
(
u_ij
/
l_ij
);
float
t2
=
(
l_ij2
-
u_ij2
);
float
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
float
term
=
0.125
f
*
(
1.000
f
+
psA
[
j
].
sr2
*
r2Inverse
)
*
t3
+
0.250
f
*
t1
*
r2Inverse
;
float
dE
=
fb
*
term
;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
if
(
a
.
x
>=
rScaledRadiusJ
)
{
dE
=
/*term =*/
0.0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
j
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
j
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
j
].
fz
+=
d
;
// sum += term;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
float4
of
;
of
.
x
=
af
.
x
+
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
af
.
y
+
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
af
.
z
+
sA
[
threadIdx
.
x
].
fz
;
of
.
w
=
0.0
f
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
sA
[
threadIdx
.
x
].
fb
=
cSim
.
pBornForce
[
j
];
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float
sr2
=
a
.
y
*
a
.
y
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sr2
=
temp1
.
y
*
temp1
.
y
;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
// Atom I Born Forces and sum
float
r2Inverse
=
1.0
f
/
r2
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
float
rInverse
=
1.0
f
/
r
;
float
l_ij
=
1.0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
u_ij
=
1.0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
t1
=
log
(
u_ij
/
l_ij
);
float
t2
=
(
l_ij2
-
u_ij2
);
float
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
float
term
=
0.125
f
*
(
1.000
f
+
psA
[
tj
].
sr2
*
r2Inverse
)
*
t3
+
0.250
f
*
t1
*
r2Inverse
;
float
dE
=
fb
*
term
;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
if
(
a
.
x
>=
rScaledRadiusJ
)
{
dE
=
/*term =*/
0.0
f
;
}
// Include versions of the kernels for N^2 calculations.
float
d
=
dx
*
dE
;
#define METHOD_NAME(a, b) a##N2##b
af
.
x
-=
d
;
#include "kCalculateObcGbsaForces2.h"
psA
[
tj
].
fx
+=
d
;
#define USE_OUTPUT_BUFFER_PER_WARP
d
=
dy
*
dE
;
#undef METHOD_NAME
af
.
y
-=
d
;
#define METHOD_NAME(a, b) a##N2ByWarp##b
psA
[
tj
].
fy
+=
d
;
#include "kCalculateObcGbsaForces2.h"
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
tj
].
fz
+=
d
;
// sum += term;
// Atom J Born Forces and sum
// Include versions of the kernels with cutoffs.
float
rScaledRadiusI
=
r
+
a
.
y
;
l_ij
=
1.0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
a
.
y
));
u_ij
=
1.0
f
/
rScaledRadiusI
;
l_ij2
=
l_ij
*
l_ij
;
u_ij2
=
u_ij
*
u_ij
;
t1
=
log
(
u_ij
/
l_ij
);
t2
=
(
l_ij2
-
u_ij2
);
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
#undef METHOD_NAME
term
=
0.125
f
*
#undef USE_OUTPUT_BUFFER_PER_WARP
(
1.000
f
+
sr2
*
r2Inverse
)
*
t3
+
#define USE_CUTOFF
0.250
f
*
t1
*
r2Inverse
;
#define METHOD_NAME(a, b) a##Cutoff##b
dE
=
psA
[
tj
].
fb
*
term
;
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Born sum term
// Include versions of the kernels with periodic boundary conditions.
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
if
(
psA
[
tj
].
r
>=
rScaledRadiusI
)
{
dE
=
/*term =*/
0.0
f
;
}
dx
*=
dE
;
dy
*=
dE
;
dz
*=
dE
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
// psA[tj].sum += term;
tj
=
sNext
[
tj
];
#undef METHOD_NAME
}
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
// Write results
#define METHOD_NAME(a, b) a##Periodic##b
int
offset
=
sA
[
threadIdx
.
x
].
wx
+
tgx
+
(
sA
[
threadIdx
.
x
].
wy
>>
GRIDBITS
)
*
cSim
.
stride
;
#include "kCalculateObcGbsaForces2.h"
float4
of
;
#define USE_OUTPUT_BUFFER_PER_WARP
of
.
x
=
af
.
x
;
#undef METHOD_NAME
of
.
y
=
af
.
y
;
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
of
.
z
=
af
.
z
;
#include "kCalculateObcGbsaForces2.h"
of
.
w
=
0.0
f
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sum;
offset
=
sA
[
threadIdx
.
x
].
wy
+
tgx
+
(
sA
[
threadIdx
.
x
].
wx
>>
GRIDBITS
)
*
cSim
.
stride
;
of
.
x
=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4b
[
offset
]
=
of
;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
sA
[
threadIdx
.
x
].
pos
-=
cSim
.
bornForce2_workBlock
;
}
}
__global__
extern
void
kCalculateObcGbsaForces2_12_kernel
();
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
)
void
kCalculateObcGbsaForces2
(
gpuContext
gpu
)
{
{
//printf("kCalculateObcGbsaForces2\n");
//printf("kCalculateObcGbsaForces2\n");
if
(
gpu
->
sm_version
<
SM_12
)
size_t
numWithInteractions
;
kCalculateObcGbsaForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
>>>
();
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
else
else
kCalculateObcGbsaForces2_12_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
>>>
();
kCalculateObcGbsaN2Forces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
if
(
0
){
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
workUnits
);
static
int
step
=
0
;
break
;
//int numPrint = -1;
case
CUTOFF
:
step
++
;
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
if
(
gpu
->
bOutputBufferPerWarp
)
//gpuDumpCoordinates( gpu );
kCalculateObcGbsaCutoffByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
kReduceBornSumAndForces
(
gpu
);
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
gpuDumpObcLoop1
(
gpu
);
else
}
kCalculateObcGbsaCutoffForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
case
PERIODIC
:
numWithInteractions
=
gpu
->
psInteractionCount
->
_pSysData
[
0
];
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
else
kCalculateObcGbsaPeriodicForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
numWithInteractions
);
break
;
}
LAUNCHERROR
(
"kCalculateObcGbsaForces2"
);
LAUNCHERROR
(
"kCalculateObcGbsaForces2"
);
}
}
platforms/cuda/src/kernels/kForces.cu
View file @
1010df33
...
@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
...
@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
__global__
void
kClearForces_kernel
()
__global__
void
kClearForces_kernel
()
{
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
stride
4
*
cSim
.
outputBuffers
)
while
(
pos
<
cSim
.
stride
*
cSim
.
outputBuffers
)
{
{
((
float
*
)
cSim
.
pForce4
)
[
pos
]
=
0.0
f
;
cSim
.
pForce4
[
pos
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
)
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
}
}
...
...
platforms/cuda/src/kernels/kVerletUpdate.cu
View file @
1010df33
...
@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
...
@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
__global__
void
kVerletUpdatePart1_kernel
()
__global__
void
kVerletUpdatePart1_kernel
()
{
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
while
(
pos
<
cSim
.
atoms
)
{
{
...
@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
...
@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
__global__
void
kVerletUpdatePart2_kernel
()
__global__
void
kVerletUpdatePart2_kernel
()
{
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
while
(
pos
<
cSim
.
atoms
)
{
{
...
@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
...
@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
extern
__shared__
float3
sCM
[];
extern
__shared__
float3
sCM
[];
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
float3
CM
=
{
0.0
f
,
0.0
f
,
0.0
f
};
float3
CM
=
{
0.0
f
,
0.0
f
,
0.0
f
};
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
while
(
pos
<
cSim
.
atoms
)
{
{
...
...
platforms/reference/src/ReferenceKernels.cpp
View file @
1010df33
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2008 Stanford University and the Authors.
*
* Portions copyright (c) 2008
-2009
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
...
@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
obcParameters
->
setScaledRadiusFactors
(
scaleFactors
);
obcParameters
->
setScaledRadiusFactors
(
scaleFactors
);
obcParameters
->
setSolventDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSolventDielectric
())
);
obcParameters
->
setSolventDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSolventDielectric
())
);
obcParameters
->
setSoluteDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSoluteDielectric
())
);
obcParameters
->
setSoluteDielectric
(
static_cast
<
RealOpenMM
>
(
force
.
getSoluteDielectric
())
);
// If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
for
(
int
i
=
0
;
i
<
system
.
getNumForces
();
i
++
)
{
const
NonbondedForce
*
nonbonded
=
dynamic_cast
<
const
NonbondedForce
*>
(
&
system
.
getForce
(
i
));
if
(
nonbonded
!=
NULL
)
{
if
(
nonbonded
->
getNonbondedMethod
()
!=
NonbondedForce
::
NoCutoff
)
obcParameters
->
setUseCutoff
(
nonbonded
->
getCutoffDistance
());
if
(
nonbonded
->
getNonbondedMethod
()
==
NonbondedForce
::
CutoffPeriodic
)
{
Vec3
boxVectors
[
3
];
nonbonded
->
getPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
RealOpenMM
periodicBoxSize
[
3
];
periodicBoxSize
[
0
]
=
(
RealOpenMM
)
boxVectors
[
0
][
0
];
periodicBoxSize
[
1
]
=
(
RealOpenMM
)
boxVectors
[
1
][
1
];
periodicBoxSize
[
2
]
=
(
RealOpenMM
)
boxVectors
[
2
][
2
];
obcParameters
->
setPeriodic
(
periodicBoxSize
);
}
break
;
}
}
obc
=
new
CpuObc
(
obcParameters
);
obc
=
new
CpuObc
(
obcParameters
);
obc
->
setIncludeAceApproximation
(
true
);
obc
->
setIncludeAceApproximation
(
true
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment