Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
b0161939
Commit
b0161939
authored
Oct 05, 2012
by
Peter Eastman
Browse files
Eliminated mixed precision accumulation for nonbonded forces, which had negligible benefit
parent
c34bd3b2
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
9 additions
and
51 deletions
+9
-51
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+2
-26
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+0
-12
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+3
-5
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+2
-4
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+2
-4
No files found.
platforms/cuda/src/CudaKernels.cpp
View file @
b0161939
...
@@ -2040,18 +2040,6 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
...
@@ -2040,18 +2040,6 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
cu
.
intToString
(
nb
.
getForceThreadBlockSize
());
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
cu
.
intToString
(
nb
.
getForceThreadBlockSize
());
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
stringstream
defineAccum
;
if
(
cu
.
getUseMixedPrecision
())
{
defineAccum
<<
"typedef double accum;
\n
"
;
defineAccum
<<
"typedef double4 accum4;
\n
"
;
defines
[
"make_accum4"
]
=
"make_double4"
;
}
else
{
defineAccum
<<
"typedef real accum;
\n
"
;
defineAccum
<<
"typedef real4 accum4;
\n
"
;
defines
[
"make_accum4"
]
=
"make_real4"
;
}
replacements
[
"DEFINE_ACCUM"
]
=
defineAccum
.
str
();
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replaceStrings
(
CudaKernelSources
::
gbsaObc1
,
replacements
),
defines
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replaceStrings
(
CudaKernelSources
::
gbsaObc1
,
replacements
),
defines
);
computeBornSumKernel
=
cu
.
getKernel
(
module
,
"computeBornSum"
);
computeBornSumKernel
=
cu
.
getKernel
(
module
,
"computeBornSum"
);
computeSumArgs
.
push_back
(
&
bornSum
->
getDevicePointer
());
computeSumArgs
.
push_back
(
&
bornSum
->
getDevicePointer
());
...
@@ -2534,9 +2522,9 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
...
@@ -2534,9 +2522,9 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
extraArgs
<<
", unsigned long long* __restrict__ derivBuffers"
;
extraArgs
<<
", unsigned long long* __restrict__ derivBuffers"
;
for
(
int
i
=
0
;
i
<
force
.
getNumComputedValues
();
i
++
)
{
for
(
int
i
=
0
;
i
<
force
.
getNumComputedValues
();
i
++
)
{
string
index
=
cu
.
intToString
(
i
+
1
);
string
index
=
cu
.
intToString
(
i
+
1
);
atomParams
<<
"
accum
deriv"
<<
index
<<
";
\n
"
;
atomParams
<<
"
real
deriv"
<<
index
<<
";
\n
"
;
clearLocal
<<
"localData[localAtomIndex].deriv"
<<
index
<<
" = 0;
\n
"
;
clearLocal
<<
"localData[localAtomIndex].deriv"
<<
index
<<
" = 0;
\n
"
;
declare1
<<
"
accum
deriv"
<<
index
<<
"_1 = 0;
\n
"
;
declare1
<<
"
real
deriv"
<<
index
<<
"_1 = 0;
\n
"
;
load2
<<
"real deriv"
<<
index
<<
"_2 = 0;
\n
"
;
load2
<<
"real deriv"
<<
index
<<
"_2 = 0;
\n
"
;
recordDeriv
<<
"localData[atom2].deriv"
<<
index
<<
" += deriv"
<<
index
<<
"_2;
\n
"
;
recordDeriv
<<
"localData[atom2].deriv"
<<
index
<<
" += deriv"
<<
index
<<
"_2;
\n
"
;
storeDerivs1
<<
"STORE_DERIVATIVE_1("
<<
index
<<
")
\n
"
;
storeDerivs1
<<
"STORE_DERIVATIVE_1("
<<
index
<<
")
\n
"
;
...
@@ -2555,18 +2543,6 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
...
@@ -2555,18 +2543,6 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
map
<
string
,
string
>
defines
;
map
<
string
,
string
>
defines
;
stringstream
defineAccum
;
if
(
cu
.
getUseMixedPrecision
())
{
defineAccum
<<
"typedef double accum;
\n
"
;
defineAccum
<<
"typedef double3 accum3;
\n
"
;
defines
[
"make_accum3"
]
=
"make_double3"
;
}
else
{
defineAccum
<<
"typedef real accum;
\n
"
;
defineAccum
<<
"typedef real3 accum3;
\n
"
;
defines
[
"make_accum3"
]
=
"make_real3"
;
}
replacements
[
"DEFINE_ACCUM"
]
=
defineAccum
.
str
();
if
(
useCutoff
)
if
(
useCutoff
)
defines
[
"USE_CUTOFF"
]
=
"1"
;
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
usePeriodic
)
if
(
usePeriodic
)
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
b0161939
...
@@ -450,18 +450,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -450,18 +450,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
stringstream
defineAccum
;
if
(
context
.
getUseMixedPrecision
())
{
defineAccum
<<
"typedef double accum;
\n
"
;
defineAccum
<<
"typedef double3 accum3;
\n
"
;
defines
[
"make_accum3"
]
=
"make_double3"
;
}
else
{
defineAccum
<<
"typedef real accum;
\n
"
;
defineAccum
<<
"typedef real3 accum3;
\n
"
;
defines
[
"make_accum3"
]
=
"make_real3"
;
}
replacements
[
"DEFINE_ACCUM"
]
=
defineAccum
.
str
();
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
b0161939
...
@@ -2,11 +2,9 @@
...
@@ -2,11 +2,9 @@
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
#define TILE_SIZE 32
#define TILE_SIZE 32
DEFINE_ACCUM
typedef
struct
{
typedef
struct
{
real4
posq
;
real4
posq
;
accum
3
force
;
real
3
force
;
ATOM_PARAMETER_DATA
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
#ifdef NEED_PADDING
float
padding
;
float
padding
;
...
@@ -47,7 +45,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -47,7 +45,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
unsigned
int
x
,
y
;
accum
3
force
=
make_
accum
3
(
0
);
real
3
force
=
make_
real
3
(
0
);
DECLARE_ATOM1_DERIVATIVES
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
...
@@ -143,7 +141,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -143,7 +141,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
localData
[
localAtomIndex
].
force
=
make_
accum
3
(
0
);
localData
[
localAtomIndex
].
force
=
make_
real
3
(
0
);
CLEAR_LOCAL_DERIVATIVES
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
...
...
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
b0161939
...
@@ -4,8 +4,6 @@
...
@@ -4,8 +4,6 @@
#define TILE_SIZE 32
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
DEFINE_ACCUM
/**
/**
* Reduce the Born sums to compute the Born radii.
* Reduce the Born sums to compute the Born radii.
*/
*/
...
@@ -333,7 +331,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -333,7 +331,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
typedef
struct
{
typedef
struct
{
real
x
,
y
,
z
;
real
x
,
y
,
z
;
real
q
;
real
q
;
accum
fx
,
fy
,
fz
,
fw
;
real
fx
,
fy
,
fz
,
fw
;
real
bornRadius
;
real
bornRadius
;
}
AtomData2
;
}
AtomData2
;
...
@@ -374,7 +372,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -374,7 +372,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
unsigned
int
x
,
y
;
accum
4
force
=
make_
accum
4
(
0
);
real
4
force
=
make_
real
4
(
0
);
if
(
pos
<
end
)
{
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
b0161939
#define TILE_SIZE 32
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
DEFINE_ACCUM
typedef
struct
{
typedef
struct
{
real
x
,
y
,
z
;
real
x
,
y
,
z
;
real
q
;
real
q
;
accum
fx
,
fy
,
fz
;
real
fx
,
fy
,
fz
;
ATOM_PARAMETER_DATA
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
#ifndef PARAMETER_SIZE_IS_EVEN
real
padding
;
real
padding
;
...
@@ -49,7 +47,7 @@ extern "C" __global__ void computeNonbonded(
...
@@ -49,7 +47,7 @@ extern "C" __global__ void computeNonbonded(
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
unsigned
int
x
,
y
;
accum
3
force
=
make_
accum
3
(
0
);
real
3
force
=
make_
real
3
(
0
);
if
(
pos
<
end
)
{
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment