Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
18fb6efc
Commit
18fb6efc
authored
Jul 10, 2012
by
Peter Eastman
Browse files
Implemented mixed precision mode
parent
76716f1c
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
93 additions
and
33 deletions
+93
-33
platforms/cuda2/src/CudaKernels.cpp
platforms/cuda2/src/CudaKernels.cpp
+29
-8
platforms/cuda2/src/CudaNonbondedUtilities.cpp
platforms/cuda2/src/CudaNonbondedUtilities.cpp
+13
-1
platforms/cuda2/src/CudaPlatform.cpp
platforms/cuda2/src/CudaPlatform.cpp
+6
-6
platforms/cuda2/src/kernels/customGBEnergyN2.cu
platforms/cuda2/src/kernels/customGBEnergyN2.cu
+13
-6
platforms/cuda2/src/kernels/gbsaObc1.cu
platforms/cuda2/src/kernels/gbsaObc1.cu
+4
-2
platforms/cuda2/src/kernels/nonbonded.cu
platforms/cuda2/src/kernels/nonbonded.cu
+28
-10
No files found.
platforms/cuda2/src/CudaKernels.cpp
View file @
18fb6efc
...
...
@@ -1977,7 +1977,20 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
gbsaObc1
,
defines
);
map<string, string> replacements;
stringstream defineAccum;
if (cu.getAccumulateInDouble()) {
defineAccum << "typedef double accum;\n";
defineAccum << "typedef double4 accum4;\n";
defines["make_accum4"] = "make_double4";
}
else {
defineAccum << "typedef real accum;\n";
defineAccum << "typedef real4 accum4;\n";
defines["make_accum4"] = "make_real4";
}
replacements["DEFINE_ACCUM"] = defineAccum.str();
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::gbsaObc1, replacements), defines);
computeBornSumKernel = cu.getKernel(module, "computeBornSum");
computeSumArgs.push_back(&bornSum->getDevicePointer());
computeSumArgs.push_back(&cu.getPosq().getDevicePointer());
...
...
@@ -2422,7 +2435,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
map<string, string> replacements;
string n2EnergyStr = n2EnergySource.str();
replacements["COMPUTE_INTERACTION"] = n2EnergyStr;
stringstream
extraArgs
,
atomParams
,
loadLocal1
,
loadLocal2
,
clearLocal
,
load1
,
load2
,
declare1
,
recordDeriv
,
storeDerivs1
,
storeDerivs2
,
declareTemps
,
setTemps
;
stringstream extraArgs, atomParams, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2;
if (force.getNumGlobalParameters() > 0)
extraArgs << ", const float* globals";
pairEnergyUsesParam.resize(params->getBuffers().size(), false);
...
...
@@ -2459,15 +2472,13 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
extraArgs << ", unsigned long long* __restrict__ derivBuffers";
for (int i = 0; i < force.getNumComputedValues(); i++) {
string index = cu.intToString(i+1);
atomParams
<<
"
real
deriv"
<<
index
<<
";
\n
"
;
atomParams << "
accum
deriv" << index << ";\n";
clearLocal << "localData[localAtomIndex].deriv" << index << " = 0;\n";
declare1
<<
"
real
deriv"
<<
index
<<
"_1 = 0;
\n
"
;
declare1 << "
accum
deriv" << index << "_1 = 0;\n";
load2 << "real deriv" << index << "_2 = 0;\n";
recordDeriv << "localData[atom2].deriv" << index << " += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
declareTemps
<<
"__local real tempDerivBuffer"
<<
index
<<
"[64];
\n
"
;
setTemps
<<
"tempDerivBuffer"
<<
index
<<
"[threadIdx.x] = deriv"
<<
index
<<
"_1;
\n
"
;
atomParamSize++;
}
replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
...
...
@@ -2481,9 +2492,19 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
replacements
[
"DECLARE_TEMP_BUFFERS"
]
=
declareTemps
.
str
();
replacements
[
"SET_TEMP_BUFFERS"
]
=
setTemps
.
str
();
map<string, string> defines;
stringstream defineAccum;
if (cu.getAccumulateInDouble()) {
defineAccum << "typedef double accum;\n";
defineAccum << "typedef double3 accum3;\n";
defines["make_accum3"] = "make_double3";
}
else {
defineAccum << "typedef real accum;\n";
defineAccum << "typedef real3 accum3;\n";
defines["make_accum3"] = "make_real3";
}
replacements["DEFINE_ACCUM"] = defineAccum.str();
if (useCutoff)
defines["USE_CUTOFF"] = "1";
if (usePeriodic)
...
...
platforms/cuda2/src/CudaNonbondedUtilities.cpp
View file @
18fb6efc
...
...
@@ -445,10 +445,22 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
()
&&
!
context
.
getAccumulateInDouble
()
)
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"ENABLE_SHUFFLE"
]
=
"1"
;
stringstream
defineAccum
;
if
(
context
.
getAccumulateInDouble
())
{
defineAccum
<<
"typedef double accum;
\n
"
;
defineAccum
<<
"typedef double3 accum3;
\n
"
;
defines
[
"make_accum3"
]
=
"make_double3"
;
}
else
{
defineAccum
<<
"typedef real accum;
\n
"
;
defineAccum
<<
"typedef real3 accum3;
\n
"
;
defines
[
"make_accum3"
]
=
"make_real3"
;
}
replacements
[
"DEFINE_ACCUM"
]
=
defineAccum
.
str
();
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
...
...
platforms/cuda2/src/CudaPlatform.cpp
View file @
18fb6efc
platforms/cuda2/src/kernels/customGBEnergyN2.cu
View file @
18fb6efc
...
...
@@ -2,10 +2,11 @@
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
#define TILE_SIZE 32
DEFINE_ACCUM
typedef
struct
{
real4
posq
;
real
3
force
;
accum
3
force
;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
float
padding
;
...
...
@@ -46,7 +47,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
real
3
force
=
make_
real
3
(
0
);
accum
3
force
=
make_
accum
3
(
0
);
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
...
...
@@ -122,7 +123,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
}
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
force
-=
delta
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
...
...
@@ -140,7 +143,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
localAtomIndex
].
force
=
make_
real
3
(
0
);
localData
[
localAtomIndex
].
force
=
make_
accum
3
(
0
);
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
...
...
@@ -185,9 +188,13 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
-=
delta
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
+=
delta
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
...
...
platforms/cuda2/src/kernels/gbsaObc1.cu
View file @
18fb6efc
...
...
@@ -4,6 +4,8 @@
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
DEFINE_ACCUM
/**
* Reduce the Born sums to compute the Born radii.
*/
...
...
@@ -331,7 +333,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
real
fx
,
fy
,
fz
,
fw
;
accum
fx
,
fy
,
fz
,
fw
;
real
bornRadius
;
}
AtomData2
;
...
...
@@ -372,7 +374,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
real
4
force
=
make_
real
4
(
0
);
accum
4
force
=
make_
accum
4
(
0
);
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
...
...
platforms/cuda2/src/kernels/nonbonded.cu
View file @
18fb6efc
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
DEFINE_ACCUM
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
real
fx
,
fy
,
fz
;
accum
fx
,
fy
,
fz
;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
real
padding
;
...
...
@@ -47,7 +49,7 @@ extern "C" __global__ void computeNonbonded(
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
real
3
force
=
make_
real
3
(
0
);
accum
3
force
=
make_
accum
3
(
0
);
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
...
...
@@ -124,9 +126,13 @@ extern "C" __global__ void computeNonbonded(
COMPUTE_INTERACTION
energy
+=
0.5
f
*
tempEnergy
;
#ifdef USE_SYMMETRIC
force
-=
delta
*
dEdR
;
force
.
x
-=
delta
.
x
*
dEdR
;
force
.
y
-=
delta
.
y
*
dEdR
;
force
.
z
-=
delta
.
z
*
dEdR
;
#else
force
-=
dEdR1
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
...
...
@@ -191,7 +197,9 @@ extern "C" __global__ void computeNonbonded(
#ifdef ENABLE_SHUFFLE
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
-=
delta
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
delta
.
x
+=
__shfl_xor
(
delta
.
x
,
i
,
32
);
delta
.
y
+=
__shfl_xor
(
delta
.
y
,
i
,
32
);
...
...
@@ -203,7 +211,9 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
j
].
fz
+=
delta
.
z
;
}
#else
force
-=
dEdR1
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
dEdR2
.
x
+=
__shfl_xor
(
dEdR2
.
x
,
i
,
32
);
dEdR2
.
y
+=
__shfl_xor
(
dEdR2
.
y
,
i
,
32
);
...
...
@@ -218,12 +228,16 @@ extern "C" __global__ void computeNonbonded(
#else
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
-=
delta
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
tempBuffer
[
bufferIndex
]
=
delta
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
delta
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
delta
.
z
;
#else
force
-=
dEdR1
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
tempBuffer
[
bufferIndex
]
=
dEdR2
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
dEdR2
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
dEdR2
.
z
;
...
...
@@ -287,12 +301,16 @@ extern "C" __global__ void computeNonbonded(
energy
+=
tempEnergy
;
#ifdef USE_SYMMETRIC
delta
*=
dEdR
;
force
-=
delta
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
#else
force
-=
dEdR1
;
force
.
x
-=
dEdR1
.
x
;
force
.
y
-=
dEdR1
.
y
;
force
.
z
-=
dEdR1
.
z
;
localData
[
tbx
+
tj
].
fx
+=
dEdR2
.
x
;
localData
[
tbx
+
tj
].
fy
+=
dEdR2
.
y
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment