Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
f6346776
Commit
f6346776
authored
Jun 27, 2012
by
Peter Eastman
Browse files
Continuing to implement new CUDA platform: CustomGBForce
parent
5feaa943
Changes
13
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
1911 additions
and
995 deletions
+1911
-995
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+1
-0
platforms/cuda2/src/CudaContext.h
platforms/cuda2/src/CudaContext.h
+8
-2
platforms/cuda2/src/CudaKernelFactory.cpp
platforms/cuda2/src/CudaKernelFactory.cpp
+2
-2
platforms/cuda2/src/CudaKernels.cpp
platforms/cuda2/src/CudaKernels.cpp
+831
-936
platforms/cuda2/src/CudaKernels.h
platforms/cuda2/src/CudaKernels.h
+52
-52
platforms/cuda2/src/CudaNonbondedUtilities.cpp
platforms/cuda2/src/CudaNonbondedUtilities.cpp
+2
-3
platforms/cuda2/src/kernels/customGBChainRule.cu
platforms/cuda2/src/kernels/customGBChainRule.cu
+19
-0
platforms/cuda2/src/kernels/customGBEnergyN2.cu
platforms/cuda2/src/kernels/customGBEnergyN2.cu
+224
-0
platforms/cuda2/src/kernels/customGBEnergyPerParticle.cu
platforms/cuda2/src/kernels/customGBEnergyPerParticle.cu
+20
-0
platforms/cuda2/src/kernels/customGBGradientChainRule.cu
platforms/cuda2/src/kernels/customGBGradientChainRule.cu
+16
-0
platforms/cuda2/src/kernels/customGBValueN2.cu
platforms/cuda2/src/kernels/customGBValueN2.cu
+250
-0
platforms/cuda2/src/kernels/customGBValuePerParticle.cu
platforms/cuda2/src/kernels/customGBValuePerParticle.cu
+17
-0
platforms/cuda2/tests/TestCudaCustomGBForce.cpp
platforms/cuda2/tests/TestCudaCustomGBForce.cpp
+469
-0
No files found.
platforms/cuda2/src/CudaContext.cpp
View file @
f6346776
...
@@ -123,6 +123,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -123,6 +123,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int
major
,
minor
;
int
major
,
minor
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
computeCapability
=
major
+
0.1
*
minor
;
defaultOptimizationOptions
=
"--use_fast_math"
;
defaultOptimizationOptions
=
"--use_fast_math"
;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
if
(
useBlockingSync
)
...
...
platforms/cuda2/src/CudaContext.h
View file @
f6346776
...
@@ -105,10 +105,16 @@ public:
...
@@ -105,10 +105,16 @@ public:
CUdevice
getDevice
()
{
CUdevice
getDevice
()
{
return
device
;
return
device
;
}
}
/**
* Get the compute capability of the device associated with this object.
*/
double
getComputeCapability
()
const
{
return
computeCapability
;
}
/**
/**
* Get the index of the CUdevice associated with this object.
* Get the index of the CUdevice associated with this object.
*/
*/
int
getDeviceIndex
()
{
int
getDeviceIndex
()
const
{
return
deviceIndex
;
return
deviceIndex
;
}
}
/**
/**
...
@@ -444,7 +450,7 @@ private:
...
@@ -444,7 +450,7 @@ private:
void
validateMolecules
();
void
validateMolecules
();
static
bool
hasInitializedCuda
;
static
bool
hasInitializedCuda
;
const
System
&
system
;
const
System
&
system
;
double
time
;
double
time
,
computeCapability
;
CudaPlatform
::
PlatformData
&
platformData
;
CudaPlatform
::
PlatformData
&
platformData
;
int
deviceIndex
;
int
deviceIndex
;
int
contextIndex
;
int
contextIndex
;
...
...
platforms/cuda2/src/CudaKernelFactory.cpp
View file @
f6346776
...
@@ -98,8 +98,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
...
@@ -98,8 +98,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return
new
CudaCalcCustomNonbondedForceKernel
(
name
,
platform
,
cu
,
context
.
getSystem
());
return
new
CudaCalcCustomNonbondedForceKernel
(
name
,
platform
,
cu
,
context
.
getSystem
());
if
(
name
==
CalcGBSAOBCForceKernel
::
Name
())
if
(
name
==
CalcGBSAOBCForceKernel
::
Name
())
return
new
CudaCalcGBSAOBCForceKernel
(
name
,
platform
,
cu
);
return
new
CudaCalcGBSAOBCForceKernel
(
name
,
platform
,
cu
);
//
if (name == CalcCustomGBForceKernel::Name())
if
(
name
==
CalcCustomGBForceKernel
::
Name
())
//
return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
return
new
CudaCalcCustomGBForceKernel
(
name
,
platform
,
cu
,
context
.
getSystem
());
if
(
name
==
CalcCustomExternalForceKernel
::
Name
())
if
(
name
==
CalcCustomExternalForceKernel
::
Name
())
return
new
CudaCalcCustomExternalForceKernel
(
name
,
platform
,
cu
,
context
.
getSystem
());
return
new
CudaCalcCustomExternalForceKernel
(
name
,
platform
,
cu
,
context
.
getSystem
());
if
(
name
==
CalcCustomHbondForceKernel
::
Name
())
if
(
name
==
CalcCustomHbondForceKernel
::
Name
())
...
...
platforms/cuda2/src/CudaKernels.cpp
View file @
f6346776
This diff is collapsed.
Click to expand it.
platforms/cuda2/src/CudaKernels.h
View file @
f6346776
...
@@ -715,58 +715,58 @@ private:
...
@@ -715,58 +715,58 @@ private:
std
::
vector
<
void
*>
computeSumArgs
,
force1Args
;
std
::
vector
<
void
*>
computeSumArgs
,
force1Args
;
};
};
//
/**
/**
//
* This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
* This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
//
*/
*/
//
class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
class
CudaCalcCustomGBForceKernel
:
public
CalcCustomGBForceKernel
{
//
public:
public:
//
CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
CudaCalcCustomGBForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
System
&
system
)
:
CalcCustomGBForceKernel
(
name
,
platform
),
//
hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
hasInitializedKernels
(
false
),
cu
(
cu
),
params
(
NULL
),
computedValues
(
NULL
),
energyDerivs
(
NULL
),
longEnergyDerivs
(
NULL
),
globals
(
NULL
),
//
valueBuffers(NULL),
longValueBuffers(NULL),
tabulatedFunctionParams(NULL), system(system) {
valueBuffers
(
NULL
),
tabulatedFunctionParams
(
NULL
),
system
(
system
)
{
//
}
}
//
~CudaCalcCustomGBForceKernel();
~
CudaCalcCustomGBForceKernel
();
//
/**
/**
//
* Initialize the kernel.
* Initialize the kernel.
//
*
*
//
* @param system the System this kernel will be applied to
* @param system the System this kernel will be applied to
//
* @param force the CustomGBForce this kernel will be used for
* @param force the CustomGBForce this kernel will be used for
//
*/
*/
//
void initialize(const System& system, const CustomGBForce& force);
void
initialize
(
const
System
&
system
,
const
CustomGBForce
&
force
);
//
/**
/**
//
* Execute the kernel to calculate the forces and/or energy.
* Execute the kernel to calculate the forces and/or energy.
//
*
*
//
* @param context the context in which to execute this kernel
* @param context the context in which to execute this kernel
//
* @param includeForces true if forces should be calculated
* @param includeForces true if forces should be calculated
//
* @param includeEnergy true if the energy should be calculated
* @param includeEnergy true if the energy should be calculated
//
* @return the potential energy due to the force
* @return the potential energy due to the force
//
*/
*/
//
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
double
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
);
//
/**
/**
//
* Copy changed parameters over to a context.
* Copy changed parameters over to a context.
//
*
*
//
* @param context the context to copy parameters to
* @param context the context to copy parameters to
//
* @param force the CustomGBForce to copy the parameters from
* @param force the CustomGBForce to copy the parameters from
//
*/
*/
//
void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
void
copyParametersToContext
(
ContextImpl
&
context
,
const
CustomGBForce
&
force
);
//
private:
private:
//
bool hasInitializedKernels, needParameterGradient;
bool
hasInitializedKernels
,
needParameterGradient
;
//
int maxTiles, numComputedValues;
int
maxTiles
,
numComputedValues
;
//
CudaContext& cu;
CudaContext
&
cu
;
//
CudaParameterSet* params;
CudaParameterSet
*
params
;
//
CudaParameterSet* computedValues;
CudaParameterSet
*
computedValues
;
//
CudaParameterSet* energyDerivs;
CudaParameterSet
*
energyDerivs
;
//
CudaArray
<cl_long>
* longEnergyDerivs;
CudaArray
*
longEnergyDerivs
;
//
CudaArray
<cl_float>
* globals;
CudaArray
*
globals
;
//
CudaArray
<cl_float>
* valueBuffers;
CudaArray
*
valueBuffers
;
//
CudaArray
<cl_long>* longValueBuffer
s;
CudaArray
*
tabulatedFunctionParam
s
;
//
CudaArray<mm_float4>* tabulatedFunction
Params;
std
::
vector
<
std
::
string
>
global
Param
Name
s
;
//
std::vector<
std::string
> globalParam
Nam
es;
std
::
vector
<
float
>
globalParam
Valu
es
;
//
std::vector<
cl_float> globalParamValue
s;
std
::
vector
<
CudaArray
*>
tabulatedFunction
s
;
//
std::vector<
CudaArray<mm_float4>*> tabulatedFunctions
;
std
::
vector
<
bool
>
pairValueUsesParam
,
pairEnergyUsesParam
,
pairEnergyUsesValue
;
//
std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue
;
System
&
system
;
//
System& system
;
CUfunction
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
//
CUfunction
pairValue
Kernel
, perParticleValue
Kernel
, pairEnergy
Kernel
, perParticleEnergy
Kernel
, gradientChainRule
Kernel
;
std
::
vector
<
void
*>
pairValue
Args
,
perParticleValue
Args
,
pairEnergy
Args
,
perParticleEnergy
Args
,
gradientChainRule
Args
;
//
};
};
/**
/**
* This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
* This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
...
...
platforms/cuda2/src/CudaNonbondedUtilities.cpp
View file @
f6346776
...
@@ -52,7 +52,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
...
@@ -52,7 +52,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
int
multiprocessors
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
numForceThreadBlocks
=
2
*
multiprocessors
;
numForceThreadBlocks
=
2
*
multiprocessors
;
forceThreadBlockSize
=
256
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
)
;
}
}
CudaNonbondedUtilities
::~
CudaNonbondedUtilities
()
{
CudaNonbondedUtilities
::~
CudaNonbondedUtilities
()
{
...
@@ -441,8 +441,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -441,8 +441,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
string
file
;
CUmodule
program
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUmodule
program
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
nonbonded
,
replacements
),
defines
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
CUfunction
kernel
=
context
.
getKernel
(
program
,
"computeNonbonded"
);
// Set arguments to the Kernel.
// Set arguments to the Kernel.
...
...
platforms/cuda2/src/kernels/customGBChainRule.cu
0 → 100644
View file @
f6346776
// Chain-rule force snippet for CustomGBForce.
// This fragment is not a complete function: it declares none of atom1, atom2,
// r2, invR, dEdR, dEdR1 or dEdR2, so it is presumably spliced textually into a
// pair-interaction kernel that provides them (TODO confirm against the host code
// that assembles the kernel source).
// The guard skips padding atoms (index >= NUM_ATOMS) and self-interactions; with
// USE_CUTOFF it additionally requires r2 < CUTOFF_SQUARED.
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#endif
// Zero-initialized accumulators that the COMPUTE_FORCE expansion is expected to
// fill in: one scalar in the symmetric case, one vector per atom otherwise.
#ifdef USE_SYMMETRIC
real
tempForce
=
0
;
#else
real3
tempForce1
=
make_real3
(
0
);
real3
tempForce2
=
make_real3
(
0
);
#endif
// COMPUTE_FORCE is a placeholder defined outside this file.
COMPUTE_FORCE
// Fold the result into the caller's running derivative(s). The symmetric form
// scales the scalar by invR before adding it to dEdR.
#ifdef USE_SYMMETRIC
dEdR
+=
tempForce
*
invR
;
#else
dEdR1
+=
tempForce1
;
dEdR2
+=
tempForce2
;
#endif
}
platforms/cuda2/src/kernels/customGBEnergyN2.cu
0 → 100644
View file @
f6346776
// STORE_DERIVATIVE_1(INDEX): atomically adds the local variable deriv<INDEX>_1,
// multiplied by 0xFFFFFFFF and truncated to a 64-bit integer, into
// derivBuffers[offset + (INDEX-1)*PADDED_NUM_ATOMS]. `derivBuffers` and `offset`
// must be in scope where the macro is expanded. The expansion already ends with
// a semicolon.
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0xFFFFFFFF)));
// STORE_DERIVATIVE_2(INDEX): same as above, but the value comes from the
// deriv<INDEX> field of this thread's localData entry.
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
// Number of atoms along each side of an interaction tile, and the number of
// consecutive threads that cooperate on one tile.
// NOTE(review): presumably chosen to equal the hardware warp size - confirm.
#define TILE_SIZE 32
/**
 * Per-thread record stored in the energy kernel's shared localData array.
 * It holds the position/charge vector and the force accumulated for one atom of
 * the tile, followed by whatever fields ATOM_PARAMETER_DATA expands to (that
 * macro is defined outside this file).
 */
struct AtomData {
    real4 posq;
    real3 force;
    ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
    // Extra word inserted only when the build defines NEED_PADDING.
    float padding;
#endif
};
/**
 * Compute a force based on pair interactions.
 *
 * Work is divided into tiles of TILE_SIZE x TILE_SIZE atom pairs. Each group of
 * TILE_SIZE consecutive threads is assigned a contiguous range [pos, end) of
 * tiles and processes them one after another. For every tile, thread lane `tgx`
 * owns atom x*TILE_SIZE+tgx and loops over the TILE_SIZE atoms of block y, which
 * are staged in the shared localData array.
 *
 * Forces are accumulated in 64-bit fixed point: each component is multiplied by
 * 0xFFFFFFFF, truncated, and added atomically to forceBuffers. The x, y and z
 * components are stored in three consecutive planes of PADDED_NUM_ATOMS entries.
 *
 * The interaction itself, the parameter loading and the derivative bookkeeping
 * are supplied through macros defined outside this file (COMPUTE_INTERACTION,
 * LOAD_*_PARAMETERS, DECLARE_ATOM1_DERIVATIVES, CLEAR_LOCAL_DERIVATIVES,
 * RECORD_DERIVATIVE_2, STORE_DERIVATIVES_1/2, PARAMETER_ARGUMENTS).
 *
 * NOTE(review): localData, exclusionRange and exclusionIndex are written by some
 * lanes and read by others with no explicit barrier in this kernel. This
 * presumably relies on the TILE_SIZE threads of a group executing in lockstep -
 * confirm before porting to hardware where that is not guaranteed.
 *
 * @param forceBuffers        fixed-point force accumulators (3 planes of PADDED_NUM_ATOMS)
 * @param energyBuffer        one slot per launched thread; this thread's energy is added to its slot
 * @param posq                per-atom position vector; only x, y, z are read directly here
 * @param exclusions          exclusion bit masks, one 32-bit word per atom of a tile (a cleared bit marks an excluded pair)
 * @param exclusionIndices    for each stored exclusion tile, the y block index it belongs to
 * @param exclusionRowIndices for block x, entries [x] and [x+1] bound the range of exclusionIndices to search
 * @param tiles               (USE_CUTOFF) x/y block indices of each tile to process
 * @param interactionCount    (USE_CUTOFF) element 0 holds the number of entries in tiles
 * @param periodicBoxSize     (USE_CUTOFF) box dimensions used for the USE_PERIODIC wrap
 * @param invPeriodicBoxSize  (USE_CUTOFF) reciprocal box dimensions
 * @param maxTiles            (USE_CUTOFF) if interactionCount[0] exceeds this, every tile is processed instead
 * @param interactionFlags    (USE_CUTOFF) per-tile flag word; 0 means the tile can be skipped
 * @param numTiles            (no cutoff) total number of tiles to process
 */
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices,
        const unsigned int* __restrict__ exclusionRowIndices,
#ifdef USE_CUTOFF
        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    // If the tile list overflowed, fall back to visiting every tile of the upper triangle.
    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
    real energy = 0;
    unsigned int lasty = 0xFFFFFFFF; // block whose atoms are currently staged in localData (none yet)
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __shared__ int exclusionIndex[WARPS_PER_GROUP];
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);     // lane within the tile
        const unsigned int tbx = threadIdx.x - tgx;               // first localData slot of this group
        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
        unsigned int x, y;
        real3 force = make_real3(0);
        DECLARE_ATOM1_DERIVATIVES
        if (pos < end) {
#ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
                x = tileIndices.x;
                y = tileIndices.y;
            }
            else
#endif
            {
                // Invert the triangular numbering of tiles to recover (x, y) from pos.
                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                    y += (x < y ? -1 : 1);
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

            // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
            if (tgx < 2)
                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
            if (tgx == 0)
                exclusionIndex[localGroupIndex] = -1;
            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
                if (exclusionIndices[i] == y)
                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
            bool hasExclusions = false;
#endif
            // The original code tested "pos >= end" again here; that branch could never be
            // taken inside this block and has been removed.
            if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = threadIdx.x;
                localData[localAtomIndex].posq = posq1;
                LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    int atom2 = tbx+j; // localData slot; replaced by the global atom index below
                    real4 posq2 = localData[atom2].posq;
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real dEdR = 0;
                        real tempEnergy = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        // Each pair of a diagonal tile is visited from both atoms, so count half.
                        energy += 0.5f*tempEnergy;
                        delta *= dEdR;
                        force -= delta;
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }
            }
            else {
                // This is an off-diagonal tile.

                const unsigned int localAtomIndex = threadIdx.x;
                if (lasty != y) {
                    // Block y is not staged yet: load its atoms into shared memory.
                    unsigned int j = y*TILE_SIZE + tgx;
                    localData[localAtomIndex].posq = posq[j];
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
                localData[localAtomIndex].force = make_real3(0);
                CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                if (!hasExclusions && flags == 0) {
                    // No interactions in this tile.
                }
                else
#endif
                {
                    // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
                    // Rotate the mask right by tgx so that bit 0 matches this lane's starting
                    // column. The left-shift count is masked because shifting a 32-bit value by
                    // 32 (the tgx == 0 case) is undefined in C++; the result for tgx == 0 is
                    // simply the unrotated mask.
                    excl = (excl >> tgx) | (excl << ((TILE_SIZE - tgx) & (TILE_SIZE - 1)));
#endif
                    // Each lane starts at a different column and walks the tile cyclically.
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
#endif
                        int atom2 = tbx+tj;
                        real4 posq2 = localData[atom2].posq;
                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                            real invR = RSQRT(r2);
                            real r = RECIP(invR);
                            LOAD_ATOM2_PARAMETERS
                            atom2 = y*TILE_SIZE+tj;
                            real dEdR = 0;
                            real tempEnergy = 0;
                            if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                                COMPUTE_INTERACTION
                                dEdR /= -r;
                            }
                            energy += tempEnergy;
                            delta *= dEdR;
                            force -= delta;
                            atom2 = tbx+tj;
                            localData[atom2].force += delta;
                            RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                        }
#endif
#ifdef USE_EXCLUSIONS
                        excl >>= 1;
#endif
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
            // Remember which block is staged. This is done only when a tile was actually
            // processed, so y is never read while uninitialized.
            lasty = y;
        }

        // Write results.

        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
            STORE_DERIVATIVES_1
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
            STORE_DERIVATIVES_2
        }
        pos++;
    } while (pos < end);
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
platforms/cuda2/src/kernels/customGBEnergyPerParticle.cu
0 → 100644
View file @
f6346776
/**
 * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
 *
 * Atoms are distributed over the launched threads with a grid-sized stride. The
 * per-atom work is supplied by the LOAD_DERIVATIVES and COMPUTE_ENERGY macros,
 * which are defined outside this file and see the locals `index`, `pos`,
 * `force` and `energy`.
 *
 * @param forceBuffers  not referenced by the visible code; presumably used by the macro expansions (confirm)
 * @param energyBuffer  one slot per launched thread; this thread's accumulated energy is added to its slot
 * @param posq          per-atom position vector, read once per atom into `pos`
 */
extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
    const unsigned int threadSlot = blockIdx.x*blockDim.x+threadIdx.x;
    const unsigned int atomStride = blockDim.x*gridDim.x;
    real energy = 0;
    for (unsigned int index = threadSlot; index < NUM_ATOMS; index += atomStride) {
        // Bring in the derivatives produced for this atom.

        LOAD_DERIVATIVES

        // Evaluate the per-particle energy terms for this atom.

        real4 pos = posq[index];
        real3 force = make_real3(0, 0, 0);
        COMPUTE_ENERGY
    }
    energyBuffer[threadSlot] += energy;
}
platforms/cuda2/src/kernels/customGBGradientChainRule.cu
0 → 100644
View file @
f6346776
/**
 * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
 *
 * For each atom the fixed-point force stored in forceBuffers is converted to
 * floating point (divided by 0xFFFFFFFF), handed to the COMPUTE_FORCES macro
 * (defined outside this file) as the local `force`, and then written back in
 * fixed point, overwriting the previous contents.
 *
 * @param forceBuffers  fixed-point forces; x, y and z live in three planes of PADDED_NUM_ATOMS entries
 * @param posq          per-atom position vector, read once per atom into `pos`
 */
extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
    const real scale = RECIP((real) 0xFFFFFFFF);
    const unsigned int atomStride = blockDim.x*gridDim.x;
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += atomStride) {
        // Locations of this atom's y and z components in the planar buffer.
        const unsigned int indexY = index+PADDED_NUM_ATOMS;
        const unsigned int indexZ = index+PADDED_NUM_ATOMS*2;

        real4 pos = posq[index];
        real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[indexY], scale*forceBuffers[indexZ]);
        COMPUTE_FORCES

        // Store the updated force back in fixed point.
        forceBuffers[index] = static_cast<long long>(force.x*0xFFFFFFFF);
        forceBuffers[indexY] = static_cast<long long>(force.y*0xFFFFFFFF);
        forceBuffers[indexZ] = static_cast<long long>(force.z*0xFFFFFFFF);
    }
}
platforms/cuda2/src/kernels/customGBValueN2.cu
0 → 100644
View file @
f6346776
// Number of atoms along each side of an interaction tile, and the number of
// consecutive threads that cooperate on one tile.
// NOTE(review): presumably chosen to equal the hardware warp size - confirm.
#define TILE_SIZE 32
/**
 * Per-thread record stored in the value kernel's shared localData array.
 * It holds the position/charge vector of one atom of the tile, the value
 * accumulated for that atom, and a scratch slot used while summing across the
 * tile, followed by whatever fields ATOM_PARAMETER_DATA expands to (that macro
 * is defined outside this file).
 */
struct AtomData {
    real4 posq;
    real value;
    real temp;
    ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
    // Extra word inserted only when the build defines NEED_PADDING.
    float padding;
#endif
};
/**
 * Compute a value based on pair interactions.
 *
 * Work is divided into tiles of TILE_SIZE x TILE_SIZE atom pairs. Each group of
 * TILE_SIZE consecutive threads is assigned a contiguous range [pos, end) of
 * tiles and processes them one after another. For every tile, thread lane `tgx`
 * owns atom x*TILE_SIZE+tgx and loops over the atoms of block y, which are
 * staged in the shared localData array.
 *
 * The pair expression is supplied by the COMPUTE_VALUE macro (defined outside
 * this file), which is expected to set tempValue1 (contribution to the lane's
 * own atom) and tempValue2 (contribution to the other atom). Results are added
 * atomically to global_value in 64-bit fixed point (multiplied by 0xFFFFFFFF
 * and truncated).
 *
 * NOTE(review): localData, exclusionRange and exclusionIndex are written by some
 * lanes and read by others with no explicit barrier in this kernel. This
 * presumably relies on the TILE_SIZE threads of a group executing in lockstep -
 * confirm before porting to hardware where that is not guaranteed.
 *
 * @param posq                per-atom position vector; only x, y, z are read directly here
 * @param exclusions          exclusion bit masks, one 32-bit word per atom of a tile (a cleared bit marks an excluded pair)
 * @param exclusionIndices    for each stored exclusion tile, the y block index it belongs to
 * @param exclusionRowIndices for block x, entries [x] and [x+1] bound the range of exclusionIndices to search
 * @param global_value        fixed-point accumulator, one entry per atom
 * @param tiles               (USE_CUTOFF) x/y block indices of each tile to process
 * @param interactionCount    (USE_CUTOFF) element 0 holds the number of entries in tiles
 * @param periodicBoxSize     (USE_CUTOFF) box dimensions used for the USE_PERIODIC wrap
 * @param invPeriodicBoxSize  (USE_CUTOFF) reciprocal box dimensions
 * @param maxTiles            (USE_CUTOFF) if interactionCount[0] exceeds this, every tile is processed instead
 * @param interactionFlags    (USE_CUTOFF) per-tile flag word; bit j set means column j of the tile must be evaluated
 * @param numTiles            (no cutoff) total number of tiles to process
 */
extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
        unsigned long long* __restrict__ global_value,
#ifdef USE_CUTOFF
        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    // If the tile list overflowed, fall back to visiting every tile of the upper triangle.
    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
    // NOTE(review): `energy` is never referenced by the code visible in this kernel and
    // looks like a leftover from the energy kernel. It is kept because the macro
    // expansions are not visible here; remove it once confirmed unused.
    real energy = 0;
    unsigned int lasty = 0xFFFFFFFF; // block whose atoms are currently staged in localData (none yet)
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __shared__ int exclusionIndex[WARPS_PER_GROUP];
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);     // lane within the tile
        const unsigned int tbx = threadIdx.x - tgx;               // first localData slot of this group
        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
        unsigned int x, y;
        real value = 0;
        if (pos < end) {
#ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
                x = tileIndices.x;
                y = tileIndices.y;
            }
            else
#endif
            {
                // Invert the triangular numbering of tiles to recover (x, y) from pos.
                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                    y += (x < y ? -1 : 1);
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

            // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
            if (tgx < 2)
                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
            if (tgx == 0)
                exclusionIndex[localGroupIndex] = -1;
            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
                if (exclusionIndices[i] == y)
                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
            bool hasExclusions = false;
#endif
            // The original code tested "pos >= end" again here; that branch could never be
            // taken inside this block and has been removed.
            if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = threadIdx.x;
                localData[localAtomIndex].posq = posq1;
                LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    int atom2 = tbx+j; // localData slot; replaced by the global atom index below
                    real4 posq2 = localData[atom2].posq;
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real tempValue1 = 0;
                        real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }
            }
            else {
                // This is an off-diagonal tile.

                if (lasty != y) {
                    // Block y is not staged yet: load its atoms into shared memory.
                    unsigned int j = y*TILE_SIZE + tgx;
                    localData[threadIdx.x].posq = posq[j];
                    const unsigned int localAtomIndex = threadIdx.x;
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
                localData[threadIdx.x].value = 0;
#ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                if (!hasExclusions && flags != 0xFFFFFFFF) {
                    if (flags == 0) {
                        // No interactions in this tile.
                    }
                    else {
                        // Compute only a subset of the interactions in this tile.

                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
                            // Unsigned literal: shifting a signed 1 into bit 31 is not portable.
                            if ((flags&(1u<<j)) != 0) {
                                int atom2 = tbx+j;
                                real4 posq2 = localData[atom2].posq;
                                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                                real tempValue1 = 0;
                                real tempValue2 = 0;
                                if (r2 < CUTOFF_SQUARED) {
                                    real invR = RSQRT(r2);
                                    real r = RECIP(invR);
                                    LOAD_ATOM2_PARAMETERS
                                    atom2 = y*TILE_SIZE+j;
                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                                        COMPUTE_VALUE
                                    }
                                    value += tempValue1;
                                }
                                localData[threadIdx.x].temp = tempValue2;

                                // Sum the contributions of all lanes to atom2's value: first in
                                // groups of four, then lane 0 adds the eight partial sums.

                                if (tgx % 4 == 0)
                                    localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp;
                                if (tgx == 0)
                                    localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp;
                            }
                        }
                    }
                }
                else
#endif
                {
                    // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
                    // Rotate the mask right by tgx so that bit 0 matches this lane's starting
                    // column. The left-shift count is masked because shifting a 32-bit value by
                    // 32 (the tgx == 0 case) is undefined in C++; the result for tgx == 0 is
                    // simply the unrotated mask.
                    excl = (excl >> tgx) | (excl << ((TILE_SIZE - tgx) & (TILE_SIZE - 1)));
#endif
                    // Each lane starts at a different column and walks the tile cyclically.
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
#endif
                        int atom2 = tbx+tj;
                        real4 posq2 = localData[atom2].posq;
                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                            real invR = RSQRT(r2);
                            real r = RECIP(invR);
                            LOAD_ATOM2_PARAMETERS
                            atom2 = y*TILE_SIZE+tj;
                            real tempValue1 = 0;
                            real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                            if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
                            if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                                COMPUTE_VALUE
                            }
                            value += tempValue1;
                            localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF
                        }
#endif
#ifdef USE_EXCLUSIONS
                        excl >>= 1;
#endif
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
            // Remember which block is staged. This is done only when a tile was actually
            // processed, so y is never read while uninitialized.
            lasty = y;
        }

        // Write results.

        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0xFFFFFFFF)));
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0xFFFFFFFF)));
        }
        pos++;
    } while (pos < end);
}
platforms/cuda2/src/kernels/customGBValuePerParticle.cu
0 → 100644
View file @
f6346776
/**
 * Reduce a pairwise computed value, and compute per-particle values.
 *
 * For each atom the fixed-point entry of valueBuffers is converted to floating
 * point (divided by 0xFFFFFFFF) and exposed as the local `sum`, together with
 * the atom's position `pos`, to the COMPUTE_VALUES macro, which is defined
 * outside this file.
 *
 * @param posq          per-atom position vector, read once per atom
 * @param valueBuffers  fixed-point pairwise values, one entry per atom
 */
extern "C" __global__ void computePerParticleValues(real4* posq, long long* valueBuffers
        PARAMETER_ARGUMENTS) {
    const unsigned int atomStride = blockDim.x*gridDim.x;
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += atomStride) {
        // Convert the accumulated pairwise value back to floating point.

        real sum = valueBuffers[index]/(real) 0xFFFFFFFF;

        // Evaluate the remaining per-particle values for this atom.

        real4 pos = posq[index];
        COMPUTE_VALUES
    }
}
platforms/cuda2/tests/TestCudaCustomGBForce.cpp
0 → 100644
View file @
f6346776
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment