Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2e451b9d
Commit
2e451b9d
authored
Dec 13, 2012
by
Peter Eastman
Browse files
Deleted the old CUDA platform
parent
352e2fc7
Changes
147
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
4616 deletions
+0
-4616
platforms/cuda-old/src/kernels/kApplyConstraints.cu
platforms/cuda-old/src/kernels/kApplyConstraints.cu
+0
-65
platforms/cuda-old/src/kernels/kBrownianUpdate.cu
platforms/cuda-old/src/kernels/kBrownianUpdate.cu
+0
-152
platforms/cuda-old/src/kernels/kCCMA.cu
platforms/cuda-old/src/kernels/kCCMA.cu
+0
-213
platforms/cuda-old/src/kernels/kCalculateAndersenThermostat.cu
...orms/cuda-old/src/kernels/kCalculateAndersenThermostat.cu
+0
-102
platforms/cuda-old/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
.../cuda-old/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
+0
-161
platforms/cuda-old/src/kernels/kCalculateCDLJEwaldReciprocal.h
...orms/cuda-old/src/kernels/kCalculateCDLJEwaldReciprocal.h
+0
-87
platforms/cuda-old/src/kernels/kCalculateCDLJForces.cu
platforms/cuda-old/src/kernels/kCalculateCDLJForces.cu
+0
-209
platforms/cuda-old/src/kernels/kCalculateCDLJForces.h
platforms/cuda-old/src/kernels/kCalculateCDLJForces.h
+0
-560
platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.cu
...orms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+0
-219
platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.h
...forms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.h
+0
-590
platforms/cuda-old/src/kernels/kCalculateCMAPTorsionForces.cu
...forms/cuda-old/src/kernels/kCalculateCMAPTorsionForces.cu
+0
-250
platforms/cuda-old/src/kernels/kCalculateCustomAngleForces.cu
...forms/cuda-old/src/kernels/kCalculateCustomAngleForces.cu
+0
-164
platforms/cuda-old/src/kernels/kCalculateCustomBondForces.cu
platforms/cuda-old/src/kernels/kCalculateCustomBondForces.cu
+0
-143
platforms/cuda-old/src/kernels/kCalculateCustomExternalForces.cu
...ms/cuda-old/src/kernels/kCalculateCustomExternalForces.cu
+0
-131
platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.cu
...s/cuda-old/src/kernels/kCalculateCustomNonbondedForces.cu
+0
-200
platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.h
...ms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.h
+0
-398
platforms/cuda-old/src/kernels/kCalculateCustomTorsionForces.cu
...rms/cuda-old/src/kernels/kCalculateCustomTorsionForces.cu
+0
-203
platforms/cuda-old/src/kernels/kCalculateGBVIAux.h
platforms/cuda-old/src/kernels/kCalculateGBVIAux.h
+0
-182
platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.cu
platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.cu
+0
-341
platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.h
platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.h
+0
-246
No files found.
Too many changes to show.
To preserve performance only
147 of 147+
files are displayed.
Plain diff
Email patch
platforms/cuda-old/src/kernels/kApplyConstraints.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudaKernels.h"
__global__
void
kPrepareConstraints_kernel
(
int
numAtoms
,
float4
*
oldPos
,
float4
*
posq
,
float4
*
posqP
)
{
for
(
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float4
pos
=
posq
[
index
];
oldPos
[
index
]
=
pos
;
posqP
[
index
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
pos
.
w
);
}
}
__global__
void
kFinishConstraints_kernel
(
int
numAtoms
,
float4
*
posq
,
float4
*
posqP
)
{
for
(
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float4
pos
=
posq
[
index
];
float4
delta
=
posqP
[
index
];
posq
[
index
]
=
make_float4
(
pos
.
x
+
delta
.
x
,
pos
.
y
+
delta
.
y
,
pos
.
z
+
delta
.
z
,
pos
.
w
);
}
}
void
kApplyConstraints
(
gpuContext
gpu
)
{
kPrepareConstraints_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
(
gpu
->
natoms
,
gpu
->
sim
.
pOldPosq
,
gpu
->
sim
.
pPosq
,
gpu
->
sim
.
pPosqP
);
LAUNCHERROR
(
"kPrepareConstraints"
);
kApplyShake
(
gpu
);
kApplySettle
(
gpu
);
kApplyCCMA
(
gpu
);
kFinishConstraints_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
(
gpu
->
natoms
,
gpu
->
sim
.
pPosq
,
gpu
->
sim
.
pPosqP
);
LAUNCHERROR
(
"kFinishConstraints"
);
}
platforms/cuda-old/src/kernels/kBrownianUpdate.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
void
SetBrownianUpdateSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetBrownianUpdateSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
void
kBrownianUpdatePart1_kernel
()
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
rpos
=
cSim
.
pRandomPosition
[
blockIdx
.
x
];
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
{
float4
random4a
=
cSim
.
pRandom4
[
rpos
+
pos
];
float4
apos
=
cSim
.
pPosq
[
pos
];
float4
force
=
cSim
.
pForce4
[
pos
];
float
invMass
=
cSim
.
pVelm4
[
pos
].
w
;
float
forceScale
=
cSim
.
tauDeltaT
*
invMass
;
float
noiseScale
=
cSim
.
noiseAmplitude
*
sqrtf
(
invMass
);
cSim
.
pOldPosq
[
pos
]
=
apos
;
apos
.
x
=
force
.
x
*
forceScale
+
noiseScale
*
random4a
.
x
;
apos
.
y
=
force
.
y
*
forceScale
+
noiseScale
*
random4a
.
y
;
apos
.
z
=
force
.
z
*
forceScale
+
noiseScale
*
random4a
.
z
;
cSim
.
pPosqP
[
pos
]
=
apos
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
void
kBrownianUpdatePart1
(
gpuContext
gpu
)
{
// printf("kBrownianUpdatePart1\n");
kBrownianUpdatePart1_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kBrownianUpdatePart1"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
void
kBrownianUpdatePart2_kernel
()
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
rpos
=
cSim
.
pRandomPosition
[
blockIdx
.
x
];
__syncthreads
();
while
(
pos
<
cSim
.
atoms
)
{
float4
velocity
=
cSim
.
pVelm4
[
pos
];
float4
apos
=
cSim
.
pPosq
[
pos
];
float4
xPrime
=
cSim
.
pPosqP
[
pos
];
velocity
.
x
=
cSim
.
oneOverDeltaT
*
(
xPrime
.
x
);
velocity
.
y
=
cSim
.
oneOverDeltaT
*
(
xPrime
.
y
);
velocity
.
z
=
cSim
.
oneOverDeltaT
*
(
xPrime
.
z
);
xPrime
.
x
+=
apos
.
x
;
xPrime
.
y
+=
apos
.
y
;
xPrime
.
z
+=
apos
.
z
;
cSim
.
pPosq
[
pos
]
=
xPrime
;
cSim
.
pVelm4
[
pos
]
=
velocity
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
// Update random position pointer
if
(
threadIdx
.
x
==
0
)
{
rpos
+=
cSim
.
paddedNumberOfAtoms
;
if
(
rpos
>
cSim
.
randoms
)
rpos
-=
cSim
.
randoms
;
cSim
.
pRandomPosition
[
blockIdx
.
x
]
=
rpos
;
}
}
extern
void
kGenerateRandoms
(
gpuContext
gpu
);
void
kBrownianUpdatePart2
(
gpuContext
gpu
)
{
// printf("kBrownianUpdatePart2\n");
kBrownianUpdatePart2_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kBrownianUpdatePart2"
);
// Update randoms if necessary
gpu
->
iterations
++
;
if
(
gpu
->
iterations
==
gpu
->
sim
.
randomIterations
)
{
kGenerateRandoms
(
gpu
);
gpu
->
iterations
=
0
;
}
}
platforms/cuda-old/src/kernels/kCCMA.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <cuda.h>
#include <vector_functions.h>
#include <vector>
#include "gputypes.h"
using
namespace
std
;
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCCMASim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCCMASim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kComputeCCMAConstraintDirections
()
{
// Calculate the direction of each constraint.
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
atoms
=
cSim
.
pCcmaAtoms
[
index
];
float4
dir
=
cSim
.
pCcmaDistance
[
index
];
float4
oldPos1
=
cSim
.
pOldPosq
[
atoms
.
x
];
float4
oldPos2
=
cSim
.
pOldPosq
[
atoms
.
y
];
dir
.
x
=
oldPos1
.
x
-
oldPos2
.
x
;
dir
.
y
=
oldPos1
.
y
-
oldPos2
.
y
;
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
cSim
.
pCcmaDistance
[
index
]
=
dir
;
}
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kComputeCCMAConstraintForces
(
float4
*
atomPositions
,
bool
addOldPosition
)
{
__shared__
int
converged
;
float
lowerTol
=
1.0
f
-
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
float
upperTol
=
1.0
f
+
2.0
f
*
cSim
.
shakeTolerance
+
cSim
.
shakeTolerance
*
cSim
.
shakeTolerance
;
if
(
threadIdx
.
x
==
0
)
converged
=
1
;
__syncthreads
();
// Calculate the constraint force for each constraint.
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
atoms
=
cSim
.
pCcmaAtoms
[
index
];
float4
delta1
=
atomPositions
[
atoms
.
x
];
float4
delta2
=
atomPositions
[
atoms
.
y
];
float4
dir
=
cSim
.
pCcmaDistance
[
index
];
float3
rp_ij
=
make_float3
(
delta1
.
x
-
delta2
.
x
,
delta1
.
y
-
delta2
.
y
,
delta1
.
z
-
delta2
.
z
);
if
(
addOldPosition
)
{
rp_ij
.
x
+=
dir
.
x
;
rp_ij
.
y
+=
dir
.
y
;
rp_ij
.
z
+=
dir
.
z
;
}
float
rp2
=
rp_ij
.
x
*
rp_ij
.
x
+
rp_ij
.
y
*
rp_ij
.
y
+
rp_ij
.
z
*
rp_ij
.
z
;
float
dist2
=
dir
.
w
*
dir
.
w
;
float
diff
=
dist2
-
rp2
;
float
rrpr
=
rp_ij
.
x
*
dir
.
x
+
rp_ij
.
y
*
dir
.
y
+
rp_ij
.
z
*
dir
.
z
;
float
d_ij2
=
dir
.
x
*
dir
.
x
+
dir
.
y
*
dir
.
y
+
dir
.
z
*
dir
.
z
;
float
reducedMass
=
cSim
.
pCcmaReducedMass
[
index
];
cSim
.
pCcmaDelta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
*
diff
/
rrpr
:
0.0
f
);
// See whether it has converged.
if
(
converged
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
{
converged
=
0
;
*
cSim
.
ccmaConvergedDeviceMarker
=
0
;
}
}
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kMultiplyByCCMAConstraintMatrix
()
{
if
(
*
cSim
.
ccmaConvergedDeviceMarker
)
return
;
// The constraint iteration has already converged
// Multiply by the inverse constraint matrix.
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
ccmaConstraints
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float
sum
=
0.0
f
;
for
(
unsigned
int
i
=
0
;
;
i
++
)
{
unsigned
int
element
=
index
+
i
*
cSim
.
ccmaConstraints
;
unsigned
int
column
=
cSim
.
pConstraintMatrixColumn
[
element
];
if
(
column
>=
cSim
.
ccmaConstraints
)
break
;
sum
+=
cSim
.
pCcmaDelta1
[
column
]
*
cSim
.
pConstraintMatrixValue
[
element
];
}
cSim
.
pCcmaDelta2
[
index
]
=
sum
;
}
}
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
kUpdateCCMAAtomPositions
(
float4
*
atomPositions
,
int
iteration
)
{
if
(
*
cSim
.
ccmaConvergedDeviceMarker
)
return
;
// The constraint iteration has already converged.
float
damping
=
(
iteration
<
2
?
0.5
f
:
1.0
f
);
for
(
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
cSim
.
atoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float4
atomPos
=
atomPositions
[
index
];
float
invMass
=
cSim
.
pVelm4
[
index
].
w
;
int
num
=
cSim
.
pCcmaNumAtomConstraints
[
index
];
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
int
constraint
=
cSim
.
pCcmaAtomConstraints
[
index
+
i
*
cSim
.
atoms
];
bool
forward
=
(
constraint
>
0
);
constraint
=
(
forward
?
constraint
-
1
:
-
constraint
-
1
);
float
constraintForce
=
damping
*
invMass
*
cSim
.
pCcmaDelta2
[
constraint
];
constraintForce
=
(
forward
?
constraintForce
:
-
constraintForce
);
float4
dir
=
cSim
.
pCcmaDistance
[
constraint
];
atomPos
.
x
+=
constraintForce
*
dir
.
x
;
atomPos
.
y
+=
constraintForce
*
dir
.
y
;
atomPos
.
z
+=
constraintForce
*
dir
.
z
;
}
atomPositions
[
index
]
=
atomPos
;
}
}
void
kApplyCCMA
(
gpuContext
gpu
,
float4
*
posq
,
bool
addOldPosition
)
{
kComputeCCMAConstraintDirections
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeCCMAConstraintDirections"
);
const
int
checkInterval
=
3
;
for
(
int
i
=
0
;
i
<
150
;
i
++
)
{
if
((
i
+
1
)
%
checkInterval
==
0
)
*
gpu
->
ccmaConvergedHostMarker
=
1
;
kComputeCCMAConstraintForces
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
,
gpu
->
sim
.
ccma_threads_per_block
*
sizeof
(
int
)
>>>
(
posq
,
addOldPosition
);
cudaEventRecord
(
gpu
->
ccmaEvent
,
0
);
kMultiplyByCCMAConstraintMatrix
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
,
gpu
->
sim
.
ccma_threads_per_block
*
sizeof
(
int
)
>>>
();
kUpdateCCMAAtomPositions
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
ccma_threads_per_block
>>>
(
posq
,
3
*
i
+
2
);
cudaEventSynchronize
(
gpu
->
ccmaEvent
);
if
((
i
+
1
)
%
checkInterval
==
0
&&
*
gpu
->
ccmaConvergedHostMarker
)
break
;
}
}
void
kApplyCCMA
(
gpuContext
gpu
)
{
// printf("kApplyCCMA\n");
if
(
gpu
->
sim
.
ccmaConstraints
>
0
)
kApplyCCMA
(
gpu
,
gpu
->
sim
.
pPosqP
,
true
);
}
platforms/cuda-old/src/kernels/kCalculateAndersenThermostat.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateAndersenThermostatSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateAndersenThermostatSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
void
kCalculateAndersenThermostat_kernel
(
int
*
atomGroups
)
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
rpos
=
cSim
.
pRandomPosition
[
blockIdx
.
x
];
__syncthreads
();
float
collisionProbability
=
1.0
f
-
exp
(
-
cSim
.
collisionFrequency
*
cSim
.
pStepSize
[
0
].
y
);
float
randomRange
=
erf
(
collisionProbability
/
sqrtf
(
2.0
f
));
while
(
pos
<
cSim
.
atoms
)
{
float4
velocity
=
cSim
.
pVelm4
[
pos
];
float4
selectRand
=
cSim
.
pRandom4
[
rpos
+
atomGroups
[
pos
]];
float4
velRand
=
cSim
.
pRandom4
[
rpos
+
pos
];
float
scale
=
(
selectRand
.
w
>
-
randomRange
&&
selectRand
.
w
<
randomRange
?
0.0
f
:
1.0
f
);
float
add
=
(
1.0
f
-
scale
)
*
sqrtf
(
cSim
.
kT
*
velocity
.
w
);
velocity
.
x
=
scale
*
velocity
.
x
+
add
*
velRand
.
x
;
velocity
.
y
=
scale
*
velocity
.
y
+
add
*
velRand
.
y
;
velocity
.
z
=
scale
*
velocity
.
z
+
add
*
velRand
.
z
;
cSim
.
pVelm4
[
pos
]
=
velocity
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
// Update random position pointer
if
(
threadIdx
.
x
==
0
)
{
rpos
+=
cSim
.
paddedNumberOfAtoms
;
if
(
rpos
>
cSim
.
randoms
)
rpos
-=
cSim
.
randoms
;
cSim
.
pRandomPosition
[
blockIdx
.
x
]
=
rpos
;
}
}
extern
void
kGenerateRandoms
(
gpuContext
gpu
);
void
kCalculateAndersenThermostat
(
gpuContext
gpu
,
CUDAStream
<
int
>&
atomGroups
)
{
// printf("kCalculateAndersenThermostat\n");
kCalculateAndersenThermostat_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
(
atomGroups
.
_pDevData
);
LAUNCHERROR
(
"kCalculateAndersenThermostat"
);
// Update randoms if necessary
gpu
->
iterations
++
;
if
(
gpu
->
iterations
==
gpu
->
sim
.
randomIterations
)
{
kGenerateRandoms
(
gpu
);
gpu
->
iterations
=
0
;
}
}
platforms/cuda-old/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Rossen P. Apostolov, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evaluating nonbonded forces using the
* Ewald summation method (Reciprocal space summation).
*/
/* Define multiply operations for floats */
__device__
float2
MultofFloat2
(
float2
a
,
float2
b
)
{
float2
c
;
c
.
x
=
a
.
x
*
b
.
x
-
a
.
y
*
b
.
y
;
c
.
y
=
a
.
x
*
b
.
y
+
a
.
y
*
b
.
x
;
return
c
;
}
__device__
float2
ConjMultofFloat2
(
float2
a
,
float2
b
)
{
float2
c
;
c
.
x
=
a
.
x
*
b
.
x
+
a
.
y
*
b
.
y
;
c
.
y
=
a
.
y
*
b
.
x
-
a
.
x
*
b
.
y
;
return
c
;
}
/**
* Precompute the cosine and sine sums which appear in each force term.
*/
__global__
void
kCalculateEwaldFastCosSinSums_kernel
()
{
const
float
epsilon
=
1
.
0
;
const
float
recipCoeff
=
cSim
.
epsfac
*
(
4
*
LOCAL_HACK_PI
/
cSim
.
cellVolume
/
epsilon
);
const
unsigned
int
ksizex
=
2
*
cSim
.
kmaxX
-
1
;
const
unsigned
int
ksizey
=
2
*
cSim
.
kmaxY
-
1
;
const
unsigned
int
ksizez
=
2
*
cSim
.
kmaxZ
-
1
;
const
unsigned
int
totalK
=
ksizex
*
ksizey
*
ksizez
;
unsigned
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
float
energy
=
0
.
0
f
;
while
(
index
<
(
cSim
.
kmaxY
-
1
)
*
ksizez
+
cSim
.
kmaxZ
)
index
+=
blockDim
.
x
*
gridDim
.
x
;
while
(
index
<
totalK
)
{
// Find the wave vector (kx, ky, kz) this index corresponds to.
int
rx
=
index
/
(
ksizey
*
ksizez
);
int
remainder
=
index
-
rx
*
ksizey
*
ksizez
;
int
ry
=
remainder
/
ksizez
;
int
rz
=
remainder
-
ry
*
ksizez
-
cSim
.
kmaxZ
+
1
;
ry
+=
-
cSim
.
kmaxY
+
1
;
float
kx
=
rx
*
cSim
.
recipBoxSizeX
;
float
ky
=
ry
*
cSim
.
recipBoxSizeY
;
float
kz
=
rz
*
cSim
.
recipBoxSizeZ
;
// Compute the sum for this wave vector.
float2
sum
=
make_float2
(
0
.
0
f
,
0
.
0
f
);
for
(
int
atom
=
0
;
atom
<
cSim
.
atoms
;
atom
++
)
{
float4
apos
=
cSim
.
pPosq
[
atom
];
float
phase
=
apos
.
x
*
kx
;
float2
structureFactor
=
make_float2
(
cosf
(
phase
),
sinf
(
phase
));
phase
=
apos
.
y
*
ky
;
structureFactor
=
MultofFloat2
(
structureFactor
,
make_float2
(
cosf
(
phase
),
sinf
(
phase
)));
phase
=
apos
.
z
*
kz
;
structureFactor
=
MultofFloat2
(
structureFactor
,
make_float2
(
cosf
(
phase
),
sinf
(
phase
)));
sum
.
x
+=
apos
.
w
*
structureFactor
.
x
;
sum
.
y
+=
apos
.
w
*
structureFactor
.
y
;
}
cSim
.
pEwaldCosSinSum
[
index
]
=
sum
;
// Compute the contribution to the energy.
float
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
float
ak
=
exp
(
k2
*
cSim
.
factorEwald
)
/
k2
;
energy
+=
recipCoeff
*
ak
*
(
sum
.
x
*
sum
.
x
+
sum
.
y
*
sum
.
y
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
/**
* Compute the reciprocal space part of the Ewald force, using the precomputed sums from the
* previous routine.
*/
__global__
void
kCalculateEwaldFastForces_kernel
()
{
const
float
epsilon
=
1
.
0
;
float
recipCoeff
=
cSim
.
epsfac
*
(
4
*
LOCAL_HACK_PI
/
cSim
.
cellVolume
/
epsilon
);
unsigned
int
atom
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
while
(
atom
<
cSim
.
atoms
)
{
float4
force
=
cSim
.
pForce4
[
atom
];
float4
apos
=
cSim
.
pPosq
[
atom
];
// Loop over all wave vectors.
int
lowry
=
0
;
int
lowrz
=
1
;
for
(
int
rx
=
0
;
rx
<
cSim
.
kmaxX
;
rx
++
)
{
float
kx
=
rx
*
cSim
.
recipBoxSizeX
;
for
(
int
ry
=
lowry
;
ry
<
cSim
.
kmaxY
;
ry
++
)
{
float
ky
=
ry
*
cSim
.
recipBoxSizeY
;
float
phase
=
apos
.
x
*
kx
;
float2
tab_xy
=
make_float2
(
cosf
(
phase
),
sinf
(
phase
));
phase
=
apos
.
y
*
ky
;
tab_xy
=
MultofFloat2
(
tab_xy
,
make_float2
(
cosf
(
phase
),
sinf
(
phase
)));
for
(
int
rz
=
lowrz
;
rz
<
cSim
.
kmaxZ
;
rz
++
)
{
float
kz
=
rz
*
cSim
.
recipBoxSizeZ
;
// Compute the force contribution of this wave vector.
int
index
=
rx
*
(
cSim
.
kmaxY
*
2
-
1
)
*
(
cSim
.
kmaxZ
*
2
-
1
)
+
(
ry
+
cSim
.
kmaxY
-
1
)
*
(
cSim
.
kmaxZ
*
2
-
1
)
+
(
rz
+
cSim
.
kmaxZ
-
1
);
float
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
float
ak
=
exp
(
k2
*
cSim
.
factorEwald
)
/
k2
;
phase
=
apos
.
z
*
kz
;
float2
structureFactor
=
MultofFloat2
(
tab_xy
,
make_float2
(
cosf
(
phase
),
sinf
(
phase
)));
float2
cosSinSum
=
cSim
.
pEwaldCosSinSum
[
index
];
float
dEdR
=
ak
*
apos
.
w
*
(
cosSinSum
.
x
*
structureFactor
.
y
-
cosSinSum
.
y
*
structureFactor
.
x
);
force
.
x
+=
2
*
recipCoeff
*
dEdR
*
kx
;
force
.
y
+=
2
*
recipCoeff
*
dEdR
*
ky
;
force
.
z
+=
2
*
recipCoeff
*
dEdR
*
kz
;
lowrz
=
1
-
cSim
.
kmaxZ
;
}
lowry
=
1
-
cSim
.
kmaxY
;
}
}
// Record the force on the atom.
cSim
.
pForce4
[
atom
]
=
force
;
atom
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
platforms/cuda-old/src/kernels/kCalculateCDLJEwaldReciprocal.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Rossen P. Apostolov, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evaluating nonbonded forces using the
* Ewald summation method (Reciprocal space summation).
*/
__global__
void
kCalculateCDLJEwaldReciprocalForces_kernel
()
{
const
float
eps0
=
1
.
0
f
/
(
4
.
0
f
*
3
.
1415926535
f
*
cSim
.
epsfac
);
unsigned
int
atomID1
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
while
(
atomID1
<
cSim
.
atoms
)
{
float4
apos1
=
cSim
.
pPosq
[
atomID1
];
float4
af
=
cSim
.
pForce4
[
atomID1
];
unsigned
int
atomID2
=
0
;
while
(
atomID2
<
cSim
.
atoms
)
{
float4
apos2
=
cSim
.
pPosq
[
atomID2
];
float
scale
=
2
.
0
f
*
apos1
.
w
*
apos2
.
w
/
(
cSim
.
cellVolume
*
eps0
);
int
lowry
=
0
;
int
lowrz
=
1
;
for
(
int
rx
=
0
;
rx
<
cSim
.
kmaxX
;
rx
++
)
{
float
kx
=
rx
*
cSim
.
recipBoxSizeX
;
for
(
int
ry
=
lowry
;
ry
<
cSim
.
kmaxY
;
ry
++
)
{
float
ky
=
ry
*
cSim
.
recipBoxSizeY
;
for
(
int
rz
=
lowrz
;
rz
<
cSim
.
kmaxZ
;
rz
++
)
{
float
kz
=
rz
*
cSim
.
recipBoxSizeZ
;
float
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
float
ek
=
exp
(
k2
*
cSim
.
factorEwald
);
float
arg1
=
kx
*
apos1
.
x
+
ky
*
apos1
.
y
+
kz
*
apos1
.
z
;
float
arg2
=
kx
*
apos2
.
x
+
ky
*
apos2
.
y
+
kz
*
apos2
.
z
;
float
sinI
=
sinf
(
arg1
);
float
sinJ
=
sinf
(
arg2
);
float
cosI
=
cosf
(
arg1
);
float
cosJ
=
cosf
(
arg2
);
float
f
=
scale
*
ek
*
(
-
sinI
*
cosJ
+
cosI
*
sinJ
)
/
k2
;
af
.
x
-=
kx
*
f
;
af
.
y
-=
ky
*
f
;
af
.
z
-=
kz
*
f
;
lowrz
=
1
-
cSim
.
kmaxZ
;
}
lowry
=
1
-
cSim
.
kmaxY
;
}
}
atomID2
++
;
}
cSim
.
pForce4
[
atomID1
]
=
af
;
atomID1
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
platforms/cuda-old/src/kernels/kCalculateCDLJForces.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct
Atom
{
float
x
;
float
y
;
float
z
;
float
q
;
float
sig
;
float
eps
;
float
fx
;
float
fy
;
float
fz
;
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateCDLJForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCDLJForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
texture
<
float
,
1
,
cudaReadModeElementType
>
tabulatedErfcRef
;
static
__device__
float
fastErfc
(
float
r
)
{
float
normalized
=
cSim
.
tabulatedErfcScale
*
r
;
int
index
=
(
int
)
normalized
;
float
fract2
=
normalized
-
index
;
float
fract1
=
1.0
f
-
fract2
;
return
fract1
*
tex1Dfetch
(
tabulatedErfcRef
,
index
)
+
fract2
*
tex1Dfetch
(
tabulatedErfcRef
,
index
+
1
);
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJForces.h"
// Include versions of the kernels for Ewald
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define USE_EWALD
#define METHOD_NAME(a, b) a##Ewald##b
#include "kCalculateCDLJForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##EwaldByWarp##b
#include "kCalculateCDLJForces.h"
// Reciprocal Space Ewald summation is in a separate kernel
#include "kCalculateCDLJEwaldFastReciprocal.h"
void
kCalculatePME
(
gpuContext
gpu
);
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
// printf("kCalculateCDLJCutoffForces\n");
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
else
kCalculateCDLJN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
LAUNCHERROR
(
"kCalculateCDLJN2Forces"
);
break
;
case
CUTOFF
:
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJCutoffByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateCDLJCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kCalculateCDLJCutoffForces"
);
break
;
case
PERIODIC
:
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJPeriodicByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateCDLJPeriodicForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kCalculateCDLJPeriodicForces"
);
break
;
case
EWALD
:
case
PARTICLE_MESH_EWALD
:
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kFindInteractionsWithinBlocksPeriodic"
);
cudaChannelFormatDesc
channelDesc
=
cudaCreateChannelDesc
<
float
>
();
cudaBindTexture
(
NULL
,
&
tabulatedErfcRef
,
gpu
->
psTabulatedErfc
->
_pDevData
,
&
channelDesc
,
gpu
->
psTabulatedErfc
->
_length
*
sizeof
(
float
));
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJEwaldByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateCDLJEwaldForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kCalculateCDLJEwaldForces"
);
if
(
gpu
->
sim
.
nonbondedMethod
==
EWALD
)
{
kCalculateEwaldFastCosSinSums_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
LAUNCHERROR
(
"kCalculateEwaldFastCosSinSums"
);
kCalculateEwaldFastForces_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kCalculateEwaldFastForces"
);
}
else
kCalculatePME
(
gpu
);
}
}
platforms/cuda-old/src/kernels/kCalculateCDLJForces.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for evalauating nonbonded forces. It is included
* several times in kCalculateCDLJForces.cu with different #defines to generate
* different versions of the kernels.
*/
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795f
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_NONBOND_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateCDLJ
,
Forces_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
volatile
Atom
sA
[];
unsigned
int
totalWarps
=
gridDim
.
x
*
blockDim
.
x
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
float
CDLJ_energy
;
float
energy
=
0
.
0
f
;
#ifdef USE_CUTOFF
volatile
float3
*
tempBuffer
=
(
volatile
float3
*
)
&
sA
[
cSim
.
nonbond_threads_per_block
];
#endif
#ifdef USE_EWALD
const
float
TWO_OVER_SQRT_PI
=
2
.
0
f
/
sqrtf
(
LOCAL_HACK_PI
);
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float4
apos
;
// Local atom x, y, z, q
float3
af
;
// Local atom fx, fy, fz
float
dx
;
float
dy
;
float
dz
;
float
r2
;
float
invR
;
float
sig
;
float
sig2
;
float
sig6
;
float
eps
;
float
dEdR
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
tj
=
tgx
;
volatile
Atom
*
psA
=
&
sA
[
tbx
];
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
af
.
x
=
0
.
0
f
;
af
.
y
=
0
.
0
f
;
af
.
z
=
0
.
0
f
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
apos
.
w
*=
cSim
.
epsfac
;
if
(
!
bExclusionFlag
)
{
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1
.
0
f
/
sqrtf
(
r2
);
sig
=
a
.
x
+
psA
[
j
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
j
].
eps
;
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
/* E */
CDLJ_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float
r
=
sqrtf
(
r2
);
float
alphaR
=
cSim
.
alphaEwald
*
r
;
float
erfcAlphaR
=
fastErfc
(
alphaR
);
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
*
(
erfcAlphaR
+
alphaR
*
exp
(
-
alphaR
*
alphaR
)
*
TWO_OVER_SQRT_PI
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
invR
*
erfcAlphaR
;
#else
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#endif
#else
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
#endif
dEdR
*=
invR
*
invR
;
#ifdef USE_CUTOFF
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0
.
0
f
;
/* E */
CDLJ_energy
=
0
.
0
f
;
}
#endif
/* E */
energy
+=
0
.
5
f
*
CDLJ_energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
}
else
// bExclusion
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
unsigned
int
excl
=
cSim
.
pExclusion
[
cSim
.
pExclusionIndex
[
cell
]
+
tgx
];
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1
.
0
f
/
sqrtf
(
r2
);
sig
=
a
.
x
+
psA
[
j
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
j
].
eps
;
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
/* E */
CDLJ_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float
r
=
sqrtf
(
r2
);
float
alphaR
=
cSim
.
alphaEwald
*
r
;
float
erfcAlphaR
=
fastErfc
(
alphaR
);
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
*
(
erfcAlphaR
+
alphaR
*
exp
(
-
alphaR
*
alphaR
)
*
TWO_OVER_SQRT_PI
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
invR
*
erfcAlphaR
;
bool
needCorrection
=
!
(
excl
&
0x1
)
&&
x
+
tgx
!=
y
+
j
&&
x
+
tgx
<
cSim
.
atoms
&&
y
+
j
<
cSim
.
atoms
;
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
dEdR
=
-
apos
.
w
*
psA
[
j
].
q
*
invR
*
((
1
.
0
f
-
erfcAlphaR
)
-
alphaR
*
exp
(
-
alphaR
*
alphaR
)
*
TWO_OVER_SQRT_PI
);
CDLJ_energy
=
-
apos
.
w
*
psA
[
j
].
q
*
invR
*
(
1
.
0
f
-
erfcAlphaR
);
}
#else
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#endif
#else
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
#endif
dEdR
*=
invR
*
invR
;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
if
(
!
needCorrection
&&
(
!
(
excl
&
0x1
)
||
r2
>
cSim
.
nonbondedCutoffSqr
))
#else
if
(
!
(
excl
&
0x1
)
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#endif
#else
if
(
!
(
excl
&
0x1
))
#endif
{
dEdR
=
0
.
0
f
;
/* E */
CDLJ_energy
=
0
.
0
f
;
}
/* E */
energy
+=
0
.
5
f
*
CDLJ_energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
float4
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
cSim
.
pForce4
[
offset
]
=
of
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
}
sA
[
threadIdx
.
x
].
fx
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fy
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fz
=
0
.
0
f
;
apos
.
w
*=
cSim
.
epsfac
;
if
(
!
bExclusionFlag
)
{
#ifdef USE_CUTOFF
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
if
(
flags
==
0xFFFFFFFF
)
#endif
{
// Compute all interactions within this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1
.
0
f
/
sqrtf
(
r2
);
sig
=
a
.
x
+
psA
[
tj
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
/* E */
CDLJ_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float
r
=
sqrtf
(
r2
);
float
alphaR
=
cSim
.
alphaEwald
*
r
;
float
erfcAlphaR
=
fastErfc
(
alphaR
);
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
*
(
erfcAlphaR
+
alphaR
*
exp
(
-
alphaR
*
alphaR
)
*
TWO_OVER_SQRT_PI
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
*
erfcAlphaR
;
#else
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#endif
#else
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
#endif
dEdR
*=
invR
*
invR
;
#ifdef USE_CUTOFF
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0
.
0
f
;
/* E */
CDLJ_energy
=
0
.
0
f
;
}
#endif
/* E */
energy
+=
CDLJ_energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1
.
0
f
/
sqrtf
(
r2
);
sig
=
a
.
x
+
psA
[
j
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
j
].
eps
;
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
/* E */
CDLJ_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float
r
=
sqrtf
(
r2
);
float
alphaR
=
cSim
.
alphaEwald
*
r
;
float
erfcAlphaR
=
fastErfc
(
alphaR
);
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
*
(
erfcAlphaR
+
alphaR
*
exp
(
-
alphaR
*
alphaR
)
*
TWO_OVER_SQRT_PI
);
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
invR
*
erfcAlphaR
;
#else
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#endif
#else
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
#endif
dEdR
*=
invR
*
invR
;
#ifdef USE_CUTOFF
if
(
r2
>
cSim
.
nonbondedCutoffSqr
)
{
dEdR
=
0
.
0
f
;
/* E */
CDLJ_energy
=
0
.
0
f
;
}
#endif
/* E */
energy
+=
CDLJ_energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
tempBuffer
[
threadIdx
.
x
].
x
=
dx
;
tempBuffer
[
threadIdx
.
x
].
y
=
dy
;
tempBuffer
[
threadIdx
.
x
].
z
=
dz
;
// Sum the forces on atom j.
if
(
tgx
%
2
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
1
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
1
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
1
].
z
;
}
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
2
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
2
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
2
].
z
;
}
if
(
tgx
%
8
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
4
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
4
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
4
].
z
;
}
if
(
tgx
%
16
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
8
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
8
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
8
].
z
;
}
if
(
tgx
==
0
)
{
psA
[
j
].
fx
+=
tempBuffer
[
threadIdx
.
x
].
x
+
tempBuffer
[
threadIdx
.
x
+
16
].
x
;
psA
[
j
].
fy
+=
tempBuffer
[
threadIdx
.
x
].
y
+
tempBuffer
[
threadIdx
.
x
+
16
].
y
;
psA
[
j
].
fz
+=
tempBuffer
[
threadIdx
.
x
].
z
+
tempBuffer
[
threadIdx
.
x
+
16
].
z
;
}
}
}
}
#endif
}
else
// bExclusion
{
// Read fixed atom data into registers and GRF
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
unsigned
int
excl
=
cSim
.
pExclusion
[
cSim
.
pExclusionIndex
[
cell
]
+
tgx
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
invR
=
1
.
0
f
/
sqrtf
(
r2
);
sig
=
a
.
x
+
psA
[
tj
].
sig
;
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig6
=
sig2
*
sig2
*
sig2
;
eps
=
a
.
y
*
psA
[
tj
].
eps
;
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
/* E */
CDLJ_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float
r
=
sqrtf
(
r2
);
float
alphaR
=
cSim
.
alphaEwald
*
r
;
float
erfcAlphaR
=
fastErfc
(
alphaR
);
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
*
(
erfcAlphaR
+
alphaR
*
exp
(
-
alphaR
*
alphaR
)
*
TWO_OVER_SQRT_PI
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
*
erfcAlphaR
;
bool
needCorrection
=
!
(
excl
&
0x1
)
&&
x
+
tgx
!=
y
+
tj
&&
x
+
tgx
<
cSim
.
atoms
&&
y
+
tj
<
cSim
.
atoms
;
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
dEdR
=
-
apos
.
w
*
psA
[
tj
].
q
*
invR
*
((
1
.
0
f
-
erfcAlphaR
)
-
alphaR
*
exp
(
-
alphaR
*
alphaR
)
*
TWO_OVER_SQRT_PI
);
CDLJ_energy
=
-
apos
.
w
*
psA
[
tj
].
q
*
invR
*
(
1
.
0
f
-
erfcAlphaR
);
}
#else
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#endif
#else
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
/* E */
CDLJ_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
#endif
dEdR
*=
invR
*
invR
;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
if
(
!
needCorrection
&&
(
!
(
excl
&
0x1
)
||
r2
>
cSim
.
nonbondedCutoffSqr
))
#else
if
(
!
(
excl
&
0x1
)
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#endif
#else
if
(
!
(
excl
&
0x1
))
#endif
{
dEdR
=
0
.
0
f
;
/* E */
CDLJ_energy
=
0
.
0
f
;
}
/* E */
energy
+=
CDLJ_energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
// Write results
float4
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
cSim
.
pForce4
[
offset
]
=
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset
=
y
+
tgx
+
warp
*
cSim
.
stride
;
#else
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
+=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
+=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4
[
offset
]
=
of
;
lasty
=
y
;
}
pos
++
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#include "cudaKernels.h"
struct
Atom
{
float
x
;
float
y
;
float
z
;
float
q
;
float
sig
;
float
eps
;
float
br
;
float
fx
;
float
fy
;
float
fz
;
float
fb
;
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCDLJObcGbsaForces1Sim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
// Include versions of the kernel for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernel with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
// Include versions of the kernels for Ewald
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define USE_EWALD
#define METHOD_NAME(a, b) a##Ewald##b
#include "kCalculateCDLJObcGbsaForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##EwaldByWarp##b
#include "kCalculateCDLJObcGbsaForces1.h"
extern
__global__
void
kFindBlockBoundsCutoff_kernel
();
extern
__global__
void
kFindBlockBoundsPeriodic_kernel
();
extern
__global__
void
kFindBlocksWithInteractionsCutoff_kernel
();
extern
__global__
void
kFindBlocksWithInteractionsPeriodic_kernel
();
extern
__global__
void
kFindInteractionsWithinBlocksCutoff_kernel
(
unsigned
int
*
);
extern
__global__
void
kFindInteractionsWithinBlocksPeriodic_kernel
(
unsigned
int
*
);
extern
__global__
void
kCalculateEwaldFastCosSinSums_kernel
();
extern
__global__
void
kCalculateEwaldFastForces_kernel
();
extern
void
kCalculatePME
(
gpuContext
gpu
);
void
kCalculateCDLJObcGbsaForces1
(
gpuContext
gpu
)
{
// printf("kCalculateCDLJObcGbsaForces1\n");
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bRecalculateBornRadii
)
{
if
(
gpu
->
bIncludeGBVI
){
kCalculateGBVIBornSum
(
gpu
);
kReduceGBVIBornSum
(
gpu
);
}
else
{
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
}
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJObcGbsaN2ByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
else
kCalculateCDLJObcGbsaN2Forces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
LAUNCHERROR
(
"kCalculateCDLJObcGbsaN2Forces1"
);
break
;
case
CUTOFF
:
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bRecalculateBornRadii
)
{
if
(
gpu
->
bIncludeGBVI
){
kCalculateGBVIBornSum
(
gpu
);
kReduceGBVIBornSum
(
gpu
);
}
else
{
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
}
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateCDLJObcGbsaCutoffForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kCalculateCDLJObcGbsaCutoffForces1"
);
break
;
case
PERIODIC
:
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bRecalculateBornRadii
)
{
if
(
gpu
->
bIncludeGBVI
){
kCalculateGBVIBornSum
(
gpu
);
kReduceGBVIBornSum
(
gpu
);
}
else
{
kCalculateObcGbsaBornSum
(
gpu
);
kReduceObcGbsaBornSum
(
gpu
);
}
}
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateCDLJObcGbsaPeriodicForces1_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kCalculateCDLJObcGbsaPeriodicForces1"
);
break
;
}
}
platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
* It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_NONBOND_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateCDLJObcGbsa
,
Forces1_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
volatile
Atom
sA
[];
unsigned
int
totalWarps
=
cSim
.
nonbond_blocks
*
cSim
.
nonbond_threads_per_block
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
float
CDLJObcGbsa_energy
;
float
energy
=
0
.
0
f
;
#ifdef USE_CUTOFF
volatile
float
*
tempBuffer
=
(
volatile
float
*
)
&
sA
[
cSim
.
nonbond_threads_per_block
];
#endif
unsigned
int
lasty
=
-
0xFFFFFFFF
;
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
float
br
=
cSim
.
pBornRadii
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
tj
=
tgx
;
volatile
Atom
*
psA
=
&
sA
[
tbx
];
float4
af
;
af
.
x
=
0
.
0
f
;
af
.
y
=
0
.
0
f
;
af
.
z
=
0
.
0
f
;
af
.
w
=
0
.
0
f
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
float
q2
=
cSim
.
preFactor
*
apos
.
w
;
apos
.
w
*=
cSim
.
epsfac
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
if
(
!
bExclusionFlag
)
{
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1
.
0
f
/
sqrtf
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
CDLJObcGbsa_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
CDLJObcGbsa_energy
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#else
float
factorX
=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
+=
factorX
;
CDLJObcGbsa_energy
+=
factorX
;
#endif
dEdR
*=
invR
*
invR
;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4
.
0
f
*
alpha2_ij
);
float
expTerm
=
expf
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrtf
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
j
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0
.
5
f
*
Gpol
*
expTerm
*
(
1
.
0
f
+
D_ij
);
dEdR
+=
Gpol
*
(
1
.
0
f
-
0
.
25
f
*
expTerm
);
CDLJObcGbsa_energy
+=
(
q2
*
psA
[
j
].
q
)
/
denominator
;
#ifdef USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
(
x
+
j
)
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
(
x
+
j
)
>=
cSim
.
atoms
)
#endif
{
dEdR
=
0
.
0
f
;
dGpol_dalpha2_ij
=
0
.
0
f
;
CDLJObcGbsa_energy
=
0
.
0
f
;
}
energy
+=
0
.
5
f
*
CDLJObcGbsa_energy
;
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
}
}
else
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
unsigned
int
excl
=
cSim
.
pExclusion
[
cSim
.
pExclusionIndex
[
cell
]
+
tgx
];
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1
.
0
f
/
sqrtf
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
CDLJObcGbsa_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
CDLJObcGbsa_energy
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#else
float
factorX
=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
+=
factorX
;
CDLJObcGbsa_energy
+=
factorX
;
#endif
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0
.
0
f
;
CDLJObcGbsa_energy
=
0
.
0
f
;
}
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4
.
0
f
*
alpha2_ij
);
float
expTerm
=
expf
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrtf
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
j
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0
.
5
f
*
Gpol
*
expTerm
*
(
1
.
0
f
+
D_ij
);
dEdR
+=
Gpol
*
(
1
.
0
f
-
0
.
25
f
*
expTerm
);
CDLJObcGbsa_energy
+=
(
q2
*
psA
[
j
].
q
)
/
denominator
;
#if defined USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
x
+
j
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
x
+
j
>=
cSim
.
atoms
)
#endif
{
dEdR
=
0
.
0
f
;
dGpol_dalpha2_ij
=
0
.
0
f
;
CDLJObcGbsa_energy
=
0
.
0
f
;
}
energy
+=
0
.
5
f
*
CDLJObcGbsa_energy
;
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
excl
>>=
1
;
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
float4
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
of
.
w
+=
af
.
w
;
cSim
.
pForce4
[
offset
]
=
of
;
cSim
.
pBornForce
[
offset
]
=
of
.
w
;
}
else
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
}
sA
[
threadIdx
.
x
].
fx
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fy
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fz
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fb
=
0
.
0
f
;
float
q2
=
apos
.
w
*
cSim
.
preFactor
;
apos
.
w
*=
cSim
.
epsfac
;
if
(
!
bExclusionFlag
)
{
#ifdef USE_CUTOFF
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
if
(
flags
==
0xFFFFFFFF
)
#endif
{
// Compute all interactions within this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1
.
0
f
/
sqrtf
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
CDLJObcGbsa_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
CDLJObcGbsa_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#else
float
factorX
=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
+=
factorX
;
CDLJObcGbsa_energy
+=
factorX
;
#endif
dEdR
*=
invR
*
invR
;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4
.
0
f
*
alpha2_ij
);
float
expTerm
=
expf
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrtf
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
tj
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0
.
5
f
*
Gpol
*
expTerm
*
(
1
.
0
f
+
D_ij
);
dEdR
+=
Gpol
*
(
1
.
0
f
-
0
.
25
f
*
expTerm
);
CDLJObcGbsa_energy
+=
(
q2
*
psA
[
tj
].
q
)
/
denominator
;
#ifdef USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
(
y
+
tj
)
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
(
y
+
tj
)
>=
cSim
.
atoms
)
#endif
{
dEdR
=
0
.
0
f
;
dGpol_dalpha2_ij
=
0
.
0
f
;
CDLJObcGbsa_energy
=
0
.
0
f
;
}
energy
+=
CDLJObcGbsa_energy
;
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1
.
0
f
/
sqrtf
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
CDLJObcGbsa_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
CDLJObcGbsa_energy
+=
apos
.
w
*
psA
[
j
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#else
float
factorX
=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
+=
factorX
;
CDLJObcGbsa_energy
+=
factorX
;
#endif
dEdR
*=
invR
*
invR
;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4
.
0
f
*
alpha2_ij
);
float
expTerm
=
expf
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrtf
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
j
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0
.
5
f
*
Gpol
*
expTerm
*
(
1
.
0
f
+
D_ij
);
dEdR
+=
Gpol
*
(
1
.
0
f
-
0
.
25
f
*
expTerm
);
CDLJObcGbsa_energy
+=
(
q2
*
psA
[
j
].
q
)
/
denominator
;
#ifdef USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
(
y
+
j
)
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
(
y
+
j
)
>=
cSim
.
atoms
)
#endif
{
dEdR
=
0
.
0
f
;
dGpol_dalpha2_ij
=
0
.
0
f
;
CDLJObcGbsa_energy
=
0
.
0
f
;
}
energy
+=
CDLJObcGbsa_energy
;
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
tempBuffer
[
threadIdx
.
x
]
=
dx
;
if
(
tgx
%
2
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
];
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
2
];
if
(
tgx
%
8
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
4
];
if
(
tgx
%
16
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
8
];
if
(
tgx
==
0
)
psA
[
j
].
fx
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
16
];
tempBuffer
[
threadIdx
.
x
]
=
dy
;
if
(
tgx
%
2
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
];
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
2
];
if
(
tgx
%
8
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
4
];
if
(
tgx
%
16
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
8
];
if
(
tgx
==
0
)
psA
[
j
].
fy
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
16
];
tempBuffer
[
threadIdx
.
x
]
=
dz
;
if
(
tgx
%
2
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
];
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
2
];
if
(
tgx
%
8
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
4
];
if
(
tgx
%
16
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
8
];
if
(
tgx
==
0
)
psA
[
j
].
fz
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
16
];
// Sum the Born forces.
tempBuffer
[
threadIdx
.
x
]
=
dGpol_dalpha2_ij
*
br
;
if
(
tgx
%
2
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
];
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
2
];
if
(
tgx
%
8
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
4
];
if
(
tgx
%
16
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
8
];
if
(
tgx
==
0
)
psA
[
j
].
fb
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
16
];
}
}
}
#endif
}
else
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
unsigned
int
excl
=
cSim
.
pExclusion
[
cSim
.
pExclusionIndex
[
cell
]
+
tgx
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1
.
0
f
/
sqrtf
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12
.
0
f
*
sig6
-
6
.
0
f
)
*
sig6
;
CDLJObcGbsa_energy
=
eps
*
(
sig6
-
1
.
0
f
)
*
sig6
;
#ifdef USE_CUTOFF
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
-
2
.
0
f
*
cSim
.
reactionFieldK
*
r2
);
CDLJObcGbsa_energy
+=
apos
.
w
*
psA
[
tj
].
q
*
(
invR
+
cSim
.
reactionFieldK
*
r2
-
cSim
.
reactionFieldC
);
#else
float
factorX
=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
+=
factorX
;
CDLJObcGbsa_energy
+=
factorX
;
#endif
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0
.
0
f
;
CDLJObcGbsa_energy
=
0
.
0
f
;
}
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4
.
0
f
*
alpha2_ij
);
float
expTerm
=
expf
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrtf
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
tj
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0
.
5
f
*
Gpol
*
expTerm
*
(
1
.
0
f
+
D_ij
);
dEdR
+=
Gpol
*
(
1
.
0
f
-
0
.
25
f
*
expTerm
);
CDLJObcGbsa_energy
+=
(
q2
*
psA
[
tj
].
q
)
/
denominator
;
#ifdef USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
)
#endif
{
dEdR
=
0
.
0
f
;
dGpol_dalpha2_ij
=
0
.
0
f
;
CDLJObcGbsa_energy
=
0
.
0
f
;
}
energy
+=
CDLJObcGbsa_energy
;
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
excl
>>=
1
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
float4
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
of
.
w
+=
af
.
w
;
cSim
.
pForce4
[
offset
]
=
of
;
cSim
.
pBornForce
[
offset
]
=
of
.
w
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset
=
y
+
tgx
+
warp
*
cSim
.
stride
;
#else
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
+=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
+=
sA
[
threadIdx
.
x
].
fz
;
of
.
w
+=
sA
[
threadIdx
.
x
].
fb
;
cSim
.
pForce4
[
offset
]
=
of
;
cSim
.
pBornForce
[
offset
]
=
of
.
w
;
lasty
=
y
;
}
pos
++
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda-old/src/kernels/kCalculateCMAPTorsionForces.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
dp = DOT3(v1, v2); \
float norm1 = DOT3(v1, v1); \
float norm2 = DOT3(v2, v2); \
dp /= sqrtf(norm1 * norm2); \
dp = min(dp, 1.0f); \
dp = max(dp, -1.0f); \
}
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
float dp; \
GETNORMEDDOTPRODUCT(v1, v2, dp); \
if (dp > 0.99f || dp < -0.99f) { \
float3 cross = CROSS_PRODUCT(v1, v2); \
float scale = DOT3(v1, v1)*DOT3(v2, v2); \
angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
if (dp < 0.0f) \
angle = LOCAL_HACK_PI-angle; \
} \
else { \
angle = acosf(dp); \
} \
}
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
cp0 = CROSS_PRODUCT(vector1, vector2); \
cp1 = CROSS_PRODUCT(vector2, vector3); \
GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#endif
void
kCalculateCMAPTorsionForces_kernel
(
int
numAtoms
,
int
numTorsions
,
float4
*
forceBuffers
,
float
*
energyBuffer
,
float4
*
posq
,
float4
*
coeff
,
int2
*
mapPositions
,
int4
*
indices
,
int
*
maps
)
{
const
float
PI
=
3.14159265358979323846
f
;
float
energy
=
0.0
f
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numTorsions
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int4
atoms1
=
indices
[
4
*
index
];
int4
atoms2
=
indices
[
4
*
index
+
1
];
int4
atoms3
=
indices
[
4
*
index
+
2
];
int4
atoms4
=
indices
[
4
*
index
+
3
];
float4
a1
=
posq
[
atoms1
.
x
];
float4
a2
=
posq
[
atoms1
.
y
];
float4
a3
=
posq
[
atoms1
.
z
];
float4
a4
=
posq
[
atoms1
.
w
];
float4
b1
=
posq
[
atoms2
.
x
];
float4
b2
=
posq
[
atoms2
.
y
];
float4
b3
=
posq
[
atoms2
.
z
];
float4
b4
=
posq
[
atoms2
.
w
];
// Compute the first angle.
float3
v0a
=
make_float3
(
a1
.
x
-
a2
.
x
,
a1
.
y
-
a2
.
y
,
a1
.
z
-
a2
.
z
);
float3
v1a
=
make_float3
(
a3
.
x
-
a2
.
x
,
a3
.
y
-
a2
.
y
,
a3
.
z
-
a2
.
z
);
float3
v2a
=
make_float3
(
a3
.
x
-
a4
.
x
,
a3
.
y
-
a4
.
y
,
a3
.
z
-
a4
.
z
);
float3
cp0a
,
cp1a
;
float
angleA
;
GETDIHEDRALANGLEBETWEENTHREEVECTORS
(
v0a
,
v1a
,
v2a
,
v0a
,
cp0a
,
cp1a
,
angleA
);
angleA
=
fmod
(
angleA
+
2.0
f
*
PI
,
2.0
f
*
PI
);
// Compute the second angle.
float3
v0b
=
make_float3
(
b1
.
x
-
b2
.
x
,
b1
.
y
-
b2
.
y
,
b1
.
z
-
b2
.
z
);
float3
v1b
=
make_float3
(
b3
.
x
-
b2
.
x
,
b3
.
y
-
b2
.
y
,
b3
.
z
-
b2
.
z
);
float3
v2b
=
make_float3
(
b3
.
x
-
b4
.
x
,
b3
.
y
-
b4
.
y
,
b3
.
z
-
b4
.
z
);
float3
cp0b
,
cp1b
;
float
angleB
;
GETDIHEDRALANGLEBETWEENTHREEVECTORS
(
v0b
,
v1b
,
v2b
,
v0b
,
cp0b
,
cp1b
,
angleB
);
angleB
=
fmod
(
angleB
+
2.0
f
*
PI
,
2.0
f
*
PI
);
// Identify which patch this is in.
int2
pos
=
mapPositions
[
maps
[
index
]];
int
size
=
pos
.
y
;
float
delta
=
2.0
f
*
PI
/
size
;
int
s
=
(
int
)
(
angleA
/
delta
);
int
t
=
(
int
)
(
angleB
/
delta
);
float4
c
[
4
];
int
coeffIndex
=
pos
.
x
+
4
*
(
s
+
size
*
t
);
c
[
0
]
=
coeff
[
coeffIndex
];
c
[
1
]
=
coeff
[
coeffIndex
+
1
];
c
[
2
]
=
coeff
[
coeffIndex
+
2
];
c
[
3
]
=
coeff
[
coeffIndex
+
3
];
float
da
=
angleA
/
delta
-
s
;
float
db
=
angleB
/
delta
-
t
;
// Evaluate the spline to determine the energy and gradients.
float
torsionEnergy
=
0.0
f
;
float
dEdA
=
0.0
f
;
float
dEdB
=
0.0
f
;
torsionEnergy
=
da
*
torsionEnergy
+
((
c
[
3
].
w
*
db
+
c
[
3
].
z
)
*
db
+
c
[
3
].
y
)
*
db
+
c
[
3
].
x
;
dEdA
=
db
*
dEdA
+
(
3.0
f
*
c
[
3
].
w
*
da
+
2.0
f
*
c
[
2
].
w
)
*
da
+
c
[
1
].
w
;
dEdB
=
da
*
dEdB
+
(
3.0
f
*
c
[
3
].
w
*
db
+
2.0
f
*
c
[
3
].
z
)
*
db
+
c
[
3
].
y
;
torsionEnergy
=
da
*
torsionEnergy
+
((
c
[
2
].
w
*
db
+
c
[
2
].
z
)
*
db
+
c
[
2
].
y
)
*
db
+
c
[
2
].
x
;
dEdA
=
db
*
dEdA
+
(
3.0
f
*
c
[
3
].
z
*
da
+
2.0
f
*
c
[
2
].
z
)
*
da
+
c
[
1
].
z
;
dEdB
=
da
*
dEdB
+
(
3.0
f
*
c
[
2
].
w
*
db
+
2.0
f
*
c
[
2
].
z
)
*
db
+
c
[
2
].
y
;
torsionEnergy
=
da
*
torsionEnergy
+
((
c
[
1
].
w
*
db
+
c
[
1
].
z
)
*
db
+
c
[
1
].
y
)
*
db
+
c
[
1
].
x
;
dEdA
=
db
*
dEdA
+
(
3.0
f
*
c
[
3
].
y
*
da
+
2.0
f
*
c
[
2
].
y
)
*
da
+
c
[
1
].
y
;
dEdB
=
da
*
dEdB
+
(
3.0
f
*
c
[
1
].
w
*
db
+
2.0
f
*
c
[
1
].
z
)
*
db
+
c
[
1
].
y
;
torsionEnergy
=
da
*
torsionEnergy
+
((
c
[
0
].
w
*
db
+
c
[
0
].
z
)
*
db
+
c
[
0
].
y
)
*
db
+
c
[
0
].
x
;
dEdA
=
db
*
dEdA
+
(
3.0
f
*
c
[
3
].
x
*
da
+
2.0
f
*
c
[
2
].
x
)
*
da
+
c
[
1
].
x
;
dEdB
=
da
*
dEdB
+
(
3.0
f
*
c
[
0
].
w
*
db
+
2.0
f
*
c
[
0
].
z
)
*
db
+
c
[
0
].
y
;
dEdA
/=
delta
;
dEdB
/=
delta
;
energy
+=
torsionEnergy
;
// Apply the force to the first torsion.
float
normCross1
=
DOT3
(
cp0a
,
cp0a
);
float
normSqrBC
=
DOT3
(
v1a
,
v1a
);
float
normBC
=
sqrtf
(
normSqrBC
);
float
normCross2
=
DOT3
(
cp1a
,
cp1a
);
float
dp
=
1.0
f
/
normSqrBC
;
float4
ff
=
make_float4
((
-
dEdA
*
normBC
)
/
normCross1
,
DOT3
(
v0a
,
v1a
)
*
dp
,
DOT3
(
v2a
,
v1a
)
*
dp
,
(
dEdA
*
normBC
)
/
normCross2
);
float3
internalF0
=
make_float3
(
ff
.
x
*
cp0a
.
x
,
ff
.
x
*
cp0a
.
y
,
ff
.
x
*
cp0a
.
z
);
float3
internalF3
=
make_float3
(
ff
.
w
*
cp1a
.
x
,
ff
.
w
*
cp1a
.
y
,
ff
.
w
*
cp1a
.
z
);
float3
d
=
make_float3
(
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
,
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
,
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
);
unsigned
int
offsetA
=
atoms1
.
x
+
atoms3
.
x
*
numAtoms
;
unsigned
int
offsetB
=
atoms1
.
y
+
atoms3
.
y
*
numAtoms
;
unsigned
int
offsetC
=
atoms1
.
z
+
atoms3
.
z
*
numAtoms
;
unsigned
int
offsetD
=
atoms1
.
w
+
atoms3
.
w
*
numAtoms
;
float4
forceA
=
forceBuffers
[
offsetA
];
float4
forceB
=
forceBuffers
[
offsetB
];
float4
forceC
=
forceBuffers
[
offsetC
];
float4
forceD
=
forceBuffers
[
offsetD
];
forceA
.
x
+=
internalF0
.
x
;
forceA
.
y
+=
internalF0
.
y
;
forceA
.
z
+=
internalF0
.
z
;
forceB
.
x
+=
d
.
x
-
internalF0
.
x
;
forceB
.
y
+=
d
.
y
-
internalF0
.
y
;
forceB
.
z
+=
d
.
z
-
internalF0
.
z
;
forceC
.
x
+=
-
d
.
x
-
internalF3
.
x
;
forceC
.
y
+=
-
d
.
y
-
internalF3
.
y
;
forceC
.
z
+=
-
d
.
z
-
internalF3
.
z
;
forceD
.
x
+=
internalF3
.
x
;
forceD
.
y
+=
internalF3
.
y
;
forceD
.
z
+=
internalF3
.
z
;
forceBuffers
[
offsetA
]
=
forceA
;
forceBuffers
[
offsetB
]
=
forceB
;
forceBuffers
[
offsetC
]
=
forceC
;
forceBuffers
[
offsetD
]
=
forceD
;
// Apply the force to the second torsion.
normCross1
=
DOT3
(
cp0b
,
cp0b
);
normSqrBC
=
DOT3
(
v1b
,
v1b
);
normBC
=
sqrtf
(
normSqrBC
);
normCross2
=
DOT3
(
cp1b
,
cp1b
);
dp
=
1.0
f
/
normSqrBC
;
ff
=
make_float4
((
-
dEdB
*
normBC
)
/
normCross1
,
DOT3
(
v0b
,
v1b
)
*
dp
,
DOT3
(
v2b
,
v1b
)
*
dp
,
(
dEdB
*
normBC
)
/
normCross2
);
internalF0
=
make_float3
(
ff
.
x
*
cp0b
.
x
,
ff
.
x
*
cp0b
.
y
,
ff
.
x
*
cp0b
.
z
);
internalF3
=
make_float3
(
ff
.
w
*
cp1b
.
x
,
ff
.
w
*
cp1b
.
y
,
ff
.
w
*
cp1b
.
z
);
d
=
make_float3
(
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
,
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
,
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
);
offsetA
=
atoms2
.
x
+
atoms4
.
x
*
numAtoms
;
offsetB
=
atoms2
.
y
+
atoms4
.
y
*
numAtoms
;
offsetC
=
atoms2
.
z
+
atoms4
.
z
*
numAtoms
;
offsetD
=
atoms2
.
w
+
atoms4
.
w
*
numAtoms
;
forceA
=
forceBuffers
[
offsetA
];
forceB
=
forceBuffers
[
offsetB
];
forceC
=
forceBuffers
[
offsetC
];
forceD
=
forceBuffers
[
offsetD
];
forceA
.
x
+=
internalF0
.
x
;
forceA
.
y
+=
internalF0
.
y
;
forceA
.
z
+=
internalF0
.
z
;
forceB
.
x
+=
d
.
x
-
internalF0
.
x
;
forceB
.
y
+=
d
.
y
-
internalF0
.
y
;
forceB
.
z
+=
d
.
z
-
internalF0
.
z
;
forceC
.
x
+=
-
d
.
x
-
internalF3
.
x
;
forceC
.
y
+=
-
d
.
y
-
internalF3
.
y
;
forceC
.
z
+=
-
d
.
z
-
internalF3
.
z
;
forceD
.
x
+=
internalF3
.
x
;
forceD
.
y
+=
internalF3
.
y
;
forceD
.
z
+=
internalF3
.
z
;
forceBuffers
[
offsetA
]
=
forceA
;
forceBuffers
[
offsetB
]
=
forceB
;
forceBuffers
[
offsetC
]
=
forceC
;
forceBuffers
[
offsetD
]
=
forceD
;
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
void
kCalculateCMAPTorsionForces
(
gpuContext
gpu
,
CUDAStream
<
float4
>&
coefficients
,
CUDAStream
<
int2
>&
mapPositions
,
CUDAStream
<
int4
>&
torsionIndices
,
CUDAStream
<
int
>&
torsionMaps
)
{
kCalculateCMAPTorsionForces_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
localForces_threads_per_block
>>>
(
gpu
->
sim
.
stride
,
torsionMaps
.
_length
,
gpu
->
sim
.
pForce4
,
gpu
->
sim
.
pEnergy
,
gpu
->
sim
.
pPosq
,
coefficients
.
_pDevData
,
mapPositions
.
_pDevData
,
torsionIndices
.
_pDevData
,
torsionMaps
.
_pDevData
);
LAUNCHERROR
(
"kCalculateCMAPTorsionForces"
);
}
platforms/cuda-old/src/kernels/kCalculateCustomAngleForces.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
Expression
<
256
>
forceExp
;
static
__constant__
Expression
<
256
>
energyExp
;
#include "kEvaluateExpression.h"
void
SetCalculateCustomAngleForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCustomAngleForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
void
SetCustomAngleForceExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
forceExp
,
&
expression
,
sizeof
(
forceExp
));
RTERROR
(
status
,
"SetCustomAngleForceExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomAngleEnergyExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
energyExp
,
&
expression
,
sizeof
(
energyExp
));
RTERROR
(
status
,
"SetCustomAngleEnergyExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomAngleGlobalParams
(
const
vector
<
float
>&
paramValues
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
globalParams
,
&
paramValues
[
0
],
paramValues
.
size
()
*
sizeof
(
float
));
RTERROR
(
status
,
"SetCustomAngleGlobalParams: cudaMemcpyToSymbol failed"
);
}
#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kCalculateCustomAngleForces_kernel
()
{
extern
__shared__
float
stack
[];
float
*
variables
=
(
float
*
)
&
stack
[
cSim
.
customExpressionStackSize
*
blockDim
.
x
];
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
float
totalEnergy
=
0.0
f
;
while
(
pos
<
cSim
.
customAngles
)
{
int4
atom
=
cSim
.
pCustomAngleID1
[
pos
];
int2
atom2
=
cSim
.
pCustomAngleID2
[
pos
];
float4
params
=
cSim
.
pCustomAngleParams
[
pos
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float4
a3
=
cSim
.
pPosq
[
atom
.
z
];
float3
v0
=
make_float3
(
a2
.
x
-
a1
.
x
,
a2
.
y
-
a1
.
y
,
a2
.
z
-
a1
.
z
);
float3
v1
=
make_float3
(
a2
.
x
-
a3
.
x
,
a2
.
y
-
a3
.
y
,
a2
.
z
-
a3
.
z
);
float3
cp
=
CROSS_PRODUCT
(
v0
,
v1
);
float
rp
=
DOT3
(
cp
,
cp
);
rp
=
max
(
sqrtf
(
rp
),
1.0e-06
f
);
float
r21
=
DOT3
(
v0
,
v0
);
float
r23
=
DOT3
(
v1
,
v1
);
float
dot
=
DOT3
(
v0
,
v1
);
float
cosine
=
max
(
-
1.0
f
,
min
(
1.0
f
,
dot
/
sqrtf
(
r21
*
r23
)));
VARIABLE
(
0
)
=
acosf
(
cosine
);
VARIABLE
(
1
)
=
params
.
x
;
VARIABLE
(
2
)
=
params
.
y
;
VARIABLE
(
3
)
=
params
.
z
;
VARIABLE
(
4
)
=
params
.
w
;
float
dEdR
=
kEvaluateExpression_kernel
(
&
forceExp
,
stack
,
variables
);
totalEnergy
+=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);
float
termA
=
dEdR
/
(
r21
*
rp
);
float
termC
=
-
dEdR
/
(
r23
*
rp
);
float3
c21
=
CROSS_PRODUCT
(
v0
,
cp
);
float3
c23
=
CROSS_PRODUCT
(
v1
,
cp
);
c21
.
x
*=
termA
;
c21
.
y
*=
termA
;
c21
.
z
*=
termA
;
c23
.
x
*=
termC
;
c23
.
y
*=
termC
;
c23
.
z
*=
termC
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
w
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom2
.
x
*
cSim
.
stride
;
unsigned
int
offsetC
=
atom
.
z
+
atom2
.
y
*
cSim
.
stride
;
float4
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
cSim
.
pForce4
[
offsetB
];
float4
forceC
=
cSim
.
pForce4
[
offsetC
];
forceA
.
x
+=
c21
.
x
;
forceA
.
y
+=
c21
.
y
;
forceA
.
z
+=
c21
.
z
;
forceB
.
x
-=
c21
.
x
+
c23
.
x
;
forceB
.
y
-=
c21
.
y
+
c23
.
y
;
forceB
.
z
-=
c21
.
z
+
c23
.
z
;
forceC
.
x
+=
c23
.
x
;
forceC
.
y
+=
c23
.
y
;
forceC
.
z
+=
c23
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
cSim
.
pForce4
[
offsetC
]
=
forceC
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
totalEnergy
;
}
void
kCalculateCustomAngleForces
(
gpuContext
gpu
)
{
// printf("kCalculateCustomAngleForces\n");
int
memoryPerThread
=
(
gpu
->
sim
.
customExpressionStackSize
+
9
)
*
sizeof
(
float
);
int
maxThreads
=
(
gpu
->
sharedMemoryPerBlock
-
16
)
/
memoryPerThread
;
int
threads
=
min
(
gpu
->
sim
.
localForces_threads_per_block
,
(
maxThreads
/
64
)
*
64
);
kCalculateCustomAngleForces_kernel
<<<
gpu
->
sim
.
blocks
,
threads
,
memoryPerThread
*
threads
>>>
();
LAUNCHERROR
(
"kCalculateCustomAngleForces"
);
}
platforms/cuda-old/src/kernels/kCalculateCustomBondForces.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
Expression
<
256
>
forceExp
;
static
__constant__
Expression
<
256
>
energyExp
;
#include "kEvaluateExpression.h"
void
SetCalculateCustomBondForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCustomBondForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
void
SetCustomBondForceExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
forceExp
,
&
expression
,
sizeof
(
forceExp
));
RTERROR
(
status
,
"SetCustomBondForceExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomBondEnergyExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
energyExp
,
&
expression
,
sizeof
(
energyExp
));
RTERROR
(
status
,
"SetCustomBondEnergyExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomBondGlobalParams
(
const
vector
<
float
>&
paramValues
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
globalParams
,
&
paramValues
[
0
],
paramValues
.
size
()
*
sizeof
(
float
));
RTERROR
(
status
,
"SetCustomBondGlobalParams: cudaMemcpyToSymbol failed"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kCalculateCustomBondForces_kernel
()
{
extern
__shared__
float
stack
[];
float
*
variables
=
(
float
*
)
&
stack
[
cSim
.
customExpressionStackSize
*
blockDim
.
x
];
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
float
totalEnergy
=
0.0
f
;
while
(
pos
<
cSim
.
customBonds
)
{
int4
atom
=
cSim
.
pCustomBondID
[
pos
];
float4
params
=
cSim
.
pCustomBondParams
[
pos
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float
dx
=
a1
.
x
-
a2
.
x
;
float
dy
=
a1
.
y
-
a2
.
y
;
float
dz
=
a1
.
z
-
a2
.
z
;
float
r
=
sqrtf
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
float
invR
=
1.0
f
/
r
;
VARIABLE
(
0
)
=
r
;
VARIABLE
(
1
)
=
params
.
x
;
VARIABLE
(
2
)
=
params
.
y
;
VARIABLE
(
3
)
=
params
.
z
;
VARIABLE
(
4
)
=
params
.
w
;
float
dEdR
=
-
kEvaluateExpression_kernel
(
&
forceExp
,
stack
,
variables
)
*
invR
;
float
energy
=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);
totalEnergy
+=
energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
cSim
.
pForce4
[
offsetB
];
forceA
.
x
+=
dx
;
forceA
.
y
+=
dy
;
forceA
.
z
+=
dz
;
forceB
.
x
-=
dx
;
forceB
.
y
-=
dy
;
forceB
.
z
-=
dz
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
totalEnergy
;
}
void
kCalculateCustomBondForces
(
gpuContext
gpu
)
{
// printf("kCalculateCustomBondForces\n");
int
memoryPerThread
=
(
gpu
->
sim
.
customExpressionStackSize
+
9
)
*
sizeof
(
float
);
int
maxThreads
=
(
gpu
->
sharedMemoryPerBlock
-
16
)
/
memoryPerThread
;
int
threads
=
min
(
gpu
->
sim
.
localForces_threads_per_block
,
(
maxThreads
/
64
)
*
64
);
kCalculateCustomBondForces_kernel
<<<
gpu
->
sim
.
blocks
,
threads
,
memoryPerThread
*
threads
>>>
();
LAUNCHERROR
(
"kCalculateCustomBondForces"
);
}
platforms/cuda-old/src/kernels/kCalculateCustomExternalForces.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
Expression
<
256
>
forceExpX
;
static
__constant__
Expression
<
256
>
forceExpY
;
static
__constant__
Expression
<
256
>
forceExpZ
;
static
__constant__
Expression
<
256
>
energyExp
;
#include "kEvaluateExpression.h"
void
SetCalculateCustomExternalForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCustomExternalForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
void
SetCustomExternalForceExpressions
(
const
Expression
<
256
>&
expressionX
,
const
Expression
<
256
>&
expressionY
,
const
Expression
<
256
>&
expressionZ
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
forceExpX
,
&
expressionX
,
sizeof
(
forceExpX
));
status
=
cudaMemcpyToSymbol
(
forceExpY
,
&
expressionY
,
sizeof
(
forceExpY
));
status
=
cudaMemcpyToSymbol
(
forceExpZ
,
&
expressionZ
,
sizeof
(
forceExpZ
));
RTERROR
(
status
,
"SetCustomExternalForceExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomExternalEnergyExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
energyExp
,
&
expression
,
sizeof
(
energyExp
));
RTERROR
(
status
,
"SetCustomExternalEnergyExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomExternalGlobalParams
(
const
vector
<
float
>&
paramValues
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
globalParams
,
&
paramValues
[
0
],
paramValues
.
size
()
*
sizeof
(
float
));
RTERROR
(
status
,
"SetCustomExternalGlobalParams: cudaMemcpyToSymbol failed"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kCalculateCustomExternalForces_kernel
()
{
extern
__shared__
float
stack
[];
float
*
variables
=
(
float
*
)
&
stack
[
cSim
.
customExpressionStackSize
*
blockDim
.
x
];
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
float
totalEnergy
=
0.0
f
;
while
(
index
<
cSim
.
customExternals
)
{
int
atom
=
cSim
.
pCustomExternalID
[
index
];
float4
params
=
cSim
.
pCustomExternalParams
[
index
];
float4
pos
=
cSim
.
pPosq
[
atom
];
VARIABLE
(
0
)
=
pos
.
x
;
VARIABLE
(
1
)
=
pos
.
y
;
VARIABLE
(
2
)
=
pos
.
z
;
VARIABLE
(
3
)
=
params
.
x
;
VARIABLE
(
4
)
=
params
.
y
;
VARIABLE
(
5
)
=
params
.
z
;
VARIABLE
(
6
)
=
params
.
w
;
totalEnergy
+=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);;
float4
force
=
cSim
.
pForce4
[
atom
];
force
.
x
-=
kEvaluateExpression_kernel
(
&
forceExpX
,
stack
,
variables
);
force
.
y
-=
kEvaluateExpression_kernel
(
&
forceExpY
,
stack
,
variables
);
force
.
z
-=
kEvaluateExpression_kernel
(
&
forceExpZ
,
stack
,
variables
);
cSim
.
pForce4
[
atom
]
=
force
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
totalEnergy
;
}
void
kCalculateCustomExternalForces
(
gpuContext
gpu
)
{
// printf("kCalculateCustomExternalForces\n");
int
memoryPerThread
=
(
gpu
->
sim
.
customExpressionStackSize
+
9
)
*
sizeof
(
float
);
int
maxThreads
=
(
gpu
->
sharedMemoryPerBlock
-
16
)
/
memoryPerThread
;
int
threads
=
min
(
gpu
->
sim
.
localForces_threads_per_block
,
(
maxThreads
/
64
)
*
64
);
kCalculateCustomExternalForces_kernel
<<<
gpu
->
sim
.
blocks
,
threads
,
memoryPerThread
*
threads
>>>
();
LAUNCHERROR
(
"kCalculateCustomExternalForces"
);
}
platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct
Atom
{
float
x
;
float
y
;
float
z
;
float4
params
;
float
fx
;
float
fy
;
float
fz
;
};
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
Expression
<
256
>
forceExp
;
static
__constant__
Expression
<
256
>
energyExp
;
#include "kEvaluateExpression.h"
void
SetCalculateCustomNonbondedForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCustomNonbondedForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
void
SetCustomNonbondedForceExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
forceExp
,
&
expression
,
sizeof
(
forceExp
));
RTERROR
(
status
,
"SetCustomNonbondedForceExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomNonbondedEnergyExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
energyExp
,
&
expression
,
sizeof
(
energyExp
));
RTERROR
(
status
,
"SetCustomNonbondedEnergyExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomNonbondedGlobalParams
(
const
vector
<
float
>&
paramValues
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
globalParams
,
&
paramValues
[
0
],
paramValues
.
size
()
*
sizeof
(
float
));
RTERROR
(
status
,
"SetCustomNonbondedGlobalParams: cudaMemcpyToSymbol failed"
);
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateCustomNonbondedForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCustomNonbondedForces.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCustomNonbondedForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCustomNonbondedForces.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCustomNonbondedForces.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCustomNonbondedForces.h"
__global__
void
kFindBlockBoundsCutoff_kernel
();
__global__
void
kFindBlocksWithInteractionsCutoff_kernel
();
__global__
void
kFindInteractionsWithinBlocksCutoff_kernel
(
unsigned
int
*
workUnit
);
__global__
void
kFindBlockBoundsPeriodic_kernel
();
__global__
void
kFindBlocksWithInteractionsPeriodic_kernel
();
__global__
void
kFindInteractionsWithinBlocksPeriodic_kernel
(
unsigned
int
*
workUnit
);
void
kCalculateCustomNonbondedForces
(
gpuContext
gpu
,
bool
neighborListValid
)
{
// printf("kCalculateCustomNonbondedCutoffForces\n");
if
(
gpu
->
tabulatedFunctionsChanged
)
{
cudaChannelFormatDesc
channelDesc
=
cudaCreateChannelDesc
<
float4
>
();
if
(
gpu
->
tabulatedFunctions
[
0
].
coefficients
!=
NULL
)
cudaBindTexture
(
NULL
,
&
texRef0
,
gpu
->
tabulatedFunctions
[
0
].
coefficients
->
_pDevData
,
&
channelDesc
,
gpu
->
tabulatedFunctions
[
0
].
coefficients
->
_length
*
sizeof
(
float4
));
if
(
gpu
->
tabulatedFunctions
[
1
].
coefficients
!=
NULL
)
cudaBindTexture
(
NULL
,
&
texRef1
,
gpu
->
tabulatedFunctions
[
1
].
coefficients
->
_pDevData
,
&
channelDesc
,
gpu
->
tabulatedFunctions
[
1
].
coefficients
->
_length
*
sizeof
(
float4
));
if
(
gpu
->
tabulatedFunctions
[
2
].
coefficients
!=
NULL
)
cudaBindTexture
(
NULL
,
&
texRef2
,
gpu
->
tabulatedFunctions
[
2
].
coefficients
->
_pDevData
,
&
channelDesc
,
gpu
->
tabulatedFunctions
[
2
].
coefficients
->
_length
*
sizeof
(
float4
));
if
(
gpu
->
tabulatedFunctions
[
3
].
coefficients
!=
NULL
)
cudaBindTexture
(
NULL
,
&
texRef3
,
gpu
->
tabulatedFunctions
[
3
].
coefficients
->
_pDevData
,
&
channelDesc
,
gpu
->
tabulatedFunctions
[
3
].
coefficients
->
_length
*
sizeof
(
float4
));
gpu
->
tabulatedFunctionsChanged
=
false
;
}
int
sharedPerThread
=
sizeof
(
Atom
)
+
gpu
->
sim
.
customExpressionStackSize
*
sizeof
(
float
)
+
9
*
sizeof
(
float
);
if
(
gpu
->
sim
.
customNonbondedMethod
!=
NO_CUTOFF
)
sharedPerThread
+=
sizeof
(
float3
);
int
threads
=
gpu
->
sim
.
nonbond_threads_per_block
;
int
maxThreads
=
(
gpu
->
sharedMemoryPerBlock
-
16
)
/
sharedPerThread
;
if
(
threads
>
maxThreads
)
threads
=
(
maxThreads
/
32
)
*
32
;
switch
(
gpu
->
sim
.
customNonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCustomNonbondedN2ByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threads
,
sharedPerThread
*
threads
>>>
(
gpu
->
sim
.
pWorkUnit
);
else
kCalculateCustomNonbondedN2Forces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threads
,
sharedPerThread
*
threads
>>>
(
gpu
->
sim
.
pWorkUnit
);
LAUNCHERROR
(
"kCalculateCustomNonbondedN2Forces"
);
break
;
case
CUTOFF
:
if
(
!
neighborListValid
)
{
kFindBlockBoundsCutoff_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
}
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCustomNonbondedCutoffByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threads
,
sharedPerThread
*
threads
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateCustomNonbondedCutoffForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threads
,
sharedPerThread
*
threads
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kCalculateCustomNonbondedCutoffForces"
);
break
;
case
PERIODIC
:
if
(
!
neighborListValid
)
{
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
}
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateCustomNonbondedPeriodicByWarpForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threads
,
sharedPerThread
*
threads
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateCustomNonbondedPeriodicForces_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
threads
,
sharedPerThread
*
threads
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kCalculateCustomNonbondedPeriodicForces"
);
break
;
}
}
platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for evalauating custom nonbonded forces. It is included
* several times in kCalculateCustomNonbondedForces.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
void
METHOD_NAME
(
kCalculateCustomNonbonded
,
Forces_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
float
stack
[];
volatile
Atom
*
sA
=
(
volatile
Atom
*
)
&
stack
[
cSim
.
customExpressionStackSize
*
blockDim
.
x
];
float
*
variables
=
(
float
*
)
&
sA
[
blockDim
.
x
];
unsigned
int
totalWarps
=
gridDim
.
x
*
blockDim
.
x
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
float
totalEnergy
=
0
.
0
f
;
#ifdef USE_CUTOFF
volatile
float3
*
tempBuffer
=
(
volatile
float3
*
)
&
variables
[
9
*
blockDim
.
x
];
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float4
apos
;
// Local atom x, y, z, q
float3
af
;
// Local atom fx, fy, fz
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
tj
=
tgx
;
volatile
Atom
*
psA
=
&
sA
[
tbx
];
unsigned
int
i
=
x
+
tgx
;
apos
=
cSim
.
pPosq
[
i
];
float4
params
=
cSim
.
pCustomParams
[
i
];
af
.
x
=
0
.
0
f
;
af
.
y
=
0
.
0
f
;
af
.
z
=
0
.
0
f
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
params
.
x
=
params
.
x
;
sA
[
threadIdx
.
x
].
params
.
y
=
params
.
y
;
sA
[
threadIdx
.
x
].
params
.
z
=
params
.
z
;
sA
[
threadIdx
.
x
].
params
.
w
=
params
.
w
;
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
unsigned
int
excl
=
cSim
.
pExclusion
[
cSim
.
pExclusionIndex
[
cell
]
+
tgx
];
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
// Record the parameters.
VARIABLE
(
0
)
=
params
.
x
;
VARIABLE
(
1
)
=
params
.
y
;
VARIABLE
(
2
)
=
params
.
z
;
VARIABLE
(
3
)
=
params
.
w
;
VARIABLE
(
4
)
=
psA
[
j
].
params
.
x
;
VARIABLE
(
5
)
=
psA
[
j
].
params
.
y
;
VARIABLE
(
6
)
=
psA
[
j
].
params
.
z
;
VARIABLE
(
7
)
=
psA
[
j
].
params
.
w
;
// Compute the force.
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r
=
sqrtf
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
float
invR
=
1
.
0
f
/
r
;
VARIABLE
(
8
)
=
r
;
float
dEdR
=
-
kEvaluateExpression_kernel
(
&
forceExp
,
stack
,
variables
)
*
invR
;
float
energy
=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);
#ifdef USE_CUTOFF
if
(
!
(
excl
&
0x1
)
||
r
>
cSim
.
nonbondedCutoff
)
#else
if
(
!
(
excl
&
0x1
))
#endif
{
dEdR
=
0
.
0
f
;
energy
=
0
.
0
f
;
}
totalEnergy
+=
0
.
5
f
*
energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
float4
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
cSim
.
pForce4
[
offset
]
=
of
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
params
.
x
=
cSim
.
pCustomParams
[
j
].
x
;
sA
[
threadIdx
.
x
].
params
.
y
=
cSim
.
pCustomParams
[
j
].
y
;
sA
[
threadIdx
.
x
].
params
.
z
=
cSim
.
pCustomParams
[
j
].
z
;
sA
[
threadIdx
.
x
].
params
.
w
=
cSim
.
pCustomParams
[
j
].
w
;
}
sA
[
threadIdx
.
x
].
fx
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fy
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fz
=
0
.
0
f
;
if
(
!
bExclusionFlag
)
{
#ifdef USE_CUTOFF
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
if
(
flags
==
0xFFFFFFFF
)
#endif
{
// Compute all interactions within this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
// Record the parameters.
VARIABLE
(
0
)
=
params
.
x
;
VARIABLE
(
1
)
=
params
.
y
;
VARIABLE
(
2
)
=
params
.
z
;
VARIABLE
(
3
)
=
params
.
w
;
VARIABLE
(
4
)
=
psA
[
tj
].
params
.
x
;
VARIABLE
(
5
)
=
psA
[
tj
].
params
.
y
;
VARIABLE
(
6
)
=
psA
[
tj
].
params
.
z
;
VARIABLE
(
7
)
=
psA
[
tj
].
params
.
w
;
// Compute the force.
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r
=
sqrtf
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
float
invR
=
1
.
0
f
/
r
;
VARIABLE
(
8
)
=
r
;
float
dEdR
=
-
kEvaluateExpression_kernel
(
&
forceExp
,
stack
,
variables
)
*
invR
;
float
energy
=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);
#ifdef USE_CUTOFF
if
(
r
>
cSim
.
nonbondedCutoff
)
{
dEdR
=
0
.
0
f
;
energy
=
0
.
0
f
;
}
#endif
totalEnergy
+=
energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
// Record the parameters.
VARIABLE
(
0
)
=
params
.
x
;
VARIABLE
(
1
)
=
params
.
y
;
VARIABLE
(
2
)
=
params
.
z
;
VARIABLE
(
3
)
=
params
.
w
;
VARIABLE
(
4
)
=
psA
[
j
].
params
.
x
;
VARIABLE
(
5
)
=
psA
[
j
].
params
.
y
;
VARIABLE
(
6
)
=
psA
[
j
].
params
.
z
;
VARIABLE
(
7
)
=
psA
[
j
].
params
.
w
;
// Compute the force.
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r
=
sqrtf
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
float
invR
=
1
.
0
f
/
r
;
VARIABLE
(
8
)
=
r
;
float
dEdR
=
-
kEvaluateExpression_kernel
(
&
forceExp
,
stack
,
variables
)
*
invR
;
float
energy
=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);
#ifdef USE_CUTOFF
if
(
r
>
cSim
.
nonbondedCutoff
)
{
dEdR
=
0
.
0
f
;
energy
=
0
.
0
f
;
}
#endif
totalEnergy
+=
energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
tempBuffer
[
threadIdx
.
x
].
x
=
dx
;
tempBuffer
[
threadIdx
.
x
].
y
=
dy
;
tempBuffer
[
threadIdx
.
x
].
z
=
dz
;
// Sum the forces on atom j.
if
(
tgx
%
2
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
1
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
1
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
1
].
z
;
}
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
2
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
2
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
2
].
z
;
}
if
(
tgx
%
8
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
4
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
4
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
4
].
z
;
}
if
(
tgx
%
16
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
8
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
8
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
8
].
z
;
}
if
(
tgx
==
0
)
{
psA
[
j
].
fx
+=
tempBuffer
[
threadIdx
.
x
].
x
+
tempBuffer
[
threadIdx
.
x
+
16
].
x
;
psA
[
j
].
fy
+=
tempBuffer
[
threadIdx
.
x
].
y
+
tempBuffer
[
threadIdx
.
x
+
16
].
y
;
psA
[
j
].
fz
+=
tempBuffer
[
threadIdx
.
x
].
z
+
tempBuffer
[
threadIdx
.
x
+
16
].
z
;
}
}
}
}
#endif
}
else
// bExclusion
{
// Read fixed atom data into registers and GRF
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
unsigned
int
excl
=
cSim
.
pExclusion
[
cSim
.
pExclusionIndex
[
cell
]
+
tgx
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
// Record the parameters.
VARIABLE
(
0
)
=
params
.
x
;
VARIABLE
(
1
)
=
params
.
y
;
VARIABLE
(
2
)
=
params
.
z
;
VARIABLE
(
3
)
=
params
.
w
;
VARIABLE
(
4
)
=
psA
[
tj
].
params
.
x
;
VARIABLE
(
5
)
=
psA
[
tj
].
params
.
y
;
VARIABLE
(
6
)
=
psA
[
tj
].
params
.
z
;
VARIABLE
(
7
)
=
psA
[
tj
].
params
.
w
;
// Compute the force.
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r
=
sqrtf
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
float
invR
=
1
.
0
f
/
r
;
VARIABLE
(
8
)
=
r
;
float
dEdR
=
-
kEvaluateExpression_kernel
(
&
forceExp
,
stack
,
variables
)
*
invR
;
float
energy
=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);
#ifdef USE_CUTOFF
if
(
!
(
excl
&
0x1
)
||
r
>
cSim
.
nonbondedCutoff
)
#else
if
(
!
(
excl
&
0x1
))
#endif
{
dEdR
=
0
.
0
f
;
energy
=
0
.
0
f
;
}
totalEnergy
+=
energy
;
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
// Write results
float4
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
cSim
.
pForce4
[
offset
]
=
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset
=
y
+
tgx
+
warp
*
cSim
.
stride
;
#else
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
+=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
+=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4
[
offset
]
=
of
;
lasty
=
y
;
}
pos
++
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
totalEnergy
;
}
platforms/cuda-old/src/kernels/kCalculateCustomTorsionForces.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
Expression
<
256
>
forceExp
;
static
__constant__
Expression
<
256
>
energyExp
;
#include "kEvaluateExpression.h"
void
SetCalculateCustomTorsionForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateCustomTorsionForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
void
SetCustomTorsionForceExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
forceExp
,
&
expression
,
sizeof
(
forceExp
));
RTERROR
(
status
,
"SetCustomTorsionForceExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomTorsionEnergyExpression
(
const
Expression
<
256
>&
expression
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
energyExp
,
&
expression
,
sizeof
(
energyExp
));
RTERROR
(
status
,
"SetCustomTorsionEnergyExpression: cudaMemcpyToSymbol failed"
);
}
void
SetCustomTorsionGlobalParams
(
const
vector
<
float
>&
paramValues
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
globalParams
,
&
paramValues
[
0
],
paramValues
.
size
()
*
sizeof
(
float
));
RTERROR
(
status
,
"SetCustomTorsionGlobalParams: cudaMemcpyToSymbol failed"
);
}
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
dp = DOT3(v1, v2); \
float norm1 = DOT3(v1, v1); \
float norm2 = DOT3(v2, v2); \
dp /= sqrtf(norm1 * norm2); \
dp = min(dp, 1.0f); \
dp = max(dp, -1.0f); \
}
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
float dp; \
GETNORMEDDOTPRODUCT(v1, v2, dp); \
if (dp > 0.99f || dp < -0.99f) { \
float3 cross = CROSS_PRODUCT(v1, v2); \
float scale = DOT3(v1, v1)*DOT3(v2, v2); \
angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
if (dp < 0.0f) \
angle = LOCAL_HACK_PI-angle; \
} \
else { \
angle = acosf(dp); \
} \
}
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
cp0 = CROSS_PRODUCT(vector1, vector2); \
cp1 = CROSS_PRODUCT(vector2, vector3); \
GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#endif
void
kCalculateCustomTorsionForces_kernel
()
{
extern
__shared__
float
stack
[];
float
*
variables
=
(
float
*
)
&
stack
[
cSim
.
customExpressionStackSize
*
blockDim
.
x
];
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
float
totalEnergy
=
0.0
f
;
while
(
pos
<
cSim
.
customTorsions
)
{
int4
atom
=
cSim
.
pCustomTorsionID1
[
pos
];
int4
atom2
=
cSim
.
pCustomTorsionID2
[
pos
];
float4
params
=
cSim
.
pCustomTorsionParams
[
pos
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float4
a3
=
cSim
.
pPosq
[
atom
.
z
];
float4
a4
=
cSim
.
pPosq
[
atom
.
w
];
float3
v0
=
make_float3
(
a1
.
x
-
a2
.
x
,
a1
.
y
-
a2
.
y
,
a1
.
z
-
a2
.
z
);
float3
v1
=
make_float3
(
a3
.
x
-
a2
.
x
,
a3
.
y
-
a2
.
y
,
a3
.
z
-
a2
.
z
);
float3
v2
=
make_float3
(
a3
.
x
-
a4
.
x
,
a3
.
y
-
a4
.
y
,
a3
.
z
-
a4
.
z
);
float3
cp0
,
cp1
;
float
dihedralAngle
;
GETDIHEDRALANGLEBETWEENTHREEVECTORS
(
v0
,
v1
,
v2
,
v0
,
cp0
,
cp1
,
dihedralAngle
);
VARIABLE
(
0
)
=
dihedralAngle
;
VARIABLE
(
1
)
=
params
.
x
;
VARIABLE
(
2
)
=
params
.
y
;
VARIABLE
(
3
)
=
params
.
z
;
VARIABLE
(
4
)
=
params
.
w
;
float
dEdAngle
=
kEvaluateExpression_kernel
(
&
forceExp
,
stack
,
variables
);
totalEnergy
+=
kEvaluateExpression_kernel
(
&
energyExp
,
stack
,
variables
);
float
normBC
=
sqrtf
(
DOT3
(
v1
,
v1
));
float
dp
=
1.0
f
/
DOT3
(
v1
,
v1
);
float4
ff
=
make_float4
((
-
dEdAngle
*
normBC
)
/
DOT3
(
cp0
,
cp0
),
DOT3
(
v0
,
v1
)
*
dp
,
DOT3
(
v2
,
v1
)
*
dp
,
(
dEdAngle
*
normBC
)
/
DOT3
(
cp1
,
cp1
));
float3
internalF0
=
make_float3
(
ff
.
x
*
cp0
.
x
,
ff
.
x
*
cp0
.
y
,
ff
.
x
*
cp0
.
z
);
float3
internalF3
=
make_float3
(
ff
.
w
*
cp1
.
x
,
ff
.
w
*
cp1
.
y
,
ff
.
w
*
cp1
.
z
);
float3
s
=
make_float3
(
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
,
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
,
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
);
unsigned
int
offsetA
=
atom
.
x
+
atom2
.
x
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom2
.
y
*
cSim
.
stride
;
unsigned
int
offsetC
=
atom
.
z
+
atom2
.
z
*
cSim
.
stride
;
unsigned
int
offsetD
=
atom
.
w
+
atom2
.
w
*
cSim
.
stride
;
float4
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
cSim
.
pForce4
[
offsetB
];
float4
forceC
=
cSim
.
pForce4
[
offsetC
];
float4
forceD
=
cSim
.
pForce4
[
offsetD
];
forceA
.
x
+=
internalF0
.
x
;
forceA
.
y
+=
internalF0
.
y
;
forceA
.
z
+=
internalF0
.
z
;
forceB
.
x
+=
-
internalF0
.
x
+
s
.
x
;
forceB
.
y
+=
-
internalF0
.
y
+
s
.
y
;
forceB
.
z
+=
-
internalF0
.
z
+
s
.
z
;
forceC
.
x
+=
-
internalF3
.
x
-
s
.
x
;
forceC
.
y
+=
-
internalF3
.
y
-
s
.
y
;
forceC
.
z
+=
-
internalF3
.
z
-
s
.
z
;
forceD
.
x
+=
internalF3
.
x
;
forceD
.
y
+=
internalF3
.
y
;
forceD
.
z
+=
internalF3
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
cSim
.
pForce4
[
offsetC
]
=
forceC
;
cSim
.
pForce4
[
offsetD
]
=
forceD
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
totalEnergy
;
}
void
kCalculateCustomTorsionForces
(
gpuContext
gpu
)
{
// printf("kCalculateCustomTorsionForces\n");
int
memoryPerThread
=
(
gpu
->
sim
.
customExpressionStackSize
+
9
)
*
sizeof
(
float
);
int
maxThreads
=
(
gpu
->
sharedMemoryPerBlock
-
16
)
/
memoryPerThread
;
int
threads
=
min
(
gpu
->
sim
.
localForces_threads_per_block
,
(
maxThreads
/
64
)
*
64
);
kCalculateCustomTorsionForces_kernel
<<<
gpu
->
sim
.
blocks
,
threads
,
memoryPerThread
*
threads
>>>
();
LAUNCHERROR
(
"kCalculateCustomTorsionForces"
);
}
platforms/cuda-old/src/kernels/kCalculateGBVIAux.h
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Mark Friedrichs *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#ifndef __GpuGBVIAUX_H__
#define __GpuGBVIAUX_H__
/**
* This file contains subroutines used in evaluating quantities associated w/ the GB/VI function
*/
static
__device__
float
getGBVI_L
(
float
r
,
float
x
,
float
S
)
{
float
rInv
=
1
.
0
f
/
r
;
float
xInv
=
1
.
0
f
/
x
;
float
xInv2
=
xInv
*
xInv
;
float
diff2
=
(
r
+
S
)
*
(
r
-
S
);
return
(
1
.
5
f
*
xInv2
)
*
(
(
0
.
25
f
*
rInv
)
-
(
xInv
/
3
.
0
f
)
+
(
0
.
125
f
*
diff2
*
xInv2
*
rInv
)
);
}
static
__device__
float
getGBVI_Volume
(
float
r
,
float
R
,
float
S
)
{
float
addOn
=
0
.
0
f
;
int
mask
=
1
;
float
lowerBound
=
(
r
-
S
);
float
diff
=
(
S
-
R
);
if
(
fabsf
(
diff
)
<
r
){
lowerBound
=
R
>
lowerBound
?
R
:
lowerBound
;
}
else
if
(
r
<=
diff
){
addOn
=
(
1
.
0
f
/
(
R
*
R
*
R
));
}
else
{
mask
=
0
;
}
float
s2
=
getGBVI_L
(
r
,
lowerBound
,
S
);
float
s1
=
getGBVI_L
(
r
,
(
r
+
S
),
S
);
s1
=
mask
?
(
s1
-
s2
+
addOn
)
:
0
.
0
f
;
return
s1
;
}
static
__device__
float
getGBVI_dL_dr
(
float
r
,
float
x
,
float
S
)
{
float
rInv
=
1
.
0
f
/
r
;
float
rInv2
=
rInv
*
rInv
;
float
xInv
=
1
.
0
f
/
x
;
float
xInv2
=
xInv
*
xInv
;
float
xInv3
=
xInv2
*
xInv
;
float
diff2
=
(
r
+
S
)
*
(
r
-
S
);
return
(
(
-
1
.
5
f
*
xInv2
*
rInv2
)
*
(
0
.
25
f
+
0
.
125
f
*
diff2
*
xInv2
)
+
0
.
375
f
*
xInv3
*
xInv
);
}
static
__device__
float
getGBVI_dL_drNew
(
float
r
,
float
x
,
float
S
)
{
float
rInv
=
1
.
0
f
/
r
;
float
rInv2
=
rInv
*
rInv
;
float
xInv
=
1
.
0
f
/
x
;
float
xInv2
=
xInv
*
xInv
;
float
t1
=
(
S
*
rInv
);
t1
=
1
.
0
f
+
t1
*
t1
;
return
(
-
0
.
375
f
*
xInv2
)
*
(
rInv2
-
0
.
5
f
*
xInv2
*
t1
);
}
static
__device__
float
getGBVI_dL_dx
(
float
r
,
float
x
,
float
S
)
{
float
rInv
=
1
.
0
f
/
r
;
float
xInv
=
1
.
0
f
/
x
;
float
xInv2
=
xInv
*
xInv
;
float
xInv3
=
xInv2
*
xInv
;
float
diff
=
(
r
+
S
)
*
(
r
-
S
);
return
(
(
-
1
.
5
f
*
xInv3
)
*
(
(
0
.
5
f
*
rInv
)
-
xInv
+
(
0
.
5
f
*
diff
*
xInv2
*
rInv
)
));
}
static
__device__
float
getGBVI_dE2Old
(
float
r
,
float
R
,
float
S
,
float
bornForce
)
{
float
diff
=
S
-
R
;
float
absDiff
=
fabsf
(
S
-
R
);
float
dE
=
getGBVI_dL_dr
(
r
,
r
+
S
,
S
)
+
getGBVI_dL_dx
(
r
,
r
+
S
,
S
);
float
mask
;
float
lowerBound
;
if
(
(
R
>
(
r
-
S
))
&&
(
absDiff
<
r
)
){
mask
=
0
.
0
f
;
lowerBound
=
R
;
}
else
{
mask
=
1
.
0
f
;
lowerBound
=
(
r
-
S
);
}
float
dE2
=
getGBVI_dL_dr
(
r
,
lowerBound
,
S
)
+
mask
*
getGBVI_dL_dx
(
r
,
lowerBound
,
S
);
dE
-=
(
absDiff
>=
r
)
&&
r
>=
diff
?
0
.
0
f
:
dE2
;
dE
=
r
<
-
diff
?
0
.
0
f
:
dE
;
dE
*=
(
(
r
>
1.0e-08
f
)
?
(
bornForce
/
r
)
:
0
.
0
f
);
return
(
-
dE
);
}
static
__device__
float
getGBVI_dE2
(
float
r
,
float
R
,
float
S
,
float
bornForce
)
{
float
diff
=
S
-
R
;
float
dE
=
0
.
0
f
;
if
(
fabsf
(
diff
)
<
r
){
dE
=
getGBVI_dL_dr
(
r
,
r
+
S
,
S
)
+
getGBVI_dL_dx
(
r
,
r
+
S
,
S
);
float
lowerBound
;
float
mask
;
if
(
R
>
(
r
-
S
)
){
lowerBound
=
R
;
mask
=
0
.
0
f
;
}
else
{
lowerBound
=
r
-
S
;
mask
=
1
.
0
f
;
}
dE
-=
getGBVI_dL_dr
(
r
,
lowerBound
,
S
)
+
mask
*
getGBVI_dL_dx
(
r
,
lowerBound
,
S
);
}
else
if
(
r
<
(
S
-
R
)
){
dE
=
getGBVI_dL_dr
(
r
,
r
+
S
,
S
)
+
getGBVI_dL_dx
(
r
,
r
+
S
,
S
);
dE
-=
(
getGBVI_dL_dr
(
r
,
r
-
S
,
S
)
+
getGBVI_dL_dx
(
r
,
r
-
S
,
S
)
);
}
dE
*=
(
(
r
>
1.0e-08
f
)
?
(
bornForce
/
r
)
:
0
.
0
f
);
return
(
-
dE
);
}
static
__device__
float
getGBVIBornForce2
(
float
bornRadius
,
float
R
,
float
bornForce
,
float
gamma
)
{
float
ratio
=
(
R
/
bornRadius
);
float
returnBornForce
=
bornForce
+
(
3
.
0
f
*
gamma
*
ratio
*
ratio
*
ratio
)
/
bornRadius
;
// 'cavity' term
float
br2
=
bornRadius
*
bornRadius
;
returnBornForce
*=
(
1
.
0
f
/
3
.
0
f
)
*
br2
*
br2
;
return
returnBornForce
;
}
#endif // __GpuGBVIAUX_H__
platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
struct
Atom
{
float
x
;
float
y
;
float
z
;
float
r
;
float
sr
;
float
sum
;
float
gamma
;
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateGBVIBornSumSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateGBVIBornSumSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateGBVIBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateGBVIBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateGBVIBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateGBVIBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateGBVIBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateGBVIBornSum.h"
/**---------------------------------------------------------------------------------------
Compute quintic spline value and associated derviative
@param x value to compute spline at
@param rl lower cutoff value
@param ru upper cutoff value
@param outValue value of spline at x
@param outDerivative value of derivative of spline at x
--------------------------------------------------------------------------------------- */
static
__device__
void
quinticSpline_kernel
(
float
x
,
float
rl
,
float
ru
,
float
*
outValue
,
float
*
outDerivative
){
// ---------------------------------------------------------------------------------------
const
float
one
=
1.0
f
;
const
float
minusSix
=
-
6.0
f
;
const
float
minusTen
=
-
10.0
f
;
const
float
minusThirty
=
-
30.0
f
;
const
float
fifteen
=
15.0
f
;
const
float
sixty
=
60.0
f
;
// ---------------------------------------------------------------------------------------
float
numerator
=
x
-
rl
;
float
denominator
=
ru
-
rl
;
float
ratio
=
numerator
/
denominator
;
float
ratio2
=
ratio
*
ratio
;
float
ratio3
=
ratio2
*
ratio
;
*
outValue
=
one
+
ratio3
*
(
minusTen
+
fifteen
*
ratio
+
minusSix
*
ratio2
);
*
outDerivative
=
ratio2
*
(
minusThirty
+
sixty
*
ratio
+
minusThirty
*
ratio2
)
/
denominator
;
}
/**---------------------------------------------------------------------------------------
Compute Born radii based on Eq. 3 of Labute paper [JCC 29 p. 1693-1698 2008])
and quintic splice switching function
@param atomicRadius3 atomic radius cubed
@param bornSum Born sum (volume integral)
@param bornRadius output Born radius
@param switchDeriviative output switching function deriviative
--------------------------------------------------------------------------------------- */
__device__
void
computeBornRadiiUsingQuinticSpline
(
float
atomicRadius3
,
float
bornSum
,
float
*
bornRadius
,
float
*
switchDeriviative
){
// ---------------------------------------------------------------------------------------
const
float
zero
=
0.0
f
;
const
float
one
=
1.0
f
;
const
float
minusOneThird
=
(
-
1.0
f
/
3.0
f
);
// ---------------------------------------------------------------------------------------
// R = [ S(V)*(A - V) ]**(-1/3)
// S(V) = 1 V < L
// S(V) = qSpline + U/(A-V) L < V < A
// S(V) = U/(A-V) U < V
// dR/dr = (-1/3)*[ S(V)*(A - V) ]**(-4/3)*[ d{ S(V)*(A-V) }/dr
// d{ S(V)*(A-V) }/dr = (dV/dr)*[ (A-V)*dS/dV - S(V) ]
// (A - V)*dS/dV - S(V) = 0 - 1 V < L
// (A - V)*dS/dV - S(V) = (A-V)*d(qSpline) + (A-V)*U/(A-V)**2 - qSpline - U/(A-V)
// = (A-V)*d(qSpline) - qSpline L < V < A**(-3)
// (A - V)*dS/dV - S(V) = (A-V)*U*/(A-V)**2 - U/(A-V) = 0 U < V
float
splineL
=
cSim
.
gbviQuinticLowerLimitFactor
*
atomicRadius3
;
float
sum
;
if
(
bornSum
>
splineL
){
if
(
bornSum
<
atomicRadius3
){
float
splineValue
,
splineDerivative
;
quinticSpline_kernel
(
bornSum
,
splineL
,
atomicRadius3
,
&
splineValue
,
&
splineDerivative
);
sum
=
(
atomicRadius3
-
bornSum
)
*
splineValue
+
cSim
.
gbviQuinticUpperBornRadiusLimit
;
*
switchDeriviative
=
splineValue
-
(
atomicRadius3
-
bornSum
)
*
splineDerivative
;
}
else
{
sum
=
cSim
.
gbviQuinticUpperBornRadiusLimit
;
*
switchDeriviative
=
zero
;
}
}
else
{
sum
=
atomicRadius3
-
bornSum
;
*
switchDeriviative
=
one
;
}
*
bornRadius
=
powf
(
sum
,
minusOneThird
);
}
__global__
void
kReduceGBVIBornSum_kernel
()
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
while
(
pos
<
cSim
.
atoms
)
{
float
sum
=
0.0
f
;
float
*
pSt
=
cSim
.
pBornSum
+
pos
;
float4
atom
=
cSim
.
pGBVIData
[
pos
];
// Get summed Born data
for
(
int
i
=
0
;
i
<
cSim
.
nonbondOutputBuffers
;
i
++
)
{
sum
+=
*
pSt
;
pSt
+=
cSim
.
stride
;
}
// Now calculate Born radius
float
Rinv
=
1.0
f
/
atom
.
x
;
Rinv
=
Rinv
*
Rinv
*
Rinv
;
if
(
cSim
.
gbviBornRadiusScalingMethod
==
0
){
sum
=
Rinv
-
sum
;
cSim
.
pBornRadii
[
pos
]
=
powf
(
sum
,
(
-
1.0
f
/
3.0
f
)
);
cSim
.
pGBVISwitchDerivative
[
pos
]
=
1.0
f
;
}
else
{
float
bornRadius
;
float
switchDeriviative
;
computeBornRadiiUsingQuinticSpline
(
Rinv
,
sum
,
&
bornRadius
,
&
switchDeriviative
);
cSim
.
pBornRadii
[
pos
]
=
bornRadius
;
cSim
.
pGBVISwitchDerivative
[
pos
]
=
switchDeriviative
;
}
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
void
kReduceGBVIBornSum
(
gpuContext
gpu
)
{
//printf("kReduceGBVIBornSum\n");
kReduceGBVIBornSum_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
gpu
->
bRecalculateBornRadii
=
false
;
LAUNCHERROR
(
"kReduceGBVIBornSum"
);
}
static
int
isNanOrInfinity
(
float
number
){
return
(
number
!=
number
||
number
==
std
::
numeric_limits
<
float
>::
infinity
()
||
number
==
-
std
::
numeric_limits
<
float
>::
infinity
())
?
1
:
0
;
}
void
kPrintGBVI
(
gpuContext
gpu
,
std
::
string
callId
,
int
call
,
FILE
*
log
)
{
gpu
->
psGBVIData
->
Download
();
gpu
->
psBornRadii
->
Download
();
gpu
->
psGBVISwitchDerivative
->
Download
();
gpu
->
psBornForce
->
Download
();
gpu
->
psPosq4
->
Download
();
gpu
->
psSigEps2
->
Download
();
int
printOnlyOnNan
=
1
;
int
foundNan
=
0
;
if
(
printOnlyOnNan
){
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
sim
.
paddedNumberOfAtoms
&&
foundNan
==
0
;
ii
++
){
foundNan
+=
isNanOrInfinity
(
gpu
->
psBornRadii
->
_pSysData
[
ii
]
);
foundNan
+=
isNanOrInfinity
(
gpu
->
psBornForce
->
_pSysData
[
ii
]
);
foundNan
+=
isNanOrInfinity
(
gpu
->
psGBVISwitchDerivative
->
_pSysData
[
ii
]
);
}
if
(
foundNan
){
log
=
stderr
;
(
void
)
fprintf
(
log
,
"kPrintGBVI found nan
\n
"
,
gpu
->
sim
.
paddedNumberOfAtoms
);
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
log
,
"%6d %15.7e %15.7e %15.7e
\n
"
,
ii
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
x
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
y
,
gpu
->
psPosq4
->
_pSysData
[
ii
].
z
);
}
}
}
if
(
!
printOnlyOnNan
||
foundNan
){
(
void
)
fprintf
(
log
,
"kPrintGBVI Cuda comp bR bF prm sigeps2
\n
"
);
(
void
)
fprintf
(
stderr
,
"kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u
\n
"
,
gpu
->
bOutputBufferPerWarp
,
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
gpu
->
sim
.
workUnits
,
gpu
->
psWorkUnit
->
_pSysStream
[
0
][
0
],
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
);
(
void
)
fprintf
(
stderr
,
"bR bF swd r scR ...
\n
"
);
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
log
,
"%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e
\n
"
,
ii
,
gpu
->
psBornRadii
->
_pSysData
[
ii
],
gpu
->
psBornForce
->
_pSysData
[
ii
],
gpu
->
psGBVISwitchDerivative
->
_pSysData
[
ii
],
gpu
->
psGBVIData
->
_pSysData
[
ii
].
x
,
gpu
->
psGBVIData
->
_pSysData
[
ii
].
y
,
gpu
->
psGBVIData
->
_pSysData
[
ii
].
z
,
gpu
->
psGBVIData
->
_pSysData
[
ii
].
w
,
gpu
->
psSigEps2
->
_pSysData
[
ii
].
x
,
gpu
->
psSigEps2
->
_pSysData
[
ii
].
y
);
}
if
(
foundNan
){
exit
(
0
);
}
}
}
void
kCalculateGBVIBornSum
(
gpuContext
gpu
)
{
//printf("kCalculateGBVIBornSum\n");
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateGBVIN2ByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
}
else
{
kCalculateGBVIN2BornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
}
break
;
case
CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateGBVICutoffByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateGBVICutoffBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
case
PERIODIC
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateGBVIPeriodicByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateGBVIPeriodicBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
}
LAUNCHERROR
(
"kCalculateGBVIBornSum"
);
}
platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for calculating Born sums. It is included
* several times in kCalculateGBVIBornSum.cu with different #defines to generate
* different versions of the kernels.
*/
#include "kCalculateGBVIAux.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_NONBOND_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateGBVI
,
BornSum_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
volatile
Atom
sA
[];
unsigned
int
totalWarps
=
cSim
.
nonbond_blocks
*
cSim
.
nonbond_threads_per_block
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
#ifdef USE_CUTOFF
volatile
float
*
tempBuffer
=
(
volatile
float
*
)
&
sA
[
cSim
.
nonbond_threads_per_block
];
#endif
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float
dx
;
float
dy
;
float
dz
;
float
r2
;
float
r
;
// forces tgx into interval [0,31]
// forces tbx 0
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
tj
=
tgx
;
volatile
Atom
*
psA
=
&
sA
[
tbx
];
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float4
ar
=
cSim
.
pGBVIData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
r
=
ar
.
x
;
sA
[
threadIdx
.
x
].
sr
=
ar
.
y
;
apos
.
w
=
0
.
0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#if defined USE_CUTOFF
if
(
i
<
cSim
.
atoms
&&
x
+
j
<
cSim
.
atoms
&&
r2
<
cSim
.
nonbondedCutoffSqr
&&
j
!=
tgx
)
#else
if
(
i
<
cSim
.
atoms
&&
x
+
j
<
cSim
.
atoms
&&
j
!=
tgx
)
#endif
{
r
=
sqrtf
(
r2
);
apos
.
w
+=
getGBVI_Volume
(
r
,
ar
.
x
,
psA
[
j
].
sr
);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
+=
apos
.
w
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
#endif
}
else
{
// Read fixed atom data into registers and GRF
unsigned
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float4
temp1
=
cSim
.
pGBVIData
[
j
];
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float4
ar
=
cSim
.
pGBVIData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sum
=
0
.
0
f
;
apos
.
w
=
0
.
0
f
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
if
(
flags
==
0xFFFFFFFF
)
#endif
{
// Compute all interactions within this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#ifdef USE_CUTOFF
if
(
i
<
cSim
.
atoms
&&
y
+
tj
<
cSim
.
atoms
&&
r2
<
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
<
cSim
.
atoms
&&
y
+
tj
<
cSim
.
atoms
)
#endif
{
r
=
sqrtf
(
r2
);
apos
.
w
+=
getGBVI_Volume
(
r
,
ar
.
x
,
psA
[
tj
].
sr
);
psA
[
tj
].
sum
+=
getGBVI_Volume
(
r
,
psA
[
tj
].
r
,
ar
.
y
);
}
tj
=
(
tj
-
1
)
&
(
GRID
-
1
);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
tempBuffer
[
threadIdx
.
x
]
=
0
.
0
f
;
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#ifdef USE_CUTOFF
if
(
i
<
cSim
.
atoms
&&
y
+
j
<
cSim
.
atoms
&&
r2
<
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
<
cSim
.
atoms
&&
y
+
j
<
cSim
.
atoms
)
#endif
{
r
=
sqrtf
(
r2
);
apos
.
w
+=
getGBVI_Volume
(
r
,
ar
.
x
,
psA
[
j
].
sr
);
tempBuffer
[
threadIdx
.
x
]
=
getGBVI_Volume
(
r
,
psA
[
j
].
r
,
ar
.
y
);
}
// Sum the terms.
if
(
tgx
%
2
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
];
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
2
];
if
(
tgx
%
8
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
4
];
if
(
tgx
%
16
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
8
];
if
(
tgx
==
0
)
psA
[
j
].
sum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
16
];
}
}
}
#endif
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
+=
apos
.
w
;
offset
=
y
+
tgx
+
warp
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
+=
sA
[
threadIdx
.
x
].
sum
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
sA
[
threadIdx
.
x
].
sum
;
#endif
}
pos
++
;
}
}
Prev
1
2
3
4
5
6
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment