Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2e451b9d
Commit
2e451b9d
authored
Dec 13, 2012
by
Peter Eastman
Browse files
Deleted the old CUDA platform
parent
352e2fc7
Changes
147
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
5387 deletions
+0
-5387
platforms/cuda-old/src/kernels/kCalculateGBVIForces2.cu
platforms/cuda-old/src/kernels/kCalculateGBVIForces2.cu
+0
-148
platforms/cuda-old/src/kernels/kCalculateGBVIForces2.h
platforms/cuda-old/src/kernels/kCalculateGBVIForces2.h
+0
-352
platforms/cuda-old/src/kernels/kCalculateLocalForces.cu
platforms/cuda-old/src/kernels/kCalculateLocalForces.cu
+0
-518
platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.cu
platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.cu
+0
-205
platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.h
platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.h
+0
-332
platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.cu
platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.cu
+0
-135
platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.h
platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.h
+0
-429
platforms/cuda-old/src/kernels/kCalculatePME.cu
platforms/cuda-old/src/kernels/kCalculatePME.cu
+0
-424
platforms/cuda-old/src/kernels/kEvaluateExpression.h
platforms/cuda-old/src/kernels/kEvaluateExpression.h
+0
-209
platforms/cuda-old/src/kernels/kFindInteractingBlocks.h
platforms/cuda-old/src/kernels/kFindInteractingBlocks.h
+0
-196
platforms/cuda-old/src/kernels/kForces.cu
platforms/cuda-old/src/kernels/kForces.cu
+0
-441
platforms/cuda-old/src/kernels/kLangevinUpdate.cu
platforms/cuda-old/src/kernels/kLangevinUpdate.cu
+0
-208
platforms/cuda-old/src/kernels/kLangevinUpdate.h
platforms/cuda-old/src/kernels/kLangevinUpdate.h
+0
-202
platforms/cuda-old/src/kernels/kMonteCarloBarostat.cu
platforms/cuda-old/src/kernels/kMonteCarloBarostat.cu
+0
-91
platforms/cuda-old/src/kernels/kRandom.cu
platforms/cuda-old/src/kernels/kRandom.cu
+0
-177
platforms/cuda-old/src/kernels/kSettle.cu
platforms/cuda-old/src/kernels/kSettle.cu
+0
-240
platforms/cuda-old/src/kernels/kShakeH.cu
platforms/cuda-old/src/kernels/kShakeH.cu
+0
-236
platforms/cuda-old/src/kernels/kVerletUpdate.cu
platforms/cuda-old/src/kernels/kVerletUpdate.cu
+0
-146
platforms/cuda-old/src/kernels/kVerletUpdate.h
platforms/cuda-old/src/kernels/kVerletUpdate.h
+0
-211
platforms/cuda-old/src/kernels/rng.cpp
platforms/cuda-old/src/kernels/rng.cpp
+0
-487
No files found.
Too many changes to show.
To preserve performance only
147 of 147+
files are displayed.
Plain diff
Email patch
platforms/cuda-old/src/kernels/kCalculateGBVIForces2.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudaKernels.h"
struct
Atom
{
float
x
;
float
y
;
float
z
;
float
r
;
float
sr
;
float
fx
;
float
fy
;
float
fz
;
float
fb
;
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateGBVIForces2Sim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateGBVIForces2Sim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
#include "kCalculateGBVIAux.h"
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateGBVIForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateGBVIForces2.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateGBVIForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateGBVIForces2.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateGBVIForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateGBVIForces2.h"
void
kCalculateGBVIForces2
(
gpuContext
gpu
)
{
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateGBVIN2ByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
else
kCalculateGBVIN2Forces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
break
;
case
CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateGBVICutoffByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateGBVICutoffForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
case
PERIODIC
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateGBVIPeriodicByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateGBVIPeriodicForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
}
LAUNCHERROR
(
"kCalculateGBVIForces2"
);
//kPrintGBVI( gpu, "kCalculateGBVIForces2", 0, stderr);
}
platforms/cuda-old/src/kernels/kCalculateGBVIForces2.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "kCalculateGBVIAux.h"
/**
* This file contains the kernel for evalauating the second stage of GBSA. It is included
* several times in kCalculateGBVIForces2.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_BORNFORCE2_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_BORNFORCE2_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_BORNFORCE2_THREADS_PER_BLOCK
,
1
)
#endif
METHOD_NAME
(
kCalculateGBVI
,
Forces2_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
volatile
Atom
sA
[];
unsigned
int
totalWarps
=
cSim
.
bornForce2_blocks
*
cSim
.
bornForce2_threads_per_block
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
#ifdef USE_CUTOFF
volatile
float3
*
tempBuffer
=
(
float3
*
)
&
sA
[
cSim
.
bornForce2_threads_per_block
];
#endif
unsigned
int
lasty
=
-
0xFFFFFFFF
;
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float4
ar
=
cSim
.
pGBVIData
[
i
];
float
fb
=
cSim
.
pBornForce
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
tj
=
tgx
;
volatile
Atom
*
psA
=
&
sA
[
tbx
];
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0
.
0
f
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
r
=
ar
.
x
;
sA
[
threadIdx
.
x
].
sr
=
ar
.
y
;
sA
[
threadIdx
.
x
].
fb
=
fb
;
for
(
unsigned
int
j
=
(
tgx
+
1
)
&
(
GRID
-
1
);
j
!=
tgx
;
j
=
(
j
+
1
)
&
(
GRID
-
1
))
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrtf
(
r2
);
// Atom I Born forces and sum
float
dE
=
getGBVI_dE2
(
r
,
ar
.
x
,
psA
[
j
].
sr
,
fb
);
#if defined USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
x
+
j
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
x
+
j
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
j
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
j
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
j
].
fz
+=
d
;
}
// Write results
float4
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
+
sA
[
threadIdx
.
x
].
fx
;
of
.
y
+=
af
.
y
+
sA
[
threadIdx
.
x
].
fy
;
of
.
z
+=
af
.
z
+
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4
[
offset
]
=
of
;
}
else
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float4
temp1
=
cSim
.
pGBVIData
[
j
];
float
fb
=
cSim
.
pBornForce
[
j
];
sA
[
threadIdx
.
x
].
fb
=
fb
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
}
#ifdef USE_CUTOFF
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
//else if (flags)
else
if
(
flags
==
0xFFFFFFFF
)
#endif
{
// Compute all interactions within this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrtf
(
r2
);
float
dE
=
getGBVI_dE2
(
r
,
ar
.
x
,
psA
[
tj
].
sr
,
fb
);
#if defined USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
tj
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
tj
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
tj
].
fz
+=
d
;
// Atom J Born sum term
dE
=
getGBVI_dE2
(
r
,
psA
[
tj
].
r
,
ar
.
y
,
psA
[
tj
].
fb
);
#if defined USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
dx
*=
dE
;
dy
*=
dE
;
dz
*=
dE
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrtf
(
r2
);
// Interleaved Atom I and J Born Forces and sum components
float
dE
=
getGBVI_dE2
(
r
,
ar
.
x
,
psA
[
j
].
sr
,
fb
);
#if defined USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
y
+
j
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
y
+
j
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
tempBuffer
[
threadIdx
.
x
].
x
=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
tempBuffer
[
threadIdx
.
x
].
y
=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
tempBuffer
[
threadIdx
.
x
].
z
=
d
;
// Atom J Born sum term
dE
=
getGBVI_dE2
(
r
,
psA
[
j
].
r
,
ar
.
y
,
psA
[
j
].
fb
);
#if defined USE_CUTOFF
if
(
i
>=
cSim
.
atoms
||
y
+
j
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
>=
cSim
.
atoms
||
y
+
j
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
dx
*=
dE
;
dy
*=
dE
;
dz
*=
dE
;
tempBuffer
[
threadIdx
.
x
].
x
+=
dx
;
tempBuffer
[
threadIdx
.
x
].
y
+=
dy
;
tempBuffer
[
threadIdx
.
x
].
z
+=
dz
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
// Sum the forces on atom j.
if
(
tgx
%
2
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
1
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
1
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
1
].
z
;
}
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
2
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
2
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
2
].
z
;
}
if
(
tgx
%
8
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
4
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
4
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
4
].
z
;
}
if
(
tgx
%
16
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
8
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
8
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
8
].
z
;
}
if
(
tgx
==
0
)
{
psA
[
j
].
fx
+=
tempBuffer
[
threadIdx
.
x
].
x
+
tempBuffer
[
threadIdx
.
x
+
16
].
x
;
psA
[
j
].
fy
+=
tempBuffer
[
threadIdx
.
x
].
y
+
tempBuffer
[
threadIdx
.
x
+
16
].
y
;
psA
[
j
].
fz
+=
tempBuffer
[
threadIdx
.
x
].
z
+
tempBuffer
[
threadIdx
.
x
+
16
].
z
;
}
}
}
}
#endif
// Write results
float4
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
cSim
.
pForce4
[
offset
]
=
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset
=
y
+
tgx
+
warp
*
cSim
.
stride
;
#else
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
+=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
+=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4
[
offset
]
=
of
;
}
lasty
=
y
;
pos
++
;
}
}
platforms/cuda-old/src/kernels/kCalculateLocalForces.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
extern
__shared__
Vectors
sV
[];
static
__constant__
cudaGmxSimulation
cSim
;
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{
\
dp
=
DOT3
(
v1
,
v2
);
\
float
norm1
=
DOT3
(
v1
,
v1
);
\
float
norm2
=
DOT3
(
v2
,
v2
);
\
dp
/=
sqrtf
(
norm1
*
norm2
);
\
dp
=
min
(
dp
,
1.0
f
);
\
dp
=
max
(
dp
,
-
1.0
f
);
\
}
#define CROSS_PRODUCT(v1, v2, c) \
c
.
x
=
v1
.
y
*
v2
.
z
-
v1
.
z
*
v2
.
y
;
\
c
.
y
=
v1
.
z
*
v2
.
x
-
v1
.
x
*
v2
.
z
;
\
c
.
z
=
v1
.
x
*
v2
.
y
-
v1
.
y
*
v2
.
x
;
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{
\
float
angle
=
acosf
(
cosine
);
\
float
deltaIdeal
=
angle
-
(
param
.
x
*
(
LOCAL_HACK_PI
/
180.0
f
));
\
dEdR
=
param
.
y
*
deltaIdeal
;
\
}
#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
{
\
float
angle
=
acosf
(
cosine
);
\
float
deltaIdeal
=
angle
-
(
param
.
x
*
(
LOCAL_HACK_PI
/
180.0
f
));
\
dEdR
=
param
.
y
*
deltaIdeal
*
deltaIdeal
;
\
}
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{
\
float
dp
;
\
GETNORMEDDOTPRODUCT
(
v1
,
v2
,
dp
);
\
if
(
dp
>
0.99
f
||
dp
<
-
0.99
f
)
{
\
float4
cross
;
\
CROSS_PRODUCT
(
v1
,
v2
,
cross
);
\
float
scale
=
DOT3
(
v1
,
v1
)
*
DOT3
(
v2
,
v2
);
\
angle
=
asinf
(
sqrtf
(
DOT3
(
cross
,
cross
)
/
scale
));
\
if
(
dp
<
0.0
f
)
\
angle
=
LOCAL_HACK_PI
-
angle
;
\
}
\
else
{
\
angle
=
acosf
(
dp
);
\
}
\
}
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{
\
CROSS_PRODUCT
(
vector1
,
vector2
,
cp0
);
\
CROSS_PRODUCT
(
vector2
,
vector3
,
cp1
);
\
GETANGLEBETWEENTWOVECTORS
(
cp0
,
cp1
,
angle
);
\
float
dp
=
DOT3
(
signVector
,
cp1
);
\
angle
=
(
dp
>=
0
)
?
angle
:
-
angle
;
\
}
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
{
\
CROSS_PRODUCT
(
vector1
,
vector2
,
cp0
);
\
CROSS_PRODUCT
(
vector2
,
vector3
,
cp1
);
\
GETANGLEBETWEENTWOVECTORS
(
cp0
,
cp1
,
angle
);
\
float
dp
=
DOT3
(
signVector
,
cp1
);
\
angle
=
(
dp
>=
0
)
?
angle
:
-
angle
;
\
cosine
=
cosf
(
angle
);
\
}
void
SetCalculateLocalForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateLocalForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_LOCALFORCES_THREADS_PER_BLOCK
,
1
)
#endif
void
kCalculateLocalForces_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
Vectors
*
A
=
&
sV
[
threadIdx
.
x
];
float
energy
=
0.0
f
;
while
(
pos
<
cSim
.
bond_offset
)
{
if
(
pos
<
cSim
.
bonds
)
{
int4
atom
=
cSim
.
pBondID
[
pos
];
float4
atomA
=
cSim
.
pPosq
[
atom
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom
.
y
];
float2
bond
=
cSim
.
pBondParameter
[
pos
];
float
dx
=
atomB
.
x
-
atomA
.
x
;
float
dy
=
atomB
.
y
-
atomA
.
y
;
float
dz
=
atomB
.
z
-
atomA
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrtf
(
r2
);
float
deltaIdeal
=
r
-
bond
.
x
;
/* E */
energy
+=
0.5
f
*
bond
.
y
*
deltaIdeal
*
deltaIdeal
;
float
dEdR
=
bond
.
y
*
deltaIdeal
;
dEdR
=
(
r
>
0.0
f
)
?
(
dEdR
/
r
)
:
0.0
f
;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
cSim
.
pForce4
[
offsetB
];
forceA
.
x
+=
dx
;
forceA
.
y
+=
dy
;
forceA
.
z
+=
dz
;
forceB
.
x
-=
dx
;
forceB
.
y
-=
dy
;
forceB
.
z
-=
dz
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
bond_angle_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
bond_offset
;
if
(
pos1
<
cSim
.
bond_angles
)
{
int4
atom1
=
cSim
.
pBondAngleID1
[
pos1
];
float2
bond_angle
=
cSim
.
pBondAngleParameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom1
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom1
.
y
];
float4
a3
=
cSim
.
pPosq
[
atom1
.
z
];
A
->
v0
.
x
=
a2
.
x
-
a1
.
x
;
A
->
v0
.
y
=
a2
.
y
-
a1
.
y
;
A
->
v0
.
z
=
a2
.
z
-
a1
.
z
;
A
->
v1
.
x
=
a2
.
x
-
a3
.
x
;
A
->
v1
.
y
=
a2
.
y
-
a3
.
y
;
A
->
v1
.
z
=
a2
.
z
-
a3
.
z
;
float3
cp
;
CROSS_PRODUCT
(
A
->
v0
,
A
->
v1
,
cp
);
float
rp
=
DOT3
(
cp
,
cp
);
//cx * cx + cy * cy + cz * cz;
rp
=
max
(
sqrtf
(
rp
),
1.0e-06
f
);
float
r21
=
DOT3
(
A
->
v0
,
A
->
v0
);
// dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
float
r23
=
DOT3
(
A
->
v1
,
A
->
v1
);
// dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
float
dot
=
DOT3
(
A
->
v0
,
A
->
v1
);
// dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
float
cosine
=
max
(
-
1.0
f
,
min
(
1.0
f
,
dot
/
sqrtf
(
r21
*
r23
)));
float
angle_energy
;
/* E */
GETENERGYGIVENANGLECOSINE
(
cosine
,
bond_angle
,
angle_energy
);
energy
+=
0.5
f
*
angle_energy
;
float
dEdR
;
GETPREFACTORSGIVENANGLECOSINE
(
cosine
,
bond_angle
,
dEdR
);
//printf("%11.4f %11.4f\n", cosine, dEdR);
float
termA
=
dEdR
/
(
r21
*
rp
);
float
termC
=
-
dEdR
/
(
r23
*
rp
);
float3
c21
;
float3
c23
;
CROSS_PRODUCT
(
A
->
v0
,
cp
,
c21
);
CROSS_PRODUCT
(
A
->
v1
,
cp
,
c23
);
c21
.
x
*=
termA
;
c21
.
y
*=
termA
;
c21
.
z
*=
termA
;
c23
.
x
*=
termC
;
c23
.
y
*=
termC
;
c23
.
z
*=
termC
;
int2
atom2
=
cSim
.
pBondAngleID2
[
pos1
];
unsigned
int
offset
=
atom1
.
x
+
atom1
.
w
*
cSim
.
stride
;
float4
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
c21
.
x
;
force
.
y
+=
c21
.
y
;
force
.
z
+=
c21
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
offset
=
atom1
.
y
+
atom2
.
x
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
force
.
x
-=
(
c21
.
x
+
c23
.
x
);
force
.
y
-=
(
c21
.
y
+
c23
.
y
);
force
.
z
-=
(
c21
.
z
+
c23
.
z
);
cSim
.
pForce4
[
offset
]
=
force
;
offset
=
atom1
.
z
+
atom2
.
y
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
c23
.
x
;
force
.
y
+=
c23
.
y
;
force
.
z
+=
c23
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
dihedral_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
bond_angle_offset
;
if
(
pos1
<
cSim
.
dihedrals
)
{
int4
atom1
=
cSim
.
pDihedralID1
[
pos1
];
float4
atomA
=
cSim
.
pPosq
[
atom1
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom1
.
y
];
float4
atomC
=
cSim
.
pPosq
[
atom1
.
z
];
float4
atomD
=
cSim
.
pPosq
[
atom1
.
w
];
A
->
v0
.
x
=
atomA
.
x
-
atomB
.
x
;
A
->
v0
.
y
=
atomA
.
y
-
atomB
.
y
;
A
->
v0
.
z
=
atomA
.
z
-
atomB
.
z
;
A
->
v1
.
x
=
atomC
.
x
-
atomB
.
x
;
A
->
v1
.
y
=
atomC
.
y
-
atomB
.
y
;
A
->
v1
.
z
=
atomC
.
z
-
atomB
.
z
;
A
->
v2
.
x
=
atomC
.
x
-
atomD
.
x
;
A
->
v2
.
y
=
atomC
.
y
-
atomD
.
y
;
A
->
v2
.
z
=
atomC
.
z
-
atomD
.
z
;
float3
cp0
,
cp1
;
float
dihedralAngle
;
GETDIHEDRALANGLEBETWEENTHREEVECTORS
(
A
->
v0
,
A
->
v1
,
A
->
v2
,
A
->
v0
,
cp0
,
cp1
,
dihedralAngle
);
float4
dihedral
=
cSim
.
pDihedralParameter
[
pos1
];
float
deltaAngle
=
dihedral
.
z
*
dihedralAngle
-
(
dihedral
.
y
*
LOCAL_HACK_PI
/
180.0
f
);
// ATTENTION: This section leads to a divergent deltaAngle values wrt
// forces and energies. We separate the case dihedral.z = n = 0, which
// is treated by the calculation of energies via a harmonic potential
/* E */
if
(
dihedral
.
z
)
energy
+=
dihedral
.
x
*
(
1.0
f
+
cosf
(
deltaAngle
));
/* E */
else
{
float
deltaAngle
=
dihedralAngle
-
dihedral
.
y
;
if
(
deltaAngle
<
-
LOCAL_HACK_PI
)
deltaAngle
+=
2.0
f
*
LOCAL_HACK_PI
;
else
if
(
deltaAngle
>
LOCAL_HACK_PI
)
deltaAngle
-=
2.0
f
*
LOCAL_HACK_PI
;
energy
+=
dihedral
.
x
*
deltaAngle
*
deltaAngle
;
}
float
sinDeltaAngle
=
sinf
(
deltaAngle
);
float
dEdAngle
=
-
dihedral
.
x
*
dihedral
.
z
*
sinDeltaAngle
;
float
normCross1
=
DOT3
(
cp0
,
cp0
);
float
normBC
=
sqrtf
(
DOT3
(
A
->
v1
,
A
->
v1
));
float4
ff
;
ff
.
x
=
(
-
dEdAngle
*
normBC
)
/
normCross1
;
float
normCross2
=
DOT3
(
cp1
,
cp1
);
ff
.
w
=
(
dEdAngle
*
normBC
)
/
normCross2
;
float
dp
=
1.0
f
/
DOT3
(
A
->
v1
,
A
->
v1
);
ff
.
y
=
DOT3
(
A
->
v0
,
A
->
v1
)
*
dp
;
ff
.
z
=
DOT3
(
A
->
v2
,
A
->
v1
)
*
dp
;
int4
atom2
=
cSim
.
pDihedralID2
[
pos1
];
float3
internalF0
;
float3
internalF3
;
float3
s
;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned
int
offset
=
atom1
.
x
+
atom2
.
x
*
cSim
.
stride
;
float4
force
=
cSim
.
pForce4
[
offset
];
internalF0
.
x
=
ff
.
x
*
cp0
.
x
;
force
.
x
+=
internalF0
.
x
;
internalF0
.
y
=
ff
.
x
*
cp0
.
y
;
force
.
y
+=
internalF0
.
y
;
internalF0
.
z
=
ff
.
x
*
cp0
.
z
;
force
.
z
+=
internalF0
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
w
+
atom2
.
w
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
internalF3
.
x
=
ff
.
w
*
cp1
.
x
;
force
.
x
+=
internalF3
.
x
;
internalF3
.
y
=
ff
.
w
*
cp1
.
y
;
force
.
y
+=
internalF3
.
y
;
internalF3
.
z
=
ff
.
w
*
cp1
.
z
;
force
.
z
+=
internalF3
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s
.
x
=
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
;
s
.
y
=
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
;
s
.
z
=
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
;
offset
=
atom1
.
y
+
atom2
.
y
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF0
.
x
+
s
.
x
;
force
.
y
+=
-
internalF0
.
y
+
s
.
y
;
force
.
z
+=
-
internalF0
.
z
+
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
z
+
atom2
.
z
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF3
.
x
-
s
.
x
;
force
.
y
+=
-
internalF3
.
y
-
s
.
y
;
force
.
z
+=
-
internalF3
.
z
-
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
// Ryckaert Bellemans dihedrals
while
(
pos
<
cSim
.
rb_dihedral_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
dihedral_offset
;
if
(
pos1
<
cSim
.
rb_dihedrals
)
{
int4
atom1
=
cSim
.
pRbDihedralID1
[
pos1
];
float4
atomA
=
cSim
.
pPosq
[
atom1
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom1
.
y
];
float4
atomC
=
cSim
.
pPosq
[
atom1
.
z
];
float4
atomD
=
cSim
.
pPosq
[
atom1
.
w
];
A
->
v0
.
x
=
atomA
.
x
-
atomB
.
x
;
A
->
v0
.
y
=
atomA
.
y
-
atomB
.
y
;
A
->
v0
.
z
=
atomA
.
z
-
atomB
.
z
;
A
->
v1
.
x
=
atomC
.
x
-
atomB
.
x
;
A
->
v1
.
y
=
atomC
.
y
-
atomB
.
y
;
A
->
v1
.
z
=
atomC
.
z
-
atomB
.
z
;
A
->
v2
.
x
=
atomC
.
x
-
atomD
.
x
;
A
->
v2
.
y
=
atomC
.
y
-
atomD
.
y
;
A
->
v2
.
z
=
atomC
.
z
-
atomD
.
z
;
float3
cp0
,
cp1
;
float
dihedralAngle
,
cosPhi
;
// printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
// printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
// printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS
(
A
->
v0
,
A
->
v1
,
A
->
v2
,
A
->
v0
,
cp0
,
cp1
,
dihedralAngle
,
cosPhi
);
if
(
dihedralAngle
<
0.0
f
)
{
dihedralAngle
+=
LOCAL_HACK_PI
;
}
else
{
dihedralAngle
-=
LOCAL_HACK_PI
;
}
cosPhi
=
-
cosPhi
;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
float4
dihedral1
=
cSim
.
pRbDihedralParameter1
[
pos1
];
float2
dihedral2
=
cSim
.
pRbDihedralParameter2
[
pos1
];
float
cosFactor
=
cosPhi
;
float
dEdAngle
=
-
dihedral1
.
y
;
/* E */
float
rb_energy
=
dihedral1
.
x
;
rb_energy
+=
dihedral1
.
y
*
cosFactor
;
// printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
dEdAngle
-=
2.0
f
*
dihedral1
.
z
*
cosFactor
;
// printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
3.0
f
*
dihedral1
.
w
*
cosFactor
;
rb_energy
+=
dihedral1
.
z
*
cosFactor
;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
4.0
f
*
dihedral2
.
x
*
cosFactor
;
rb_energy
+=
dihedral1
.
w
*
cosFactor
;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
5.0
f
*
dihedral2
.
y
*
cosFactor
;
rb_energy
+=
dihedral2
.
x
*
cosFactor
;
rb_energy
+=
dihedral2
.
y
*
cosFactor
*
cosPhi
;
/* E */
energy
+=
rb_energy
;
// printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
dEdAngle
*=
sinf
(
dihedralAngle
);
// printf("%4d - f: %9.4f\n", pos1, dEdAngle);
float
normCross1
=
DOT3
(
cp0
,
cp0
);
float
normBC
=
sqrtf
(
DOT3
(
A
->
v1
,
A
->
v1
));
float4
ff
;
ff
.
x
=
(
-
dEdAngle
*
normBC
)
/
normCross1
;
float
normCross2
=
DOT3
(
cp1
,
cp1
);
ff
.
w
=
(
dEdAngle
*
normBC
)
/
normCross2
;
float
dp
=
1.0
f
/
DOT3
(
A
->
v1
,
A
->
v1
);
ff
.
y
=
DOT3
(
A
->
v0
,
A
->
v1
)
*
dp
;
ff
.
z
=
DOT3
(
A
->
v2
,
A
->
v1
)
*
dp
;
int4
atom2
=
cSim
.
pRbDihedralID2
[
pos1
];
float3
internalF0
;
float3
internalF3
;
float3
s
;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned
int
offset
=
atom1
.
x
+
atom2
.
x
*
cSim
.
stride
;
float4
force
=
cSim
.
pForce4
[
offset
];
internalF0
.
x
=
ff
.
x
*
cp0
.
x
;
force
.
x
+=
internalF0
.
x
;
internalF0
.
y
=
ff
.
x
*
cp0
.
y
;
force
.
y
+=
internalF0
.
y
;
internalF0
.
z
=
ff
.
x
*
cp0
.
z
;
force
.
z
+=
internalF0
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
w
+
atom2
.
w
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
internalF3
.
x
=
ff
.
w
*
cp1
.
x
;
force
.
x
+=
internalF3
.
x
;
internalF3
.
y
=
ff
.
w
*
cp1
.
y
;
force
.
y
+=
internalF3
.
y
;
internalF3
.
z
=
ff
.
w
*
cp1
.
z
;
force
.
z
+=
internalF3
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s
.
x
=
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
;
s
.
y
=
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
;
s
.
z
=
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
;
offset
=
atom1
.
y
+
atom2
.
y
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF0
.
x
+
s
.
x
;
force
.
y
+=
-
internalF0
.
y
+
s
.
y
;
force
.
z
+=
-
internalF0
.
z
+
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
z
+
atom2
.
z
*
cSim
.
stride
;
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF3
.
x
-
s
.
x
;
force
.
y
+=
-
internalF3
.
y
-
s
.
y
;
force
.
z
+=
-
internalF3
.
z
-
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrtf
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
/* E */
energy
+=
LJ14
.
x
*
(
sig6
-
1.0
f
)
*
sig6
;
energy
+=
LJ14
.
z
*
inverseR
;
dEdR
+=
LJ14
.
z
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
void
kCalculateLocalForces
(
gpuContext
gpu
)
{
// printf("kCalculateLocalForces\n");
kCalculateLocalForces_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
localForces_threads_per_block
,
gpu
->
sim
.
localForces_threads_per_block
*
sizeof
(
Vectors
)
>>>
();
LAUNCHERROR
(
"kCalculateLocalForces"
);
}
platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
struct
Atom
{
float
x
;
float
y
;
float
z
;
float
r
;
float
sr
;
float
sum
;
float
padding
;
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateObcGbsaBornSumSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaBornSum.h"
__global__
__launch_bounds__
(
384
,
1
)
void
kReduceObcGbsaBornSum_kernel
()
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
while
(
pos
<
cSim
.
atoms
)
{
float
sum
=
0.0
f
;
float
*
pSt
=
cSim
.
pBornSum
+
pos
;
float2
atom
=
cSim
.
pObcData
[
pos
];
// Get summed Born data
for
(
int
i
=
0
;
i
<
cSim
.
nonbondOutputBuffers
;
i
++
)
{
sum
+=
*
pSt
;
pSt
+=
cSim
.
stride
;
}
// Now calculate Born radius and OBC term.
sum
*=
0.5
f
*
atom
.
x
;
float
sum2
=
sum
*
sum
;
float
sum3
=
sum
*
sum2
;
float
tanhSum
=
tanh
(
cSim
.
alphaOBC
*
sum
-
cSim
.
betaOBC
*
sum2
+
cSim
.
gammaOBC
*
sum3
);
float
nonOffsetRadii
=
atom
.
x
+
cSim
.
dielectricOffset
;
float
bornRadius
=
1.0
f
/
(
1.0
f
/
atom
.
x
-
tanhSum
/
nonOffsetRadii
);
float
obcChain
=
atom
.
x
*
(
cSim
.
alphaOBC
-
2.0
f
*
cSim
.
betaOBC
*
sum
+
3.0
f
*
cSim
.
gammaOBC
*
sum2
);
obcChain
=
(
1.0
f
-
tanhSum
*
tanhSum
)
*
obcChain
/
nonOffsetRadii
;
cSim
.
pBornRadii
[
pos
]
=
bornRadius
;
cSim
.
pObcChain
[
pos
]
=
obcChain
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
void
OPENMMCUDA_EXPORT
kReduceObcGbsaBornSum
(
gpuContext
gpu
)
{
kReduceObcGbsaBornSum_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
gpu
->
bRecalculateBornRadii
=
false
;
LAUNCHERROR
(
"kReduceObcGbsaBornSum"
);
}
void
kPrintObc
(
gpuContext
gpu
,
std
::
string
callId
,
int
call
,
FILE
*
log
)
{
gpu
->
psObcData
->
Download
();
gpu
->
psBornRadii
->
Download
();
gpu
->
psObcChain
->
Download
();
gpu
->
psBornForce
->
Download
();
gpu
->
psPosq4
->
Download
();
gpu
->
psSigEps2
->
Download
();
(
void
)
fprintf
(
log
,
"kPrintObc Cuda bCh bR bF prm[2] sigeps[2]
\n
"
);
(
void
)
fprintf
(
stderr
,
"bOutputWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u
\n
"
,
gpu
->
bOutputBufferPerWarp
,
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
gpu
->
sim
.
workUnits
,
gpu
->
psWorkUnit
->
_pSysStream
[
0
][
0
],
static_cast
<
unsigned
int
>
(
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
)
);
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
sim
.
paddedNumberOfAtoms
;
ii
++
){
(
void
)
fprintf
(
log
,
"%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e
\n
"
,
ii
,
gpu
->
psObcChain
->
_pSysData
[
ii
],
gpu
->
psBornRadii
->
_pSysData
[
ii
],
gpu
->
psBornForce
->
_pSysData
[
ii
],
gpu
->
psObcData
->
_pSysData
[
ii
].
x
,
gpu
->
psObcData
->
_pSysData
[
ii
].
y
,
gpu
->
psSigEps2
->
_pSysData
[
ii
].
x
,
gpu
->
psSigEps2
->
_pSysData
[
ii
].
y
);
}
}
void
OPENMMCUDA_EXPORT
kCalculateObcGbsaBornSum
(
gpuContext
gpu
)
{
// printf("kCalculateObcgbsaBornSum\n");
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
else
kCalculateObcGbsaN2BornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
break
;
case
CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaCutoffByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateObcGbsaCutoffBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
case
PERIODIC
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateObcGbsaPeriodicBornSum_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float
))
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
}
LAUNCHERROR
(
"kCalculateBornSum"
);
}
platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for calculating Born sums. It is included
* several times in kCalculateObcGbsaBornSum.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_NONBOND_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_NONBOND_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateObcGbsa
,
BornSum_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
volatile
Atom
sA
[];
unsigned
int
totalWarps
=
cSim
.
nonbond_blocks
*
cSim
.
nonbond_threads_per_block
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
#ifdef USE_CUTOFF
volatile
float
*
tempBuffer
=
(
volatile
float
*
)
&
sA
[
cSim
.
nonbond_threads_per_block
];
#endif
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
float
dx
;
float
dy
;
float
dz
;
float
r2
;
float
r
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
tj
=
tgx
;
volatile
Atom
*
psA
=
&
sA
[
tbx
];
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
r
=
ar
.
x
;
sA
[
threadIdx
.
x
].
sr
=
ar
.
y
;
apos
.
w
=
0
.
0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#if defined USE_CUTOFF
if
(
i
<
cSim
.
atoms
&&
x
+
j
<
cSim
.
atoms
&&
r2
<
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
<
cSim
.
atoms
&&
x
+
j
<
cSim
.
atoms
)
#endif
{
r
=
sqrtf
(
r2
);
float
rInverse
=
1
.
0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
if
((
j
!=
tgx
)
&&
(
ar
.
x
<
rScaledRadiusJ
))
{
float
l_ij
=
1
.
0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1
.
0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
logf
(
u_ij
/
l_ij
);
apos
.
w
+=
l_ij
-
u_ij
+
0
.
25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0
.
50
f
*
rInverse
*
ratio
)
+
(
0
.
25
f
*
psA
[
j
].
sr
*
psA
[
j
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
float
rj
=
psA
[
j
].
sr
;
if
(
ar
.
x
<
(
rj
-
r
))
{
apos
.
w
+=
2
.
0
f
*
((
1
.
0
f
/
ar
.
x
)
-
l_ij
);
}
}
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
+=
apos
.
w
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
#endif
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned
int
j
=
y
+
tgx
;
unsigned
int
i
=
x
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
float4
apos
=
cSim
.
pPosq
[
i
];
// Local atom x, y, z, sum
float2
ar
=
cSim
.
pObcData
[
i
];
// Local atom vr, sr
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
sA
[
threadIdx
.
x
].
sum
=
apos
.
w
=
0
.
0
f
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
if
(
flags
==
0xFFFFFFFF
)
#endif
{
// Compute all interactions within this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
dx
=
psA
[
tj
].
x
-
apos
.
x
;
dy
=
psA
[
tj
].
y
-
apos
.
y
;
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#ifdef USE_CUTOFF
if
(
i
<
cSim
.
atoms
&&
y
+
tj
<
cSim
.
atoms
&&
r2
<
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
<
cSim
.
atoms
&&
y
+
tj
<
cSim
.
atoms
)
#endif
{
r
=
sqrtf
(
r2
);
float
rInverse
=
1
.
0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
if
(
ar
.
x
<
rScaledRadiusJ
)
{
float
l_ij
=
1
.
0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
u_ij
=
1
.
0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
logf
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0
.
25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0
.
50
f
*
rInverse
*
ratio
)
+
(
0
.
25
f
*
psA
[
tj
].
sr
*
psA
[
tj
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
float
srj
=
psA
[
tj
].
sr
;
if
(
ar
.
x
<
(
srj
-
r
))
{
term
+=
2
.
0
f
*
((
1
.
0
f
/
ar
.
x
)
-
l_ij
);
}
apos
.
w
+=
term
;
}
float
rScaledRadiusI
=
r
+
ar
.
y
;
if
(
psA
[
tj
].
r
<
rScaledRadiusI
)
{
float
l_ij
=
1
.
0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
ar
.
y
));
float
u_ij
=
1
.
0
f
/
rScaledRadiusI
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
logf
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0
.
25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0
.
50
f
*
rInverse
*
ratio
)
+
(
0
.
25
f
*
ar
.
y
*
ar
.
y
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
float
rj
=
psA
[
tj
].
r
;
if
(
rj
<
(
ar
.
y
-
r
))
{
term
+=
2
.
0
f
*
((
1
.
0
f
/
psA
[
tj
].
r
)
-
l_ij
);
}
psA
[
tj
].
sum
+=
term
;
}
}
tj
=
(
tj
-
1
)
&
(
GRID
-
1
);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
tempBuffer
[
threadIdx
.
x
]
=
0
.
0
f
;
dx
=
psA
[
j
].
x
-
apos
.
x
;
dy
=
psA
[
j
].
y
-
apos
.
y
;
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
#ifdef USE_CUTOFF
if
(
i
<
cSim
.
atoms
&&
y
+
j
<
cSim
.
atoms
&&
r2
<
cSim
.
nonbondedCutoffSqr
)
#else
if
(
i
<
cSim
.
atoms
&&
y
+
j
<
cSim
.
atoms
)
#endif
{
r
=
sqrtf
(
r2
);
float
rInverse
=
1
.
0
f
/
r
;
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
if
(
ar
.
x
<
rScaledRadiusJ
)
{
float
l_ij
=
1
.
0
f
/
max
(
ar
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1
.
0
f
/
rScaledRadiusJ
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
logf
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0
.
25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0
.
50
f
*
rInverse
*
ratio
)
+
(
0
.
25
f
*
psA
[
j
].
sr
*
psA
[
j
].
sr
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
float
srj
=
psA
[
j
].
sr
;
if
(
ar
.
x
<
(
srj
-
r
))
{
term
+=
2
.
0
f
*
((
1
.
0
f
/
ar
.
x
)
-
l_ij
);
}
apos
.
w
+=
term
;
}
float
rScaledRadiusI
=
r
+
ar
.
y
;
if
(
psA
[
j
].
r
<
rScaledRadiusI
)
{
float
l_ij
=
1
.
0
f
/
max
(
psA
[
j
].
r
,
fabs
(
r
-
ar
.
y
));
float
u_ij
=
1
.
0
f
/
rScaledRadiusI
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
ratio
=
logf
(
u_ij
/
l_ij
);
float
term
=
l_ij
-
u_ij
+
0
.
25
f
*
r
*
(
u_ij2
-
l_ij2
)
+
(
0
.
50
f
*
rInverse
*
ratio
)
+
(
0
.
25
f
*
ar
.
y
*
ar
.
y
*
rInverse
)
*
(
l_ij2
-
u_ij2
);
float
rj
=
psA
[
j
].
r
;
if
(
rj
<
(
ar
.
y
-
r
))
{
term
+=
2
.
0
f
*
((
1
.
0
f
/
psA
[
j
].
r
)
-
l_ij
);
}
tempBuffer
[
threadIdx
.
x
]
=
term
;
}
}
// Sum the terms.
if
(
tgx
%
2
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
];
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
2
];
if
(
tgx
%
8
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
4
];
if
(
tgx
%
16
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
8
];
if
(
tgx
==
0
)
psA
[
j
].
sum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
16
];
}
}
}
#endif
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
+=
apos
.
w
;
offset
=
y
+
tgx
+
warp
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
+=
sA
[
threadIdx
.
x
].
sum
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
apos
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pBornSum
[
offset
]
=
sA
[
threadIdx
.
x
].
sum
;
#endif
}
pos
++
;
}
}
platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudaKernels.h"
struct
Atom
{
float
x
;
float
y
;
float
z
;
float
r
;
float
sr
;
float
fx
;
float
fy
;
float
fz
;
float
fb
;
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaForces2.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaForces2.h"
void
OPENMMCUDA_EXPORT
kCalculateObcGbsaForces2
(
gpuContext
gpu
)
{
//printf("kCalculateObcGbsaForces2\n");
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
case
NO_CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaN2ByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
else
kCalculateObcGbsaN2Forces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pWorkUnit
);
break
;
case
CUTOFF
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaCutoffByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateObcGbsaCutoffForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
case
PERIODIC
:
if
(
gpu
->
bOutputBufferPerWarp
)
kCalculateObcGbsaPeriodicByWarpForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
else
kCalculateObcGbsaPeriodicForces2_kernel
<<<
gpu
->
sim
.
bornForce2_blocks
,
gpu
->
sim
.
bornForce2_threads_per_block
,
(
sizeof
(
Atom
)
+
sizeof
(
float3
))
*
gpu
->
sim
.
bornForce2_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
break
;
}
LAUNCHERROR
(
"kCalculateObcGbsaForces2"
);
//kPrintObc( gpu, "Post kCalculateObcGbsaForces2", 0, stderr );
}
platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evalauating the second stage of GBSA. It is included
* several times in kCalculateObcGbsaForces2.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_BORNFORCE2_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_BORNFORCE2_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_BORNFORCE2_THREADS_PER_BLOCK
,
1
)
#endif
void
METHOD_NAME
(
kCalculateObcGbsa
,
Forces2_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
volatile
Atom
sA
[];
unsigned
int
totalWarps
=
cSim
.
bornForce2_blocks
*
cSim
.
bornForce2_threads_per_block
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
#ifdef USE_CUTOFF
volatile
float3
*
tempBuffer
=
(
volatile
float3
*
)
&
sA
[
cSim
.
bornForce2_threads_per_block
];
#endif
unsigned
int
lasty
=
-
0xFFFFFFFF
;
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pObcData
[
i
];
float
fb
=
cSim
.
pBornForce
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
tj
=
tgx
;
volatile
Atom
*
psA
=
&
sA
[
tbx
];
float3
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0
.
0
f
;
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
r
=
a
.
x
;
sA
[
threadIdx
.
x
].
sr
=
a
.
y
;
sA
[
threadIdx
.
x
].
fb
=
fb
;
for
(
unsigned
int
j
=
(
tgx
+
1
)
&
(
GRID
-
1
);
j
!=
tgx
;
j
=
(
j
+
1
)
&
(
GRID
-
1
))
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrtf
(
r2
);
// Atom I Born forces and sum
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
float
l_ij
=
1
.
0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
u_ij
=
1
.
0
f
/
rScaledRadiusJ
;
float
rInverse
=
1
.
0
f
/
r
;
float
l_ij2
=
l_ij
*
l_ij
;
float
u_ij2
=
u_ij
*
u_ij
;
float
r2Inverse
=
rInverse
*
rInverse
;
float
t1
=
logf
(
u_ij
/
l_ij
);
float
t2
=
(
l_ij2
-
u_ij2
);
float
t3
=
t2
*
rInverse
;
t1
*=
rInverse
;
// Born Forces term
float
term
=
0
.
125
f
*
(
1
.
000
f
+
psA
[
j
].
sr
*
psA
[
j
].
sr
*
r2Inverse
)
*
t3
+
0
.
250
f
*
t1
*
r2Inverse
;
float
dE
=
fb
*
term
;
#if defined USE_PERIODIC
if
(
a
.
x
>=
rScaledRadiusJ
||
i
>=
cSim
.
atoms
||
x
+
j
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#elif defined USE_CUTOFF
if
(
a
.
x
>=
rScaledRadiusJ
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
a
.
x
>=
rScaledRadiusJ
||
i
>=
cSim
.
atoms
||
x
+
j
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
j
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
j
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
j
].
fz
+=
d
;
}
// Write results
float4
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
+
sA
[
threadIdx
.
x
].
fx
;
of
.
y
+=
af
.
y
+
sA
[
threadIdx
.
x
].
fy
;
of
.
z
+=
af
.
z
+
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4
[
offset
]
=
of
;
}
else
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pObcData
[
j
];
sA
[
threadIdx
.
x
].
fb
=
cSim
.
pBornForce
[
j
];
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
r
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
sr
=
temp1
.
y
;
}
float
sr2
=
a
.
y
*
a
.
y
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
if
(
flags
==
0xFFFFFFFF
)
#endif
{
// Compute all interactions within this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrtf
(
r2
);
// Interleaved Atom I and J Born Forces and sum components
float
r2Inverse
=
1
.
0
f
/
r2
;
float
rScaledRadiusJ
=
r
+
psA
[
tj
].
sr
;
float
rScaledRadiusI
=
r
+
a
.
y
;
float
rInverse
=
1
.
0
f
/
r
;
float
l_ijJ
=
1
.
0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
tj
].
sr
));
float
l_ijI
=
1
.
0
f
/
max
(
psA
[
tj
].
r
,
fabs
(
r
-
a
.
y
));
float
u_ijJ
=
1
.
0
f
/
rScaledRadiusJ
;
float
u_ijI
=
1
.
0
f
/
rScaledRadiusI
;
float
l_ij2J
=
l_ijJ
*
l_ijJ
;
float
l_ij2I
=
l_ijI
*
l_ijI
;
float
u_ij2J
=
u_ijJ
*
u_ijJ
;
float
u_ij2I
=
u_ijI
*
u_ijI
;
float
t1J
=
logf
(
u_ijJ
/
l_ijJ
);
float
t1I
=
logf
(
u_ijI
/
l_ijI
);
float
t2J
=
(
l_ij2J
-
u_ij2J
);
float
t2I
=
(
l_ij2I
-
u_ij2I
);
float
t3J
=
t2J
*
rInverse
;
float
t3I
=
t2I
*
rInverse
;
t1J
*=
rInverse
;
t1I
*=
rInverse
;
// Born Forces term
float
term
=
0
.
125
f
*
(
1
.
000
f
+
psA
[
tj
].
sr
*
psA
[
tj
].
sr
*
r2Inverse
)
*
t3J
+
0
.
250
f
*
t1J
*
r2Inverse
;
float
dE
=
fb
*
term
;
#if defined USE_PERIODIC
if
(
a
.
x
>=
rScaledRadiusJ
||
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#elif defined USE_CUTOFF
if
(
a
.
x
>=
rScaledRadiusJ
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
a
.
x
>=
rScaledRadiusJ
||
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
psA
[
tj
].
fx
+=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
psA
[
tj
].
fy
+=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
psA
[
tj
].
fz
+=
d
;
// Atom J Born sum term
term
=
0
.
125
f
*
(
1
.
000
f
+
sr2
*
r2Inverse
)
*
t3I
+
0
.
250
f
*
t1I
*
r2Inverse
;
dE
=
psA
[
tj
].
fb
*
term
;
float
rj
=
psA
[
tj
].
r
;
#ifdef USE_PERIODIC
if
(
rj
>=
rScaledRadiusI
||
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#elif defined USE_CUTOFF
if
(
rj
>=
rScaledRadiusI
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
rj
>=
rScaledRadiusI
||
i
>=
cSim
.
atoms
||
y
+
tj
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
dx
*=
dE
;
dy
*=
dE
;
dz
*=
dE
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrtf
(
r2
);
// Interleaved Atom I and J Born Forces and sum components
float
r2Inverse
=
1
.
0
f
/
r2
;
float
rScaledRadiusJ
=
r
+
psA
[
j
].
sr
;
float
rScaledRadiusI
=
r
+
a
.
y
;
float
rInverse
=
1
.
0
f
/
r
;
float
l_ijJ
=
1
.
0
f
/
max
(
a
.
x
,
fabs
(
r
-
psA
[
j
].
sr
));
float
l_ijI
=
1
.
0
f
/
max
(
psA
[
j
].
r
,
fabs
(
r
-
a
.
y
));
float
u_ijJ
=
1
.
0
f
/
rScaledRadiusJ
;
float
u_ijI
=
1
.
0
f
/
rScaledRadiusI
;
float
l_ij2J
=
l_ijJ
*
l_ijJ
;
float
l_ij2I
=
l_ijI
*
l_ijI
;
float
u_ij2J
=
u_ijJ
*
u_ijJ
;
float
u_ij2I
=
u_ijI
*
u_ijI
;
float
t1J
=
logf
(
u_ijJ
/
l_ijJ
);
float
t1I
=
logf
(
u_ijI
/
l_ijI
);
float
t2J
=
(
l_ij2J
-
u_ij2J
);
float
t2I
=
(
l_ij2I
-
u_ij2I
);
float
t3J
=
t2J
*
rInverse
;
float
t3I
=
t2I
*
rInverse
;
t1J
*=
rInverse
;
t1I
*=
rInverse
;
// Born Forces term
float
term
=
0
.
125
f
*
(
1
.
000
f
+
psA
[
j
].
sr
*
psA
[
j
].
sr
*
r2Inverse
)
*
t3J
+
0
.
250
f
*
t1J
*
r2Inverse
;
float
dE
=
fb
*
term
;
#if defined USE_PERIODIC
if
(
a
.
x
>=
rScaledRadiusJ
||
i
>=
cSim
.
atoms
||
y
+
j
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#elif defined USE_CUTOFF
if
(
a
.
x
>=
rScaledRadiusJ
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
a
.
x
>=
rScaledRadiusJ
||
i
>=
cSim
.
atoms
||
y
+
j
>=
cSim
.
atoms
)
#endif
{
dE
=
0
.
0
f
;
}
float
d
=
dx
*
dE
;
af
.
x
-=
d
;
tempBuffer
[
threadIdx
.
x
].
x
=
d
;
d
=
dy
*
dE
;
af
.
y
-=
d
;
tempBuffer
[
threadIdx
.
x
].
y
=
d
;
d
=
dz
*
dE
;
af
.
z
-=
d
;
tempBuffer
[
threadIdx
.
x
].
z
=
d
;
// Atom J Born sum term
term
=
0
.
125
f
*
(
1
.
000
f
+
sr2
*
r2Inverse
)
*
t3I
+
0
.
250
f
*
t1I
*
r2Inverse
;
dE
=
psA
[
j
].
fb
*
term
;
float
rj
=
psA
[
j
].
r
;
#ifdef USE_PERIODIC
if
(
rj
>=
rScaledRadiusI
||
i
>=
cSim
.
atoms
||
y
+
j
>=
cSim
.
atoms
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#elif defined USE_CUTOFF
if
(
rj
>=
rScaledRadiusI
||
r2
>
cSim
.
nonbondedCutoffSqr
)
#else
if
(
rj
>=
rScaledRadiusI
)
#endif
{
dE
=
0
.
0
f
;
}
dx
*=
dE
;
dy
*=
dE
;
dz
*=
dE
;
tempBuffer
[
threadIdx
.
x
].
x
+=
dx
;
tempBuffer
[
threadIdx
.
x
].
y
+=
dy
;
tempBuffer
[
threadIdx
.
x
].
z
+=
dz
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
// Sum the forces on atom j.
if
(
tgx
%
2
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
1
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
1
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
1
].
z
;
}
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
2
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
2
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
2
].
z
;
}
if
(
tgx
%
8
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
4
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
4
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
4
].
z
;
}
if
(
tgx
%
16
==
0
)
{
tempBuffer
[
threadIdx
.
x
].
x
+=
tempBuffer
[
threadIdx
.
x
+
8
].
x
;
tempBuffer
[
threadIdx
.
x
].
y
+=
tempBuffer
[
threadIdx
.
x
+
8
].
y
;
tempBuffer
[
threadIdx
.
x
].
z
+=
tempBuffer
[
threadIdx
.
x
+
8
].
z
;
}
if
(
tgx
==
0
)
{
psA
[
j
].
fx
+=
tempBuffer
[
threadIdx
.
x
].
x
+
tempBuffer
[
threadIdx
.
x
+
16
].
x
;
psA
[
j
].
fy
+=
tempBuffer
[
threadIdx
.
x
].
y
+
tempBuffer
[
threadIdx
.
x
+
16
].
y
;
psA
[
j
].
fz
+=
tempBuffer
[
threadIdx
.
x
].
z
+
tempBuffer
[
threadIdx
.
x
+
16
].
z
;
}
}
}
}
#endif
// Write results
float4
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp
*
cSim
.
stride
;
#else
unsigned
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
af
.
x
;
of
.
y
+=
af
.
y
;
of
.
z
+=
af
.
z
;
cSim
.
pForce4
[
offset
]
=
of
;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset
=
y
+
tgx
+
warp
*
cSim
.
stride
;
#else
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
#endif
of
=
cSim
.
pForce4
[
offset
];
of
.
x
+=
sA
[
threadIdx
.
x
].
fx
;
of
.
y
+=
sA
[
threadIdx
.
x
].
fy
;
of
.
z
+=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4
[
offset
]
=
of
;
}
lasty
=
y
;
pos
++
;
}
}
platforms/cuda-old/src/kernels/kCalculatePME.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "bbsort.h"
#include <cuda.h>
using
namespace
std
;
static
__constant__
cudaGmxSimulation
cSim
;
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
void
SetCalculatePMESim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetCalculatePMESim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
texture
<
float4
,
1
,
cudaReadModeElementType
>
bsplineThetaRef
;
inline
__host__
__device__
int
fast_mod
(
int
a
,
int
b
)
{
return
(
b
&
(
b
-
1
))
?
a
%
b
:
a
&
(
b
-
1
);
}
inline
__host__
__device__
float4
make_float4
(
float
s
)
{
return
make_float4
(
s
,
s
,
s
,
s
);
}
inline
__host__
__device__
float4
operator
-
(
float4
&
a
)
{
return
make_float4
(
-
a
.
x
,
-
a
.
y
,
-
a
.
z
,
-
a
.
w
);
}
inline
__host__
__device__
float4
operator
-
(
float4
a
,
float4
b
)
{
return
make_float4
(
a
.
x
-
b
.
x
,
a
.
y
-
b
.
y
,
a
.
z
-
b
.
z
,
a
.
w
-
b
.
w
);
}
inline
__host__
__device__
float4
operator
+
(
float4
a
,
float4
b
)
{
return
make_float4
(
a
.
x
+
b
.
x
,
a
.
y
+
b
.
y
,
a
.
z
+
b
.
z
,
a
.
w
+
b
.
w
);
}
inline
__host__
__device__
float4
operator
+
(
float4
a
,
float
b
)
{
return
make_float4
(
a
.
x
+
b
,
a
.
y
+
b
,
a
.
z
+
b
,
a
.
w
+
b
);
}
inline
__host__
__device__
float4
operator
+
(
float
a
,
float4
b
)
{
return
make_float4
(
a
+
b
.
x
,
a
+
b
.
y
,
a
+
b
.
z
,
a
+
b
.
w
);
}
inline
__host__
__device__
float4
operator
*
(
float
s
,
float4
a
)
{
return
make_float4
(
a
.
x
*
s
,
a
.
y
*
s
,
a
.
z
*
s
,
a
.
w
*
s
);
}
inline
__host__
__device__
float4
operator
*
(
float4
a
,
float4
b
)
{
return
make_float4
(
a
.
x
*
b
.
x
,
a
.
y
*
b
.
y
,
a
.
z
*
b
.
z
,
a
.
w
+
b
.
w
);
}
inline
__host__
__device__
float4
make_float4
(
int3
a
)
{
return
make_float4
((
float
)
a
.
x
,
(
float
)
a
.
y
,
(
float
)
a
.
z
,
0
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kUpdateBsplines_kernel
()
{
unsigned
int
tnb
=
blockDim
.
x
*
gridDim
.
x
;
unsigned
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
extern
__shared__
float4
bsplines_cache
[];
// size = 2 * block_size * pme_order
const
float4
div_o
=
make_float4
(
1.0
f
/
(
PME_ORDER
-
1
));
for
(
int
i
=
tid
;
i
<
cSim
.
atoms
;
i
+=
tnb
)
{
float4
*
data
=
&
bsplines_cache
[
threadIdx
.
x
*
PME_ORDER
];
float4
*
ddata
=
&
bsplines_cache
[
threadIdx
.
x
*
PME_ORDER
+
blockDim
.
x
*
PME_ORDER
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
data
[
j
]
=
make_float4
(
0.0
f
);
ddata
[
j
]
=
make_float4
(
0.0
f
);
}
float4
posq
=
cSim
.
pPosq
[
i
];
posq
.
x
-=
floorf
(
posq
.
x
*
cSim
.
invPeriodicBoxSizeX
)
*
cSim
.
periodicBoxSizeX
;
posq
.
y
-=
floorf
(
posq
.
y
*
cSim
.
invPeriodicBoxSizeY
)
*
cSim
.
periodicBoxSizeY
;
posq
.
z
-=
floorf
(
posq
.
z
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
periodicBoxSizeZ
;
float3
t
=
make_float3
((
posq
.
x
*
cSim
.
invPeriodicBoxSizeX
)
*
cSim
.
pmeGridSize
.
x
,
(
posq
.
y
*
cSim
.
invPeriodicBoxSizeY
)
*
cSim
.
pmeGridSize
.
y
,
(
posq
.
z
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
pmeGridSize
.
z
);
float4
dr
=
make_float4
(
t
.
x
-
(
int
)
t
.
x
,
t
.
y
-
(
int
)
t
.
y
,
t
.
z
-
(
int
)
t
.
z
,
0.0
f
);
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
cSim
.
pmeGridSize
.
x
,
((
int
)
t
.
y
)
%
cSim
.
pmeGridSize
.
y
,
((
int
)
t
.
z
)
%
cSim
.
pmeGridSize
.
z
);
cSim
.
pPmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
gridIndex
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
z
);
data
[
PME_ORDER
-
1
]
=
make_float4
(
0.0
f
);
data
[
1
]
=
dr
;
data
[
0
]
=
make_float4
(
1.0
f
)
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
float
div
=
1.0
f
/
((
float
)
j
-
1.0
f
);
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
(
j
-
1
);
k
++
)
{
data
[
j
-
k
-
1
]
=
div
*
(
(
dr
+
float
(
k
))
*
data
[
j
-
k
-
2
]
+
(
-
dr
+
((
float
)(
j
-
k
)))
*
data
[
j
-
k
-
1
]);
}
data
[
0
]
=
div
*
(
-
dr
+
1
)
*
data
[
0
];
}
ddata
[
0
]
=
-
data
[
0
];
for
(
int
j
=
1
;
j
<
PME_ORDER
;
j
++
)
ddata
[
j
]
=
data
[
j
-
1
]
-
data
[
j
];
data
[
PME_ORDER
-
1
]
=
div_o
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
{
data
[
PME_ORDER
-
j
-
1
]
=
div_o
*
(
(
dr
+
(
float
)
j
)
*
data
[
PME_ORDER
-
j
-
2
]
+
(
-
dr
+
((
float
)(
PME_ORDER
-
j
)))
*
data
[
PME_ORDER
-
j
-
1
]
);
}
data
[
0
]
=
div_o
*
(
-
dr
+
1.0
f
)
*
data
[
0
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
data
[
j
].
w
=
posq
.
w
;
// Storing the charge here improves cache coherency in the charge spreading kernel
cSim
.
pPmeBsplineTheta
[
i
+
j
*
cSim
.
atoms
]
=
data
[
j
];
cSim
.
pPmeBsplineDtheta
[
i
+
j
*
cSim
.
atoms
]
=
ddata
[
j
];
}
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kFindAtomRangeForGrid_kernel
()
{
int
thread
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
start
=
(
cSim
.
atoms
*
thread
)
/
(
blockDim
.
x
*
gridDim
.
x
);
int
end
=
(
cSim
.
atoms
*
(
thread
+
1
))
/
(
blockDim
.
x
*
gridDim
.
x
);
int
last
=
(
start
==
0
?
-
1
:
cSim
.
pPmeAtomGridIndex
[
start
-
1
].
y
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
gridIndex
=
atomData
.
y
;
if
(
gridIndex
!=
last
)
{
for
(
int
j
=
last
+
1
;
j
<=
gridIndex
;
++
j
)
cSim
.
pPmeAtomRange
[
j
]
=
i
;
last
=
gridIndex
;
}
}
// Fill in values beyond the last atom.
if
(
thread
==
blockDim
.
x
*
gridDim
.
x
-
1
)
{
int
gridSize
=
cSim
.
pmeGridSize
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
for
(
int
j
=
last
+
1
;
j
<=
gridSize
;
++
j
)
cSim
.
pPmeAtomRange
[
j
]
=
cSim
.
atoms
;
}
}
/**
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kRecordZIndex_kernel
()
{
int
thread
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
start
=
(
cSim
.
atoms
*
thread
)
/
(
blockDim
.
x
*
gridDim
.
x
);
int
end
=
(
cSim
.
atoms
*
(
thread
+
1
))
/
(
blockDim
.
x
*
gridDim
.
x
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
float
posz
=
cSim
.
pPosq
[
atomData
.
x
].
z
;
posz
-=
floorf
(
posz
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
periodicBoxSizeZ
;
int
z
=
((
int
)
((
posz
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
pmeGridSize
.
z
))
%
cSim
.
pmeGridSize
.
z
;
cSim
.
pPmeAtomGridIndex
[
i
].
y
=
z
;
}
}
__global__
void
kGridSpreadCharge_kernel
()
{
unsigned
int
numGridPoints
=
cSim
.
pmeGridSize
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
int3
gridPoint
;
gridPoint
.
x
=
gridIndex
/
(
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
);
int
remainder
=
gridIndex
-
gridPoint
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
gridPoint
.
y
=
remainder
/
cSim
.
pmeGridSize
.
z
;
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
cSim
.
pmeGridSize
.
z
;
float
result
=
0.0
f
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
cSim
.
pmeGridSize
.
x
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
cSim
.
pmeGridSize
.
y
);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
z1
+=
(
z1
>=
0
?
0
:
cSim
.
pmeGridSize
.
z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
cSim
.
pmeGridSize
.
z
-
1
);
int
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z1
;
int
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z2
;
int
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
int
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
w
;
result
+=
atomCharge
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
x
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iy
*
cSim
.
atoms
).
y
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iz
*
cSim
.
atoms
).
z
;
}
if
(
z1
>
gridPoint
.
z
)
{
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
;
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
gridPoint
.
z
;
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
w
;
result
+=
atomCharge
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
x
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iy
*
cSim
.
atoms
).
y
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iz
*
cSim
.
atoms
).
z
;
}
}
}
}
cSim
.
pPmeGrid
[
gridIndex
]
=
make_cuComplex
(
result
*
sqrtf
(
cSim
.
epsfac
),
0.0
f
);
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kReciprocalConvolution_kernel
()
{
const
unsigned
int
gridSize
=
cSim
.
pmeGridSize
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
float
expFactor
=
LOCAL_HACK_PI
*
LOCAL_HACK_PI
/
(
cSim
.
alphaEwald
*
cSim
.
alphaEwald
);
float
scaleFactor
=
1.0
/
(
LOCAL_HACK_PI
*
cSim
.
periodicBoxSizeX
*
cSim
.
periodicBoxSizeY
*
cSim
.
periodicBoxSizeZ
);
float
energy
=
0.0
f
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
kx
=
index
/
(
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
);
int
remainder
=
index
-
kx
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
int
ky
=
remainder
/
cSim
.
pmeGridSize
.
z
;
int
kz
=
remainder
-
ky
*
cSim
.
pmeGridSize
.
z
;
if
(
kx
==
0
&&
ky
==
0
&&
kz
==
0
)
continue
;
int
mx
=
(
kx
<
(
cSim
.
pmeGridSize
.
x
+
1
)
/
2
)
?
kx
:
(
kx
-
cSim
.
pmeGridSize
.
x
);
int
my
=
(
ky
<
(
cSim
.
pmeGridSize
.
y
+
1
)
/
2
)
?
ky
:
(
ky
-
cSim
.
pmeGridSize
.
y
);
int
mz
=
(
kz
<
(
cSim
.
pmeGridSize
.
z
+
1
)
/
2
)
?
kz
:
(
kz
-
cSim
.
pmeGridSize
.
z
);
float
mhx
=
mx
*
cSim
.
invPeriodicBoxSizeX
;
float
mhy
=
my
*
cSim
.
invPeriodicBoxSizeY
;
float
mhz
=
mz
*
cSim
.
invPeriodicBoxSizeZ
;
float
bx
=
cSim
.
pPmeBsplineModuli
[
0
][
kx
];
float
by
=
cSim
.
pPmeBsplineModuli
[
1
][
ky
];
float
bz
=
cSim
.
pPmeBsplineModuli
[
2
][
kz
];
cuComplex
grid
=
cSim
.
pPmeGrid
[
index
];
float
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
float
denom
=
m2
*
bx
*
by
*
bz
;
float
eterm
=
scaleFactor
*
exp
(
-
expFactor
*
m2
)
/
denom
;
cSim
.
pPmeGrid
[
index
]
=
make_cuComplex
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
energy
+=
eterm
*
(
grid
.
x
*
grid
.
x
+
grid
.
y
*
grid
.
y
);
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
energy
;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kGridInterpolateForce_kernel
()
{
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
cSim
.
atoms
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
float3
force
=
make_float3
(
0.0
f
,
0.0
f
,
0.0
f
);
float4
posq
=
cSim
.
pPosq
[
atom
];
posq
.
x
-=
floorf
(
posq
.
x
*
cSim
.
invPeriodicBoxSizeX
)
*
cSim
.
periodicBoxSizeX
;
posq
.
y
-=
floorf
(
posq
.
y
*
cSim
.
invPeriodicBoxSizeY
)
*
cSim
.
periodicBoxSizeY
;
posq
.
z
-=
floorf
(
posq
.
z
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
periodicBoxSizeZ
;
float3
t
=
make_float3
((
posq
.
x
*
cSim
.
invPeriodicBoxSizeX
)
*
cSim
.
pmeGridSize
.
x
,
(
posq
.
y
*
cSim
.
invPeriodicBoxSizeY
)
*
cSim
.
pmeGridSize
.
y
,
(
posq
.
z
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
pmeGridSize
.
z
);
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
cSim
.
pmeGridSize
.
x
,
((
int
)
t
.
y
)
%
cSim
.
pmeGridSize
.
y
,
((
int
)
t
.
z
)
%
cSim
.
pmeGridSize
.
z
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xindex
=
gridIndex
.
x
+
ix
;
xindex
-=
(
xindex
>=
cSim
.
pmeGridSize
.
x
?
cSim
.
pmeGridSize
.
x
:
0
);
float
tx
=
cSim
.
pPmeBsplineTheta
[
atom
+
ix
*
cSim
.
atoms
].
x
;
float
dtx
=
cSim
.
pPmeBsplineDtheta
[
atom
+
ix
*
cSim
.
atoms
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
yindex
=
gridIndex
.
y
+
iy
;
yindex
-=
(
yindex
>=
cSim
.
pmeGridSize
.
y
?
cSim
.
pmeGridSize
.
y
:
0
);
float
ty
=
cSim
.
pPmeBsplineTheta
[
atom
+
iy
*
cSim
.
atoms
].
y
;
float
dty
=
cSim
.
pPmeBsplineDtheta
[
atom
+
iy
*
cSim
.
atoms
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
cSim
.
pmeGridSize
.
z
?
cSim
.
pmeGridSize
.
z
:
0
);
float
tz
=
cSim
.
pPmeBsplineTheta
[
atom
+
iz
*
cSim
.
atoms
].
z
;
float
dtz
=
cSim
.
pPmeBsplineDtheta
[
atom
+
iz
*
cSim
.
atoms
].
z
;
int
index
=
xindex
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
yindex
*
cSim
.
pmeGridSize
.
z
+
zindex
;
float
gridvalue
=
cSim
.
pPmeGrid
[
index
].
x
;
force
.
x
+=
dtx
*
ty
*
tz
*
gridvalue
;
force
.
y
+=
tx
*
dty
*
tz
*
gridvalue
;
force
.
z
+=
tx
*
ty
*
dtz
*
gridvalue
;
}
}
}
float4
totalForce
=
cSim
.
pForce4
[
atom
];
float
q
=
posq
.
w
*
sqrtf
(
cSim
.
epsfac
);
totalForce
.
x
-=
q
*
force
.
x
*
cSim
.
pmeGridSize
.
x
*
cSim
.
invPeriodicBoxSizeX
;
totalForce
.
y
-=
q
*
force
.
y
*
cSim
.
pmeGridSize
.
y
*
cSim
.
invPeriodicBoxSizeY
;
totalForce
.
z
-=
q
*
force
.
z
*
cSim
.
pmeGridSize
.
z
*
cSim
.
invPeriodicBoxSizeZ
;
cSim
.
pForce4
[
atom
]
=
totalForce
;
}
}
void
kCalculatePME
(
gpuContext
gpu
)
{
// printf("kCalculatePME\n");
cudaChannelFormatDesc
channelDesc
=
cudaCreateChannelDesc
<
float4
>
();
cudaBindTexture
(
NULL
,
&
bsplineThetaRef
,
gpu
->
psPmeBsplineTheta
->
_pDevData
,
&
channelDesc
,
gpu
->
psPmeBsplineTheta
->
_length
*
sizeof
(
float4
));
unsigned
int
threads
=
16380
/
(
2
*
PME_ORDER
*
sizeof
(
float4
));
kUpdateBsplines_kernel
<<<
gpu
->
sim
.
blocks
,
threads
,
2
*
threads
*
PME_ORDER
*
sizeof
(
float4
)
>>>
();
LAUNCHERROR
(
"kUpdateBsplines"
);
bbSort
(
gpu
->
psPmeAtomGridIndex
->
_pDevData
,
gpu
->
natoms
);
kFindAtomRangeForGrid_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindAtomRangeForGrid"
);
kRecordZIndex_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kRecordZIndex"
);
kGridSpreadCharge_kernel
<<<
16
*
gpu
->
sim
.
blocks
,
64
>>>
();
LAUNCHERROR
(
"kGridSpreadCharge"
);
cufftExecC2C
(
gpu
->
fftplan
,
gpu
->
psPmeGrid
->
_pDevData
,
gpu
->
psPmeGrid
->
_pDevData
,
CUFFT_FORWARD
);
kReciprocalConvolution_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
LAUNCHERROR
(
"kReciprocalConvolution"
);
cufftExecC2C
(
gpu
->
fftplan
,
gpu
->
psPmeGrid
->
_pDevData
,
gpu
->
psPmeGrid
->
_pDevData
,
CUFFT_INVERSE
);
kGridInterpolateForce_kernel
<<<
2
*
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kGridInterpolateForce"
);
}
platforms/cuda-old/src/kernels/kEvaluateExpression.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the routine for evaluating a custom expression.
*/
static
__constant__
float
globalParams
[
8
];
texture
<
float4
,
1
,
cudaReadModeElementType
>
texRef0
;
texture
<
float4
,
1
,
cudaReadModeElementType
>
texRef1
;
texture
<
float4
,
1
,
cudaReadModeElementType
>
texRef2
;
texture
<
float4
,
1
,
cudaReadModeElementType
>
texRef3
;
#define STACK(y) stack[(y)*blockDim.x+threadIdx.x]
#define VARIABLE(y) variables[(y)*blockDim.x+threadIdx.x]
template
<
int
SIZE
>
__device__
float
kEvaluateExpression_kernel
(
Expression
<
SIZE
>*
expression
,
float
*
stack
,
float
*
variables
)
{
int
stackPointer
=
-
1
;
for
(
int
i
=
0
;
i
<
expression
->
length
;
i
++
)
{
int
op
=
expression
->
op
[
i
];
if
(
op
<
MULTIPLY
)
{
STACK
(
++
stackPointer
)
=
VARIABLE
(
op
-
VARIABLE0
);
}
else
if
(
op
<
NEGATE
)
{
if
(
op
<
MULTIPLY_CONSTANT
)
{
if
(
op
==
MULTIPLY
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
--
stackPointer
)
*=
temp
;
}
else
if
(
op
==
DIVIDE
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
stackPointer
)
=
temp
/
STACK
(
--
stackPointer
);
}
else
if
(
op
==
ADD
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
--
stackPointer
)
+=
temp
;
}
else
if
(
op
==
SUBTRACT
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
stackPointer
)
=
temp
-
STACK
(
--
stackPointer
);
}
else
/*if (op == POWER)*/
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
stackPointer
)
=
pow
(
temp
,
STACK
(
--
stackPointer
));
}
}
else
if
(
op
<
GLOBAL
)
{
if
(
op
==
MULTIPLY_CONSTANT
)
{
STACK
(
stackPointer
)
*=
expression
->
arg
[
i
];
}
else
if
(
op
==
POWER_CONSTANT
)
{
STACK
(
stackPointer
)
=
pow
(
STACK
(
stackPointer
),
expression
->
arg
[
i
]);
}
else
/*if (op == ADD_CONSTANT)*/
{
STACK
(
stackPointer
)
+=
expression
->
arg
[
i
];
}
}
else
{
if
(
op
==
GLOBAL
)
{
STACK
(
++
stackPointer
)
=
globalParams
[(
int
)
expression
->
arg
[
i
]];
}
else
if
(
op
==
CONSTANT
)
{
STACK
(
++
stackPointer
)
=
expression
->
arg
[
i
];
}
else
/*if (op == CUSTOM || op == CUSTOM_DERIV)*/
{
int
function
=
(
int
)
expression
->
arg
[
i
];
float
x
=
STACK
(
stackPointer
);
float4
params
=
cSim
.
pTabulatedFunctionParams
[
function
];
if
(
x
<
params
.
x
||
x
>
params
.
y
)
STACK
(
stackPointer
)
=
0.0
f
;
else
{
x
=
(
x
-
params
.
x
)
*
params
.
z
;
int
index
=
floor
(
x
);
index
=
min
(
index
,
(
int
)
params
.
w
);
float4
coeff
;
if
(
function
==
0
)
coeff
=
tex1Dfetch
(
texRef0
,
index
);
else
if
(
function
==
1
)
coeff
=
tex1Dfetch
(
texRef1
,
index
);
else
if
(
function
==
2
)
coeff
=
tex1Dfetch
(
texRef2
,
index
);
else
coeff
=
tex1Dfetch
(
texRef3
,
index
);
float
b
=
x
-
index
;
float
a
=
1.0
f
-
b
;
if
(
op
==
CUSTOM
)
STACK
(
stackPointer
)
=
a
*
coeff
.
x
+
b
*
coeff
.
y
+
((
a
*
a
*
a
-
a
)
*
coeff
.
z
+
(
b
*
b
*
b
-
b
)
*
coeff
.
w
)
/
(
params
.
z
*
params
.
z
);
else
STACK
(
stackPointer
)
=
(
coeff
.
y
-
coeff
.
x
)
*
params
.
z
+
((
1.0
f
-
3.0
f
*
a
*
a
)
*
coeff
.
z
+
(
3.0
f
*
b
*
b
-
1.0
f
)
*
coeff
.
w
)
/
params
.
z
;
}
}
}
}
else
{
if
(
op
<
SIN
)
{
if
(
op
==
NEGATE
)
{
STACK
(
stackPointer
)
*=
-
1.0
f
;
}
else
if
(
op
==
RECIPROCAL
)
{
STACK
(
stackPointer
)
=
1.0
f
/
STACK
(
stackPointer
);
}
else
if
(
op
==
SQRT
)
{
STACK
(
stackPointer
)
=
sqrt
(
STACK
(
stackPointer
));
}
else
if
(
op
==
EXP
)
{
STACK
(
stackPointer
)
=
exp
(
STACK
(
stackPointer
));
}
else
if
(
op
==
LOG
)
{
STACK
(
stackPointer
)
=
log
(
STACK
(
stackPointer
));
}
else
if
(
op
==
SQUARE
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
stackPointer
)
*=
temp
;
}
else
if
(
op
==
CUBE
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
stackPointer
)
*=
temp
*
temp
;
}
else
/*if (op == STEP)*/
{
STACK
(
stackPointer
)
=
(
STACK
(
stackPointer
)
>=
0.0
f
?
1.0
f
:
0.0
f
);
}
}
else
{
if
(
op
==
SIN
)
{
STACK
(
stackPointer
)
=
sin
(
STACK
(
stackPointer
));
}
else
if
(
op
==
COS
)
{
STACK
(
stackPointer
)
=
cos
(
STACK
(
stackPointer
));
}
else
if
(
op
==
SEC
)
{
STACK
(
stackPointer
)
=
1.0
f
/
cos
(
STACK
(
stackPointer
));
}
else
if
(
op
==
CSC
)
{
STACK
(
stackPointer
)
=
1.0
f
/
sin
(
STACK
(
stackPointer
));
}
else
if
(
op
==
TAN
)
{
STACK
(
stackPointer
)
=
tan
(
STACK
(
stackPointer
));
}
else
if
(
op
==
COT
)
{
STACK
(
stackPointer
)
=
1.0
f
/
tan
(
STACK
(
stackPointer
));
}
else
if
(
op
==
ASIN
)
{
STACK
(
stackPointer
)
=
asin
(
STACK
(
stackPointer
));
}
else
if
(
op
==
ACOS
)
{
STACK
(
stackPointer
)
=
acos
(
STACK
(
stackPointer
));
}
else
if
(
op
==
ATAN
)
{
STACK
(
stackPointer
)
=
atan
(
STACK
(
stackPointer
));
}
else
if
(
op
==
SINH
)
{
STACK
(
stackPointer
)
=
sinh
(
STACK
(
stackPointer
));
}
else
if
(
op
==
COSH
)
{
STACK
(
stackPointer
)
=
cosh
(
STACK
(
stackPointer
));
}
else
if
(
op
==
TANH
)
{
STACK
(
stackPointer
)
=
tanh
(
STACK
(
stackPointer
));
}
else
if
(
op
==
ERF
)
{
STACK
(
stackPointer
)
=
erf
(
STACK
(
stackPointer
));
}
else
if
(
op
==
ERFC
)
{
STACK
(
stackPointer
)
=
erfc
(
STACK
(
stackPointer
));
}
else
if
(
op
==
MIN
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
stackPointer
)
=
min
(
temp
,
STACK
(
--
stackPointer
));
}
else
if
(
op
==
MAX
)
{
float
temp
=
STACK
(
stackPointer
);
STACK
(
stackPointer
)
=
max
(
temp
,
STACK
(
--
stackPointer
));
}
else
/*if (op == ABS)*/
{
STACK
(
stackPointer
)
=
fabs
(
STACK
(
stackPointer
));
}
}
}
}
return
STACK
(
stackPointer
);
}
platforms/cuda-old/src/kernels/kFindInteractingBlocks.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for identifying interacting blocks. It is included
* several times in kCalculateCDLJForces.cu with different #defines to generate
* different versions of the kernels.
*/
/**
* Find a bounding box for the atoms in each block.
*/
__global__
void
METHOD_NAME
(
kFindBlockBounds
,
_kernel
)()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
int
base
=
pos
<<
GRIDBITS
;
if
(
base
<
cSim
.
atoms
)
{
float4
apos
=
cSim
.
pPosq
[
base
];
#ifdef USE_PERIODIC
apos
.
x
-=
floorf
(
apos
.
x
*
cSim
.
invPeriodicBoxSizeX
)
*
cSim
.
periodicBoxSizeX
;
apos
.
y
-=
floorf
(
apos
.
y
*
cSim
.
invPeriodicBoxSizeY
)
*
cSim
.
periodicBoxSizeY
;
apos
.
z
-=
floorf
(
apos
.
z
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
periodicBoxSizeZ
;
float4
firstPoint
=
apos
;
#endif
float
minx
=
apos
.
x
;
float
maxx
=
apos
.
x
;
float
miny
=
apos
.
y
;
float
maxy
=
apos
.
y
;
float
minz
=
apos
.
z
;
float
maxz
=
apos
.
z
;
for
(
unsigned
int
i
=
1
;
i
<
GRID
;
i
++
)
{
apos
=
cSim
.
pPosq
[
base
+
i
];
#ifdef USE_PERIODIC
apos
.
x
-=
floorf
((
apos
.
x
-
firstPoint
.
x
)
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
apos
.
y
-=
floorf
((
apos
.
y
-
firstPoint
.
y
)
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
apos
.
z
-=
floorf
((
apos
.
z
-
firstPoint
.
z
)
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
minx
=
min
(
minx
,
apos
.
x
);
maxx
=
max
(
maxx
,
apos
.
x
);
miny
=
min
(
miny
,
apos
.
y
);
maxy
=
max
(
maxy
,
apos
.
y
);
minz
=
min
(
minz
,
apos
.
z
);
maxz
=
max
(
maxz
,
apos
.
z
);
}
cSim
.
pGridBoundingBox
[
pos
]
=
make_float4
(
0
.
5
f
*
(
maxx
-
minx
),
0
.
5
f
*
(
maxy
-
miny
),
0
.
5
f
*
(
maxz
-
minz
),
0
);
cSim
.
pGridCenter
[
pos
]
=
make_float4
(
0
.
5
f
*
(
maxx
+
minx
),
0
.
5
f
*
(
maxy
+
miny
),
0
.
5
f
*
(
maxz
+
minz
),
0
);
}
}
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__global__
void
METHOD_NAME
(
kFindBlocksWithInteractions
,
_kernel
)()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
workUnits
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
cSim
.
pWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
);
x
=
(
x
>>
17
);
// Find the distance between the bounding boxes of the two cells.
float4
centera
=
cSim
.
pGridCenter
[
x
];
float4
centerb
=
cSim
.
pGridCenter
[
y
];
float
dx
=
centera
.
x
-
centerb
.
x
;
float
dy
=
centera
.
y
-
centerb
.
y
;
float
dz
=
centera
.
z
-
centerb
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
float4
boxSizea
=
cSim
.
pGridBoundingBox
[
x
];
float4
boxSizeb
=
cSim
.
pGridBoundingBox
[
y
];
dx
=
max
(
0
.
0
f
,
abs
(
dx
)
-
boxSizea
.
x
-
boxSizeb
.
x
);
dy
=
max
(
0
.
0
f
,
abs
(
dy
)
-
boxSizea
.
y
-
boxSizeb
.
y
);
dz
=
max
(
0
.
0
f
,
abs
(
dz
)
-
boxSizea
.
z
-
boxSizeb
.
z
);
cSim
.
pInteractionFlag
[
pos
]
=
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
>
cSim
.
nonbondedCutoffSqr
?
0
:
1
);
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
__global__
void
METHOD_NAME
(
kFindInteractionsWithinBlocks
,
_kernel
)(
unsigned
int
*
workUnit
)
{
extern
__shared__
volatile
unsigned
int
flags
[];
unsigned
int
totalWarps
=
cSim
.
nonbond_blocks
*
cSim
.
nonbond_threads_per_block
/
GRID
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
GRID
;
unsigned
int
numWorkUnits
=
cSim
.
pInteractionCount
[
0
];
unsigned
int
pos
=
warp
*
numWorkUnits
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numWorkUnits
/
totalWarps
;
unsigned
int
index
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
lasty
=
0xFFFFFFFF
;
float4
apos
;
while
(
pos
<
end
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
workUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
);
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
);
if
(
x
==
y
||
bExclusionFlag
)
{
// Assume this block will be dense.
if
(
index
==
0
)
cSim
.
pInteractionFlag
[
pos
]
=
0xFFFFFFFF
;
}
else
{
// Load the bounding box for x and the atom positions for y.
float4
center
=
cSim
.
pGridCenter
[
x
];
float4
boxSize
=
cSim
.
pGridBoundingBox
[
x
];
if
(
y
!=
lasty
)
{
apos
=
cSim
.
pPosq
[(
y
<<
GRIDBITS
)
+
index
];
}
// Find the distance of the atom from the bounding box.
float
dx
=
apos
.
x
-
center
.
x
;
float
dy
=
apos
.
y
-
center
.
y
;
float
dz
=
apos
.
z
-
center
.
z
;
#ifdef USE_PERIODIC
dx
-=
floorf
(
dx
*
cSim
.
invPeriodicBoxSizeX
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeX
;
dy
-=
floorf
(
dy
*
cSim
.
invPeriodicBoxSizeY
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeY
;
dz
-=
floorf
(
dz
*
cSim
.
invPeriodicBoxSizeZ
+
0
.
5
f
)
*
cSim
.
periodicBoxSizeZ
;
#endif
dx
=
max
(
0
.
0
f
,
abs
(
dx
)
-
boxSize
.
x
);
dy
=
max
(
0
.
0
f
,
abs
(
dy
)
-
boxSize
.
y
);
dz
=
max
(
0
.
0
f
,
abs
(
dz
)
-
boxSize
.
z
);
flags
[
threadIdx
.
x
]
=
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
>
cSim
.
nonbondedCutoffSqr
?
0
:
1
<<
index
);
// Sum the flags.
if
(
index
%
2
==
0
)
flags
[
threadIdx
.
x
]
+=
flags
[
threadIdx
.
x
+
1
];
if
(
index
%
4
==
0
)
flags
[
threadIdx
.
x
]
+=
flags
[
threadIdx
.
x
+
2
];
if
(
index
%
8
==
0
)
flags
[
threadIdx
.
x
]
+=
flags
[
threadIdx
.
x
+
4
];
if
(
index
%
16
==
0
)
flags
[
threadIdx
.
x
]
+=
flags
[
threadIdx
.
x
+
8
];
if
(
index
==
0
)
{
unsigned
int
allFlags
=
flags
[
threadIdx
.
x
]
+
flags
[
threadIdx
.
x
+
16
];
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
unsigned
int
bits
=
(
allFlags
&
0x55555555
)
+
((
allFlags
>>
1
)
&
0x55555555
);
bits
=
(
bits
&
0x33333333
)
+
((
bits
>>
2
)
&
0x33333333
);
bits
=
(
bits
&
0x0F0F0F0F
)
+
((
bits
>>
4
)
&
0x0F0F0F0F
);
bits
=
(
bits
&
0x00FF00FF
)
+
((
bits
>>
8
)
&
0x00FF00FF
);
bits
=
(
bits
&
0x0000FFFF
)
+
((
bits
>>
16
)
&
0x0000FFFF
);
cSim
.
pInteractionFlag
[
pos
]
=
(
bits
>
12
?
0xFFFFFFFF
:
allFlags
);
}
lasty
=
y
;
}
pos
++
;
}
}
platforms/cuda-old/src/kernels/kForces.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#define FABS(a) ((a) > 0.0f ? (a) : -(a))
static
__constant__
cudaGmxSimulation
cSim
;
void
OPENMMCUDA_EXPORT
SetForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetForcesSim copy to cSim failed"
);
}
void
GetForcesSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed"
);
}
__global__
__launch_bounds__
(
384
,
1
)
void
kClearForces_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
stride
*
cSim
.
outputBuffers
)
{
cSim
.
pForce4
[
pos
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
void
OPENMMCUDA_EXPORT
kClearForces
(
gpuContext
gpu
)
{
// printf("kClearForces\n");
kClearForces_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
LAUNCHERROR
(
"kClearForces"
);
}
__global__
__launch_bounds__
(
384
,
1
)
void
kClearBornSumAndForces_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
stride
*
cSim
.
nonbondOutputBuffers
)
{
cSim
.
pBornSum
[
pos
]
=
0.0
f
;
cSim
.
pBornForce
[
pos
]
=
0.0
f
;
cSim
.
pForce4
[
pos
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
while
(
pos
<
cSim
.
stride
*
cSim
.
outputBuffers
)
{
cSim
.
pForce4
[
pos
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
void
kClearBornSumAndForces
(
gpuContext
gpu
)
{
// printf("kClearBornSumAndForces\n");
kClearBornSumAndForces_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
LAUNCHERROR
(
"kClearBornSumAndForces"
);
}
__global__
__launch_bounds__
(
384
,
1
)
void
kClearEnergy_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
while
(
pos
<
cSim
.
energyOutputBuffers
)
{
((
float
*
)
cSim
.
pEnergy
)[
pos
]
=
0.0
f
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
void
kClearEnergy
(
gpuContext
gpu
)
{
// printf("kClearEnergy\n");
kClearEnergy_kernel
<<<
gpu
->
sim
.
blocks
,
384
>>>
();
LAUNCHERROR
(
"kClearEnergy"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_THREADS_PER_BLOCK
,
1
)
#endif
void
kReduceBornSumAndForces_kernel
()
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
// Reduce forces
while
(
pos
<
cSim
.
stride4
)
{
float
totalForce
=
0.0
f
;
float
*
pFt
=
(
float
*
)
cSim
.
pForce4
+
pos
;
int
i
=
cSim
.
outputBuffers
;
while
(
i
>=
4
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f3
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f4
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
totalForce
+=
f1
+
f2
+
f3
+
f4
;
i
-=
4
;
}
if
(
i
>=
2
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
totalForce
+=
f1
+
f2
;
i
-=
2
;
}
if
(
i
>
0
)
{
totalForce
+=
*
pFt
;
}
pFt
=
(
float
*
)
cSim
.
pForce4
+
pos
;
*
pFt
=
totalForce
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
// Reduce Born Sum
while
(
pos
-
cSim
.
stride4
<
cSim
.
atoms
)
{
float
sum
=
0.0
f
;
float
*
pSt
=
cSim
.
pBornSum
+
pos
-
cSim
.
stride4
;
float2
atom
=
cSim
.
pObcData
[
pos
-
cSim
.
stride4
];
// Get summed Born data
int
i
=
cSim
.
nonbondOutputBuffers
;
while
(
i
>=
4
)
{
float
f1
=
*
pSt
;
pSt
+=
cSim
.
stride
;
float
f2
=
*
pSt
;
pSt
+=
cSim
.
stride
;
float
f3
=
*
pSt
;
pSt
+=
cSim
.
stride
;
float
f4
=
*
pSt
;
pSt
+=
cSim
.
stride
;
sum
+=
f1
+
f2
+
f3
+
f4
;
i
-=
4
;
}
if
(
i
>=
2
)
{
float
f1
=
*
pSt
;
pSt
+=
cSim
.
stride
;
float
f2
=
*
pSt
;
pSt
+=
cSim
.
stride
;
sum
+=
f1
+
f2
;
i
-=
2
;
}
if
(
i
>
0
)
{
sum
+=
*
pSt
;
}
// Now calculate Born radius and OBC term.
cSim
.
pBornSum
[
pos
-
cSim
.
stride4
]
=
sum
;
sum
*=
0.5
f
*
atom
.
x
;
float
sum2
=
sum
*
sum
;
float
sum3
=
sum
*
sum2
;
float
tanhSum
=
tanh
(
cSim
.
alphaOBC
*
sum
-
cSim
.
betaOBC
*
sum2
+
cSim
.
gammaOBC
*
sum3
);
float
nonOffsetRadii
=
atom
.
x
+
cSim
.
dielectricOffset
;
float
bornRadius
=
1.0
f
/
(
1.0
f
/
atom
.
x
-
tanhSum
/
nonOffsetRadii
);
float
obcChain
=
atom
.
x
*
(
cSim
.
alphaOBC
-
2.0
f
*
cSim
.
betaOBC
*
sum
+
3.0
f
*
cSim
.
gammaOBC
*
sum2
);
obcChain
=
(
1.0
f
-
tanhSum
*
tanhSum
)
*
obcChain
/
nonOffsetRadii
;
cSim
.
pBornRadii
[
pos
-
cSim
.
stride4
]
=
bornRadius
;
cSim
.
pObcChain
[
pos
-
cSim
.
stride4
]
=
obcChain
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
void
kReduceBornSumAndForces
(
gpuContext
gpu
)
{
fprintf
(
stderr
,
"kReduceBornSumAndForces
\n
"
);
kReduceBornSumAndForces_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
();
LAUNCHERROR
(
"kReduceBornSumAndForces"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_THREADS_PER_BLOCK
,
1
)
#endif
void
kReduceForces_kernel
()
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
// Reduce forces
while
(
pos
<
cSim
.
stride4
)
{
float
totalForce
=
0.0
f
;
float
*
pFt
=
(
float
*
)
cSim
.
pForce4
+
pos
;
int
i
=
cSim
.
outputBuffers
;
while
(
i
>=
4
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f3
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f4
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
totalForce
+=
f1
+
f2
+
f3
+
f4
;
i
-=
4
;
}
if
(
i
>=
2
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride4
;
totalForce
+=
f1
+
f2
;
i
-=
2
;
}
if
(
i
>
0
)
{
totalForce
+=
*
pFt
;
}
pFt
=
(
float
*
)
cSim
.
pForce4
+
pos
;
*
pFt
=
totalForce
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
}
void
OPENMMCUDA_EXPORT
kReduceForces
(
gpuContext
gpu
)
{
// printf("kReduceForces\n");
kReduceForces_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
();
LAUNCHERROR
(
"kReduceForces"
);
}
double
kReduceEnergy
(
gpuContext
gpu
)
{
//printf("kReduceEnergy\n");
gpu
->
psEnergy
->
Download
();
double
sum
=
0.0
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
gpu
->
sim
.
energyOutputBuffers
);
i
++
){
sum
+=
(
*
gpu
->
psEnergy
)[
i
];
}
return
sum
;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_THREADS_PER_BLOCK
,
1
)
#endif
void
kReduceObcGbsaBornForces_kernel
()
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
float
energy
=
0.0
f
;
while
(
pos
<
cSim
.
atoms
)
{
float
bornRadius
=
cSim
.
pBornRadii
[
pos
];
float
obcChain
=
cSim
.
pObcChain
[
pos
];
float2
obcData
=
cSim
.
pObcData
[
pos
];
float
totalForce
=
0.0
f
;
float
*
pFt
=
cSim
.
pBornForce
+
pos
;
int
i
=
cSim
.
nonbondOutputBuffers
;
while
(
i
>=
4
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f3
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f4
=
*
pFt
;
pFt
+=
cSim
.
stride
;
totalForce
+=
f1
+
f2
+
f3
+
f4
;
i
-=
4
;
}
if
(
i
>=
2
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride
;
totalForce
+=
f1
+
f2
;
i
-=
2
;
}
if
(
i
>
0
)
{
totalForce
+=
*
pFt
;
}
float
r
=
(
obcData
.
x
+
cSim
.
dielectricOffset
+
cSim
.
probeRadius
);
float
ratio6
=
powf
((
obcData
.
x
+
cSim
.
dielectricOffset
)
/
bornRadius
,
6.0
f
);
float
saTerm
=
cSim
.
surfaceAreaFactor
*
r
*
r
*
ratio6
;
totalForce
+=
saTerm
/
bornRadius
;
totalForce
*=
bornRadius
*
bornRadius
*
obcChain
;
energy
+=
saTerm
;
pFt
=
cSim
.
pBornForce
+
pos
;
*
pFt
=
totalForce
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
// correct for surface area factor of -6
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
/
-
6.0
f
;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_THREADS_PER_BLOCK
,
1
)
#endif
void
kReduceGBVIBornForces_kernel
()
{
unsigned
int
pos
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
float
energy
=
0.0
f
;
while
(
pos
<
cSim
.
atoms
)
{
float
bornRadius
=
cSim
.
pBornRadii
[
pos
];
float4
gbviData
=
cSim
.
pGBVIData
[
pos
];
float
switchDeriv
=
cSim
.
pGBVISwitchDerivative
[
pos
];
float
totalForce
=
0.0
f
;
float
*
pFt
=
cSim
.
pBornForce
+
pos
;
int
i
=
cSim
.
nonbondOutputBuffers
;
while
(
i
>=
4
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f3
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f4
=
*
pFt
;
pFt
+=
cSim
.
stride
;
totalForce
+=
f1
+
f2
+
f3
+
f4
;
i
-=
4
;
}
if
(
i
>=
2
)
{
float
f1
=
*
pFt
;
pFt
+=
cSim
.
stride
;
float
f2
=
*
pFt
;
pFt
+=
cSim
.
stride
;
totalForce
+=
f1
+
f2
;
i
-=
2
;
}
if
(
i
>
0
)
{
totalForce
+=
*
pFt
;
}
float
ratio
=
(
gbviData
.
x
/
bornRadius
);
float
ratio3
=
ratio
*
ratio
*
ratio
;
energy
-=
gbviData
.
z
*
ratio3
;
totalForce
+=
(
3.0
f
*
gbviData
.
z
*
ratio3
)
/
bornRadius
;
// 'cavity' term
float
br2
=
bornRadius
*
bornRadius
;
totalForce
*=
(
1.0
f
/
3.0
f
)
*
br2
*
br2
*
switchDeriv
;
pFt
=
cSim
.
pBornForce
+
pos
;
*
pFt
=
totalForce
;
pos
+=
gridDim
.
x
*
blockDim
.
x
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
void
kReduceObcGbsaBornForces
(
gpuContext
gpu
)
{
if
(
gpu
->
bIncludeGBSA
){
kReduceObcGbsaBornForces_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
();
LAUNCHERROR
(
"kReduceObcGbsaBornForces"
);
}
else
if
(
gpu
->
bIncludeGBVI
){
kReduceGBVIBornForces_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
bsf_reduce_threads_per_block
>>>
();
LAUNCHERROR
(
"kReduceGBVIBornForces"
);
}
}
platforms/cuda-old/src/kernels/kLangevinUpdate.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
using
namespace
std
;
#include "gputypes.h"
enum
{
VelScale
,
ForceScale
,
NoiseScale
,
MaxParams
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetLangevinUpdateSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetLangevinUpdateSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
// Include versions of the kernels with and with center of mass motion removal.
#include "kLangevinUpdate.h"
#define REMOVE_CM
#include "kLangevinUpdate.h"
void
kLangevinUpdatePart1
(
gpuContext
gpu
)
{
// printf("kLangevinUpdatePart1\n");
if
(
gpu
->
bRemoveCM
)
{
kLangevinUpdatePart1CM_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
,
gpu
->
sim
.
update_threads_per_block
*
sizeof
(
float3
)
>>>
();
LAUNCHERROR
(
"kLangevinUpdatePart1CM"
);
gpu
->
bRemoveCM
=
false
;
}
else
{
kLangevinUpdatePart1_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kLangevinUpdatePart1"
);
}
}
extern
void
kGenerateRandoms
(
gpuContext
gpu
);
void
kLangevinUpdatePart2
(
gpuContext
gpu
)
{
// printf("kLangevinUpdatePart2\n");
if
(
gpu
->
bCalculateCM
)
{
kLangevinUpdatePart2CM_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
,
gpu
->
sim
.
update_threads_per_block
*
sizeof
(
float3
)
>>>
();
LAUNCHERROR
(
"kLangevinUpdatePart2CM"
);
gpu
->
bCalculateCM
=
false
;
gpu
->
bRemoveCM
=
true
;
}
else
{
kLangevinUpdatePart2_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kLangevinUpdatePart2"
);
}
// Update randoms if necessary
gpu
->
iterations
++
;
if
(
gpu
->
iterations
==
gpu
->
sim
.
randomIterations
)
{
kGenerateRandoms
(
gpu
);
gpu
->
iterations
=
0
;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
void
kSelectLangevinStepSize_kernel
(
float
maxStepSize
)
{
// Calculate the error.
extern
__shared__
float
error
[];
__shared__
float
params
[
MaxParams
];
error
[
threadIdx
.
x
]
=
0.0
f
;
unsigned
int
pos
=
threadIdx
.
x
;
while
(
pos
<
cSim
.
atoms
)
{
float4
force
=
cSim
.
pForce4
[
pos
];
float
invMass
=
cSim
.
pVelm4
[
pos
].
w
;
error
[
threadIdx
.
x
]
+=
(
force
.
x
*
force
.
x
+
force
.
y
*
force
.
y
+
force
.
z
*
force
.
z
)
*
invMass
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
__syncthreads
();
// Sum the errors from all threads.
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
if
(
threadIdx
.
x
+
offset
<
blockDim
.
x
&&
(
threadIdx
.
x
&
(
2
*
offset
-
1
))
==
0
)
error
[
threadIdx
.
x
]
+=
error
[
threadIdx
.
x
+
offset
];
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
{
// Select the new step size.
float
totalError
=
sqrt
(
error
[
0
]
/
(
cSim
.
atoms
*
3
));
float
newStepSize
=
sqrt
(
cSim
.
errorTol
/
totalError
);
float
oldStepSize
=
cSim
.
pStepSize
[
0
].
y
;
if
(
oldStepSize
>
0.0
f
)
newStepSize
=
min
(
newStepSize
,
oldStepSize
*
2.0
f
);
// For safety, limit how quickly dt can increase.
if
(
newStepSize
>
oldStepSize
&&
newStepSize
<
1.1
f
*
oldStepSize
)
newStepSize
=
oldStepSize
;
// Keeping dt constant between steps improves the behavior of the integrator.
if
(
newStepSize
>
maxStepSize
)
newStepSize
=
maxStepSize
;
cSim
.
pStepSize
[
0
].
y
=
newStepSize
;
// Recalculate the integration parameters.
float
vscale
=
exp
(
-
newStepSize
/
cSim
.
tau
);
float
fscale
=
(
1
-
vscale
)
*
cSim
.
tau
;
float
noisescale
=
sqrt
(
2
*
cSim
.
kT
/
cSim
.
tau
)
*
sqrt
(
0.5
f
*
(
1
-
vscale
*
vscale
)
*
cSim
.
tau
);
params
[
VelScale
]
=
vscale
;
params
[
ForceScale
]
=
fscale
;
params
[
NoiseScale
]
=
noisescale
;
}
__syncthreads
();
if
(
threadIdx
.
x
<
MaxParams
)
cSim
.
pLangevinParameters
[
threadIdx
.
x
]
=
params
[
threadIdx
.
x
];
}
void
kSelectLangevinStepSize
(
gpuContext
gpu
,
float
maxTimeStep
)
{
// printf("kSelectLangevinStepSize\n");
kSelectLangevinStepSize_kernel
<<<
1
,
gpu
->
sim
.
update_threads_per_block
,
sizeof
(
float
)
*
gpu
->
sim
.
update_threads_per_block
>>>
(
maxTimeStep
);
LAUNCHERROR
(
"kSelectLangevinStepSize"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
void
kSetVelocitiesFromPositions_kernel
()
{
float2
stepSize
=
cSim
.
pStepSize
[
0
];
double
oneOverDt
=
2.0
/
(
stepSize
.
x
+
stepSize
.
y
);
unsigned
int
pos
=
threadIdx
.
x
;
while
(
pos
<
cSim
.
atoms
)
{
float4
posq
=
cSim
.
pPosq
[
pos
];
float4
posqP
=
cSim
.
pPosqP
[
pos
];
float4
velm
=
cSim
.
pVelm4
[
pos
];
velm
.
x
=
(
float
)
(
oneOverDt
*
posqP
.
x
);
velm
.
y
=
(
float
)
(
oneOverDt
*
posqP
.
y
);
velm
.
z
=
(
float
)
(
oneOverDt
*
posqP
.
z
);
cSim
.
pVelm4
[
pos
]
=
velm
;
posq
.
x
+=
posqP
.
x
;
posq
.
y
+=
posqP
.
y
;
posq
.
z
+=
posqP
.
z
;
cSim
.
pPosq
[
pos
]
=
posq
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
void
kSetVelocitiesFromPositions
(
gpuContext
gpu
)
{
// printf("kSetVelocitiesFromPositions\n");
kSetVelocitiesFromPositions_kernel
<<<
1
,
gpu
->
sim
.
update_threads_per_block
,
sizeof
(
float
)
*
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kSetVelocitiesFromPositions"
);
}
platforms/cuda-old/src/kernels/kLangevinUpdate.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
/**
* This file contains the kernels for Langevin integration. It is included
* several times in kLangevinUpdate.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
#ifdef REMOVE_CM
void
kLangevinUpdatePart1CM_kernel
()
#else
void
kLangevinUpdatePart1_kernel
()
#endif
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
rpos
=
cSim
.
pRandomPosition
[
blockIdx
.
x
];
float
vscale
=
cSim
.
pLangevinParameters
[
VelScale
];
float
fscale
=
cSim
.
pLangevinParameters
[
ForceScale
];
float
noisescale
=
cSim
.
pLangevinParameters
[
NoiseScale
];
#ifdef REMOVE_CM
extern
__shared__
float3
sCM
[];
float3
CM
=
{
0
.
0
f
,
0
.
0
f
,
0
.
0
f
};
float4
CM1
=
{
0
.
0
f
,
0
.
0
f
,
0
.
0
f
,
0
.
0
f
};
// Read CM outputs from previous step
unsigned
int
cpos
=
threadIdx
.
x
;
while
(
cpos
<
gridDim
.
x
)
{
CM1
=
cSim
.
pLinearMomentum
[
cpos
];
CM
.
x
+=
CM1
.
x
;
CM
.
y
+=
CM1
.
y
;
CM
.
z
+=
CM1
.
z
;
cpos
+=
blockDim
.
x
;
}
sCM
[
threadIdx
.
x
].
x
=
CM
.
x
;
sCM
[
threadIdx
.
x
].
y
=
CM
.
y
;
sCM
[
threadIdx
.
x
].
z
=
CM
.
z
;
__syncthreads
();
// Reduce CM
unsigned
int
offset
=
1
;
unsigned
int
mask
=
1
;
while
(
offset
<
blockDim
.
x
)
{
if
(((
threadIdx
.
x
&
mask
)
==
0
)
&&
(
threadIdx
.
x
+
offset
<
blockDim
.
x
))
{
sCM
[
threadIdx
.
x
].
x
+=
sCM
[
threadIdx
.
x
+
offset
].
x
;
sCM
[
threadIdx
.
x
].
y
+=
sCM
[
threadIdx
.
x
+
offset
].
y
;
sCM
[
threadIdx
.
x
].
z
+=
sCM
[
threadIdx
.
x
+
offset
].
z
;
}
mask
=
2
*
mask
+
1
;
offset
*=
2
;
__syncthreads
();
}
#endif
while
(
pos
<
cSim
.
atoms
)
{
float4
velocity
=
cSim
.
pVelm4
[
pos
];
float4
random4a
=
cSim
.
pRandom4
[
rpos
+
pos
];
float4
force
=
cSim
.
pForce4
[
pos
];
float
sqrtInvMass
=
sqrt
(
velocity
.
w
);
velocity
.
x
=
vscale
*
velocity
.
x
+
fscale
*
velocity
.
w
*
force
.
x
+
noisescale
*
sqrtInvMass
*
random4a
.
x
;
velocity
.
y
=
vscale
*
velocity
.
y
+
fscale
*
velocity
.
w
*
force
.
y
+
noisescale
*
sqrtInvMass
*
random4a
.
y
;
velocity
.
z
=
vscale
*
velocity
.
z
+
fscale
*
velocity
.
w
*
force
.
z
+
noisescale
*
sqrtInvMass
*
random4a
.
z
;
#ifdef REMOVE_CM
velocity
.
x
-=
sCM
[
0
].
x
;
velocity
.
y
-=
sCM
[
0
].
y
;
velocity
.
z
-=
sCM
[
0
].
z
;
#endif
cSim
.
pOldPosq
[
pos
]
=
cSim
.
pPosq
[
pos
];
cSim
.
pVelm4
[
pos
]
=
velocity
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
#ifdef REMOVE_CM
void
kLangevinUpdatePart2CM_kernel
()
#else
void
kLangevinUpdatePart2_kernel
()
#endif
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
unsigned
int
rpos
=
cSim
.
pRandomPosition
[
blockIdx
.
x
];
__shared__
float
dt
;
if
(
threadIdx
.
x
==
0
)
{
dt
=
cSim
.
pStepSize
[
0
].
y
;
if
(
pos
==
0
)
cSim
.
pStepSize
[
0
].
x
=
dt
;
}
__syncthreads
();
#ifdef REMOVE_CM
extern
__shared__
float3
sCM
[];
float3
CM
=
{
0
.
0
f
,
0
.
0
f
,
0
.
0
f
};
__syncthreads
();
#endif
while
(
pos
<
cSim
.
atoms
)
{
float4
velocity
=
cSim
.
pVelm4
[
pos
];
#ifdef REMOVE_CM
float
mass
=
1
.
0
f
/
velocity
.
w
;
CM
.
x
+=
mass
*
velocity
.
x
;
CM
.
y
+=
mass
*
velocity
.
y
;
CM
.
z
+=
mass
*
velocity
.
z
;
#endif
float4
xPrime
=
make_float4
(
dt
*
velocity
.
x
,
dt
*
velocity
.
y
,
dt
*
velocity
.
z
,
cSim
.
pPosq
[
pos
].
w
);
cSim
.
pPosqP
[
pos
]
=
xPrime
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
// Update random position pointer
if
(
threadIdx
.
x
==
0
)
{
rpos
+=
cSim
.
paddedNumberOfAtoms
;
if
(
rpos
>
cSim
.
randoms
)
rpos
-=
cSim
.
randoms
;
cSim
.
pRandomPosition
[
blockIdx
.
x
]
=
rpos
;
}
#ifdef REMOVE_CM
// Scale CM
CM
.
x
*=
cSim
.
inverseTotalMass
;
CM
.
y
*=
cSim
.
inverseTotalMass
;
CM
.
z
*=
cSim
.
inverseTotalMass
;
sCM
[
threadIdx
.
x
]
=
CM
;
__syncthreads
();
// Reduce CM for CTA
unsigned
int
offset
=
1
;
unsigned
int
mask
=
1
;
while
(
offset
<
blockDim
.
x
)
{
if
(((
threadIdx
.
x
&
mask
)
==
0
)
&&
(
threadIdx
.
x
+
offset
<
blockDim
.
x
))
{
sCM
[
threadIdx
.
x
].
x
+=
sCM
[
threadIdx
.
x
+
offset
].
x
;
sCM
[
threadIdx
.
x
].
y
+=
sCM
[
threadIdx
.
x
+
offset
].
y
;
sCM
[
threadIdx
.
x
].
z
+=
sCM
[
threadIdx
.
x
+
offset
].
z
;
}
mask
=
2
*
mask
+
1
;
offset
*=
2
;
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
{
float4
CM
;
CM
.
x
=
sCM
[
0
].
x
;
CM
.
y
=
sCM
[
0
].
y
;
CM
.
z
=
sCM
[
0
].
z
;
CM
.
w
=
0
.
0
f
;
cSim
.
pLinearMomentum
[
blockIdx
.
x
]
=
CM
;
}
#endif
}
platforms/cuda-old/src/kernels/kMonteCarloBarostat.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
__global__
void
kScaleAtomCoordinates_kernel
(
float
scale
,
int
numMolecules
,
float3
periodicBoxSize
,
float4
*
posq
,
int
*
moleculeAtoms
,
int
*
moleculeStartIndex
)
{
float3
invPeriodicBoxSize
=
make_float3
(
1.0
f
/
periodicBoxSize
.
x
,
1.0
f
/
periodicBoxSize
.
y
,
1.0
f
/
periodicBoxSize
.
z
);
for
(
int
index
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
index
<
numMolecules
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
first
=
moleculeStartIndex
[
index
];
int
last
=
moleculeStartIndex
[
index
+
1
];
int
numAtoms
=
last
-
first
;
// Find the center of each molecule.
float3
center
=
make_float3
(
0
,
0
,
0
);
for
(
int
atom
=
first
;
atom
<
last
;
atom
++
)
{
float4
pos
=
posq
[
moleculeAtoms
[
atom
]];
center
.
x
+=
pos
.
x
;
center
.
y
+=
pos
.
y
;
center
.
z
+=
pos
.
z
;
}
center
.
x
/=
(
float
)
numAtoms
;
center
.
y
/=
(
float
)
numAtoms
;
center
.
z
/=
(
float
)
numAtoms
;
// Move it into the first periodic box.
int
xcell
=
(
int
)
floorf
(
center
.
x
*
invPeriodicBoxSize
.
x
);
int
ycell
=
(
int
)
floorf
(
center
.
y
*
invPeriodicBoxSize
.
y
);
int
zcell
=
(
int
)
floorf
(
center
.
z
*
invPeriodicBoxSize
.
z
);
float3
delta
=
make_float3
(
xcell
*
periodicBoxSize
.
x
,
ycell
*
periodicBoxSize
.
y
,
zcell
*
periodicBoxSize
.
z
);
center
.
x
-=
delta
.
x
;
center
.
y
-=
delta
.
y
;
center
.
z
-=
delta
.
z
;
// Now scale the position of the molecule center.
delta
.
x
=
center
.
x
*
(
scale
-
1
)
-
delta
.
x
;
delta
.
y
=
center
.
y
*
(
scale
-
1
)
-
delta
.
y
;
delta
.
z
=
center
.
z
*
(
scale
-
1
)
-
delta
.
z
;
for
(
int
atom
=
first
;
atom
<
last
;
atom
++
)
{
float4
pos
=
posq
[
moleculeAtoms
[
atom
]];
pos
.
x
+=
delta
.
x
;
pos
.
y
+=
delta
.
y
;
pos
.
z
+=
delta
.
z
;
posq
[
moleculeAtoms
[
atom
]]
=
pos
;
}
}
}
void
kScaleAtomCoordinates
(
gpuContext
gpu
,
float
scale
,
CUDAStream
<
int
>&
moleculeAtoms
,
CUDAStream
<
int
>&
moleculeStartIndex
)
{
// printf("kScaleAtomCoordinates\n");
kScaleAtomCoordinates_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
(
scale
,
moleculeStartIndex
.
_length
-
1
,
make_float3
(
gpu
->
sim
.
periodicBoxSizeX
,
gpu
->
sim
.
periodicBoxSizeY
,
gpu
->
sim
.
periodicBoxSizeZ
),
gpu
->
sim
.
pPosq
,
moleculeAtoms
.
_pDevData
,
moleculeStartIndex
.
_pDevData
);
LAUNCHERROR
(
"kScaleAtomCoordinates"
);
}
platforms/cuda-old/src/kernels/kRandom.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
void
SetRandomSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetRandomSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
extern
__shared__
float3
sRand
[];
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_RANDOM_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_RANDOM_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_RANDOM_THREADS_PER_BLOCK
,
1
)
#endif
void
kGenerateRandoms_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
int
increment
=
blockDim
.
x
*
gridDim
.
x
;
// Read generator state
uint4
state
=
cSim
.
pRandomSeed
[
pos
];
unsigned
int
carry
=
0
;
float4
random4
;
float2
random2
;
while
(
pos
<
cSim
.
totalRandoms
)
{
// Generate 6 randoms in GRF
unsigned
int
pos1
=
threadIdx
.
x
;
for
(
int
i
=
0
;
i
<
2
;
i
++
)
{
state
.
x
=
state
.
x
*
69069
+
1
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
unsigned
int
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
unsigned
int
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x1
=
(
float
)
max
(
state
.
x
+
state
.
y
+
state
.
w
,
0x00000001
)
/
(
float
)
0xffffffff
;
state
.
x
=
state
.
x
*
69069
+
1
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x1
=
sqrtf
(
-
2.0
f
*
logf
(
x1
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x2
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
state
.
x
=
state
.
x
*
69069
+
1
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
sRand
[
pos1
].
x
=
x1
*
cosf
(
2.0
f
*
3.14159265
f
*
x2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x3
=
(
float
)
max
(
state
.
x
+
state
.
y
+
state
.
w
,
0x00000001
)
/
(
float
)
0xffffffff
;
state
.
x
=
state
.
x
*
69069
+
1
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x3
=
sqrtf
(
-
2.0
f
*
logf
(
x3
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x4
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
state
.
x
=
state
.
x
*
69069
+
1
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
sRand
[
pos1
].
y
=
x3
*
cosf
(
2.0
f
*
3.14159265
f
*
x4
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x5
=
(
float
)
max
(
state
.
x
+
state
.
y
+
state
.
w
,
0x00000001
)
/
(
float
)
0xffffffff
;
state
.
x
=
state
.
x
*
69069
+
1
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x5
=
sqrtf
(
-
2.0
f
*
logf
(
x5
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x6
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
sRand
[
pos1
].
z
=
x5
*
cosf
(
2.0
f
*
3.14159265
f
*
x6
);
pos1
+=
blockDim
.
x
;
}
// Output final randoms
random4
.
x
=
sRand
[
threadIdx
.
x
].
x
;
random4
.
y
=
sRand
[
threadIdx
.
x
].
y
;
random4
.
z
=
sRand
[
threadIdx
.
x
].
z
;
random4
.
w
=
sRand
[
threadIdx
.
x
+
blockDim
.
x
].
x
;
cSim
.
pRandom4
[
pos
]
=
random4
;
random2
.
x
=
sRand
[
threadIdx
.
x
+
blockDim
.
x
].
y
;
random2
.
y
=
sRand
[
threadIdx
.
x
+
blockDim
.
x
].
z
;
cSim
.
pRandom2
[
pos
]
=
random2
;
pos
+=
increment
;
}
// Write generator state
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
cSim
.
pRandomSeed
[
pos
]
=
state
;
}
void
kGenerateRandoms
(
gpuContext
gpu
)
{
kGenerateRandoms_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
random_threads_per_block
,
gpu
->
sim
.
random_threads_per_block
*
2
*
sizeof
(
float3
)
>>>
();
}
platforms/cuda-old/src/kernels/kSettle.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
void
SetSettleSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetSettleSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
/**
* This is based on the setlep FORTRAN routine by Shuichi Miyamoto. See
* S. Miyamoto and P. Kollman, J. Comp. Chem., vol 13, no. 8, pp. 952-962 (1992).
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_SHAKE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_SHAKE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_SHAKE_THREADS_PER_BLOCK
,
1
)
#endif
void
kApplySettle_kernel
()
{
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
while
(
pos
<
cSim
.
settleConstraints
)
{
// Load data.
int4
atomID
=
cSim
.
pSettleID
[
pos
];
float2
params
=
cSim
.
pSettleParameter
[
pos
];
float4
apos0
=
cSim
.
pOldPosq
[
atomID
.
x
];
float4
xp0
=
cSim
.
pPosqP
[
atomID
.
x
];
float4
apos1
=
cSim
.
pOldPosq
[
atomID
.
y
];
float4
xp1
=
cSim
.
pPosqP
[
atomID
.
y
];
float4
apos2
=
cSim
.
pOldPosq
[
atomID
.
z
];
float4
xp2
=
cSim
.
pPosqP
[
atomID
.
z
];
float
m0
=
1.0
f
/
cSim
.
pVelm4
[
atomID
.
x
].
w
;
float
m1
=
1.0
f
/
cSim
.
pVelm4
[
atomID
.
y
].
w
;
float
m2
=
1.0
f
/
cSim
.
pVelm4
[
atomID
.
z
].
w
;
// Translate the molecule to the origin to improve numerical precision.
float3
center
=
make_float3
(
apos0
.
x
,
apos0
.
y
,
apos0
.
z
);
apos0
.
x
-=
center
.
x
;
apos0
.
y
-=
center
.
y
;
apos0
.
z
-=
center
.
z
;
apos1
.
x
-=
center
.
x
;
apos1
.
y
-=
center
.
y
;
apos1
.
z
-=
center
.
z
;
apos2
.
x
-=
center
.
x
;
apos2
.
y
-=
center
.
y
;
apos2
.
z
-=
center
.
z
;
// Apply the SETTLE algorithm.
float
xb0
=
apos1
.
x
-
apos0
.
x
;
float
yb0
=
apos1
.
y
-
apos0
.
y
;
float
zb0
=
apos1
.
z
-
apos0
.
z
;
float
xc0
=
apos2
.
x
-
apos0
.
x
;
float
yc0
=
apos2
.
y
-
apos0
.
y
;
float
zc0
=
apos2
.
z
-
apos0
.
z
;
float
totalMass
=
m0
+
m1
+
m2
;
float
xcom
=
((
apos0
.
x
+
xp0
.
x
)
*
m0
+
(
apos1
.
x
+
xp1
.
x
)
*
m1
+
(
apos2
.
x
+
xp2
.
x
)
*
m2
)
/
totalMass
;
float
ycom
=
((
apos0
.
y
+
xp0
.
y
)
*
m0
+
(
apos1
.
y
+
xp1
.
y
)
*
m1
+
(
apos2
.
y
+
xp2
.
y
)
*
m2
)
/
totalMass
;
float
zcom
=
((
apos0
.
z
+
xp0
.
z
)
*
m0
+
(
apos1
.
z
+
xp1
.
z
)
*
m1
+
(
apos2
.
z
+
xp2
.
z
)
*
m2
)
/
totalMass
;
float
xa1
=
apos0
.
x
+
xp0
.
x
-
xcom
;
float
ya1
=
apos0
.
y
+
xp0
.
y
-
ycom
;
float
za1
=
apos0
.
z
+
xp0
.
z
-
zcom
;
float
xb1
=
apos1
.
x
+
xp1
.
x
-
xcom
;
float
yb1
=
apos1
.
y
+
xp1
.
y
-
ycom
;
float
zb1
=
apos1
.
z
+
xp1
.
z
-
zcom
;
float
xc1
=
apos2
.
x
+
xp2
.
x
-
xcom
;
float
yc1
=
apos2
.
y
+
xp2
.
y
-
ycom
;
float
zc1
=
apos2
.
z
+
xp2
.
z
-
zcom
;
float
xaksZd
=
yb0
*
zc0
-
zb0
*
yc0
;
float
yaksZd
=
zb0
*
xc0
-
xb0
*
zc0
;
float
zaksZd
=
xb0
*
yc0
-
yb0
*
xc0
;
float
xaksXd
=
ya1
*
zaksZd
-
za1
*
yaksZd
;
float
yaksXd
=
za1
*
xaksZd
-
xa1
*
zaksZd
;
float
zaksXd
=
xa1
*
yaksZd
-
ya1
*
xaksZd
;
float
xaksYd
=
yaksZd
*
zaksXd
-
zaksZd
*
yaksXd
;
float
yaksYd
=
zaksZd
*
xaksXd
-
xaksZd
*
zaksXd
;
float
zaksYd
=
xaksZd
*
yaksXd
-
yaksZd
*
xaksXd
;
float
axlng
=
sqrtf
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
float
aylng
=
sqrtf
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
float
azlng
=
sqrtf
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
float
trns11
=
xaksXd
/
axlng
;
float
trns21
=
yaksXd
/
axlng
;
float
trns31
=
zaksXd
/
axlng
;
float
trns12
=
xaksYd
/
aylng
;
float
trns22
=
yaksYd
/
aylng
;
float
trns32
=
zaksYd
/
aylng
;
float
trns13
=
xaksZd
/
azlng
;
float
trns23
=
yaksZd
/
azlng
;
float
trns33
=
zaksZd
/
azlng
;
float
xb0d
=
trns11
*
xb0
+
trns21
*
yb0
+
trns31
*
zb0
;
float
yb0d
=
trns12
*
xb0
+
trns22
*
yb0
+
trns32
*
zb0
;
float
xc0d
=
trns11
*
xc0
+
trns21
*
yc0
+
trns31
*
zc0
;
float
yc0d
=
trns12
*
xc0
+
trns22
*
yc0
+
trns32
*
zc0
;
float
za1d
=
trns13
*
xa1
+
trns23
*
ya1
+
trns33
*
za1
;
float
xb1d
=
trns11
*
xb1
+
trns21
*
yb1
+
trns31
*
zb1
;
float
yb1d
=
trns12
*
xb1
+
trns22
*
yb1
+
trns32
*
zb1
;
float
zb1d
=
trns13
*
xb1
+
trns23
*
yb1
+
trns33
*
zb1
;
float
xc1d
=
trns11
*
xc1
+
trns21
*
yc1
+
trns31
*
zc1
;
float
yc1d
=
trns12
*
xc1
+
trns22
*
yc1
+
trns32
*
zc1
;
float
zc1d
=
trns13
*
xc1
+
trns23
*
yc1
+
trns33
*
zc1
;
// --- Step2 A2' ---
float
rc
=
0.5
f
*
params
.
y
;
float
rb
=
sqrtf
(
params
.
x
*
params
.
x
-
rc
*
rc
);
float
ra
=
rb
*
(
m1
+
m2
)
/
totalMass
;
rb
-=
ra
;
float
sinphi
=
za1d
/
ra
;
float
cosphi
=
sqrtf
(
1.0
f
-
sinphi
*
sinphi
);
float
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2
*
rc
*
cosphi
);
float
cospsi
=
sqrtf
(
1.0
f
-
sinpsi
*
sinpsi
);
float
ya2d
=
ra
*
cosphi
;
float
xb2d
=
-
rc
*
cospsi
;
float
yb2d
=
-
rb
*
cosphi
-
rc
*
sinpsi
*
sinphi
;
float
yc2d
=
-
rb
*
cosphi
+
rc
*
sinpsi
*
sinphi
;
float
xb2d2
=
xb2d
*
xb2d
;
float
hh2
=
4.0
f
*
xb2d2
+
(
yb2d
-
yc2d
)
*
(
yb2d
-
yc2d
)
+
(
zb1d
-
zc1d
)
*
(
zb1d
-
zc1d
);
float
deltx
=
2.0
f
*
xb2d
+
sqrtf
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
xb2d
-=
deltx
*
0.5
f
;
// --- Step3 al,be,ga ---
float
alpa
=
(
xb2d
*
(
xb0d
-
xc0d
)
+
yb0d
*
yb2d
+
yc0d
*
yc2d
);
float
beta
=
(
xb2d
*
(
yc0d
-
yb0d
)
+
xb0d
*
yb2d
+
xc0d
*
yc2d
);
float
gama
=
xb0d
*
yb1d
-
xb1d
*
yb0d
+
xc0d
*
yc1d
-
xc1d
*
yc0d
;
float
al2be2
=
alpa
*
alpa
+
beta
*
beta
;
float
sinthe
=
(
alpa
*
gama
-
beta
*
sqrtf
(
al2be2
-
gama
*
gama
)
)
/
al2be2
;
// --- Step4 A3' ---
float
costhe
=
sqrtf
(
1.0
f
-
sinthe
*
sinthe
);
float
xa3d
=
-
ya2d
*
sinthe
;
float
ya3d
=
ya2d
*
costhe
;
float
za3d
=
za1d
;
float
xb3d
=
xb2d
*
costhe
-
yb2d
*
sinthe
;
float
yb3d
=
xb2d
*
sinthe
+
yb2d
*
costhe
;
float
zb3d
=
zb1d
;
float
xc3d
=
-
xb2d
*
costhe
-
yc2d
*
sinthe
;
float
yc3d
=
-
xb2d
*
sinthe
+
yc2d
*
costhe
;
float
zc3d
=
zc1d
;
// --- Step5 A3 ---
float
xa3
=
trns11
*
xa3d
+
trns12
*
ya3d
+
trns13
*
za3d
;
float
ya3
=
trns21
*
xa3d
+
trns22
*
ya3d
+
trns23
*
za3d
;
float
za3
=
trns31
*
xa3d
+
trns32
*
ya3d
+
trns33
*
za3d
;
float
xb3
=
trns11
*
xb3d
+
trns12
*
yb3d
+
trns13
*
zb3d
;
float
yb3
=
trns21
*
xb3d
+
trns22
*
yb3d
+
trns23
*
zb3d
;
float
zb3
=
trns31
*
xb3d
+
trns32
*
yb3d
+
trns33
*
zb3d
;
float
xc3
=
trns11
*
xc3d
+
trns12
*
yc3d
+
trns13
*
zc3d
;
float
yc3
=
trns21
*
xc3d
+
trns22
*
yc3d
+
trns23
*
zc3d
;
float
zc3
=
trns31
*
xc3d
+
trns32
*
yc3d
+
trns33
*
zc3d
;
xp0
.
x
=
xcom
+
xa3
-
apos0
.
x
;
xp0
.
y
=
ycom
+
ya3
-
apos0
.
y
;
xp0
.
z
=
zcom
+
za3
-
apos0
.
z
;
xp1
.
x
=
xcom
+
xb3
-
apos1
.
x
;
xp1
.
y
=
ycom
+
yb3
-
apos1
.
y
;
xp1
.
z
=
zcom
+
zb3
-
apos1
.
z
;
xp2
.
x
=
xcom
+
xc3
-
apos2
.
x
;
xp2
.
y
=
ycom
+
yc3
-
apos2
.
y
;
xp2
.
z
=
zcom
+
zc3
-
apos2
.
z
;
cSim
.
pPosqP
[
atomID
.
x
]
=
xp0
;
cSim
.
pPosqP
[
atomID
.
y
]
=
xp1
;
cSim
.
pPosqP
[
atomID
.
z
]
=
xp2
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
void
kApplySettle
(
gpuContext
gpu
)
{
// printf("kApplySettle\n");
if
(
gpu
->
sim
.
settleConstraints
>
0
)
{
kApplySettle_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
settle_threads_per_block
>>>
();
LAUNCHERROR
(
"kApplySettle"
);
}
}
platforms/cuda-old/src/kernels/kShakeH.cu
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
using
namespace
std
;
#include "gputypes.h"
struct
Atom
{
float3
rij1
;
float3
rij2
;
float3
rij3
;
float
M
;
float
d2
;
float
InvMassI
;
float
rij1sq
;
float
rij2sq
;
float
rij3sq
;
};
static
__constant__
cudaGmxSimulation
cSim
;
void
SetShakeHSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetShakeHSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_SHAKE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_SHAKE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_SHAKE_THREADS_PER_BLOCK
,
1
)
#endif
void
kApplyShake_kernel
()
{
extern
__shared__
Atom
sA
[];
Atom
*
psA
=
&
sA
[
threadIdx
.
x
];
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
while
(
pos
<
cSim
.
ShakeConstraints
)
{
int4
atomID
=
cSim
.
pShakeID
[
pos
];
float4
params
=
cSim
.
pShakeParameter
[
pos
];
float4
apos
=
cSim
.
pOldPosq
[
atomID
.
x
];
float4
xpi
=
cSim
.
pPosqP
[
atomID
.
x
];
float4
apos1
=
cSim
.
pOldPosq
[
atomID
.
y
];
float4
xpj1
=
cSim
.
pPosqP
[
atomID
.
y
];
float4
apos2
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
float4
xpj2
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
psA
->
InvMassI
=
params
.
x
;
psA
->
M
=
params
.
y
;
psA
->
d2
=
params
.
z
;
float
invMassJ
=
params
.
w
;
if
(
atomID
.
z
!=
-
1
)
{
apos2
=
cSim
.
pOldPosq
[
atomID
.
z
];
xpj2
=
cSim
.
pPosqP
[
atomID
.
z
];
}
float4
apos3
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
float4
xpj3
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atomID
.
w
!=
-
1
)
{
apos3
=
cSim
.
pOldPosq
[
atomID
.
w
];
xpj3
=
cSim
.
pPosqP
[
atomID
.
w
];
}
float3
xi
,
xj1
,
xj2
,
xj3
;
xi
.
x
=
apos
.
x
;
xi
.
y
=
apos
.
y
;
xi
.
z
=
apos
.
z
;
xj1
.
x
=
apos1
.
x
;
xj1
.
y
=
apos1
.
y
;
xj1
.
z
=
apos1
.
z
;
xj2
.
x
=
apos2
.
x
;
xj2
.
y
=
apos2
.
y
;
xj2
.
z
=
apos2
.
z
;
xj3
.
x
=
apos3
.
x
;
xj3
.
y
=
apos3
.
y
;
xj3
.
z
=
apos3
.
z
;
psA
->
rij1
.
x
=
xi
.
x
-
xj1
.
x
;
psA
->
rij1
.
y
=
xi
.
y
-
xj1
.
y
;
psA
->
rij1
.
z
=
xi
.
z
-
xj1
.
z
;
psA
->
rij2
.
x
=
xi
.
x
-
xj2
.
x
;
psA
->
rij2
.
y
=
xi
.
y
-
xj2
.
y
;
psA
->
rij2
.
z
=
xi
.
z
-
xj2
.
z
;
psA
->
rij3
.
x
=
xi
.
x
-
xj3
.
x
;
psA
->
rij3
.
y
=
xi
.
y
-
xj3
.
y
;
psA
->
rij3
.
z
=
xi
.
z
-
xj3
.
z
;
psA
->
rij1sq
=
psA
->
rij1
.
x
*
psA
->
rij1
.
x
+
psA
->
rij1
.
y
*
psA
->
rij1
.
y
+
psA
->
rij1
.
z
*
psA
->
rij1
.
z
;
psA
->
rij2sq
=
psA
->
rij2
.
x
*
psA
->
rij2
.
x
+
psA
->
rij2
.
y
*
psA
->
rij2
.
y
+
psA
->
rij2
.
z
*
psA
->
rij2
.
z
;
psA
->
rij3sq
=
psA
->
rij3
.
x
*
psA
->
rij3
.
x
+
psA
->
rij3
.
y
*
psA
->
rij3
.
y
+
psA
->
rij3
.
z
*
psA
->
rij3
.
z
;
float
ld1
=
psA
->
d2
-
psA
->
rij1sq
;
float
ld2
=
psA
->
d2
-
psA
->
rij2sq
;
float
ld3
=
psA
->
d2
-
psA
->
rij3sq
;
bool
converged
=
false
;
int
iteration
=
0
;
while
(
iteration
<
15
&&
!
converged
)
{
converged
=
true
;
float3
rpij
;
rpij
.
x
=
xpi
.
x
-
xpj1
.
x
;
rpij
.
y
=
xpi
.
y
-
xpj1
.
y
;
rpij
.
z
=
xpi
.
z
-
xpj1
.
z
;
float
rpsqij
=
rpij
.
x
*
rpij
.
x
+
rpij
.
y
*
rpij
.
y
+
rpij
.
z
*
rpij
.
z
;
float
rrpr
=
psA
->
rij1
.
x
*
rpij
.
x
+
psA
->
rij1
.
y
*
rpij
.
y
+
psA
->
rij1
.
z
*
rpij
.
z
;
float
diff
=
fabs
(
ld1
-
2.0
f
*
rrpr
-
rpsqij
)
/
(
psA
->
d2
*
cSim
.
shakeTolerance
);
if
(
diff
>=
1.0
f
)
{
float
acor
=
(
ld1
-
2.0
f
*
rrpr
-
rpsqij
)
*
psA
->
M
/
(
rrpr
+
psA
->
rij1sq
);
float3
dr
;
dr
.
x
=
psA
->
rij1
.
x
*
acor
;
dr
.
y
=
psA
->
rij1
.
y
*
acor
;
dr
.
z
=
psA
->
rij1
.
z
*
acor
;
xpi
.
x
+=
dr
.
x
*
psA
->
InvMassI
;
xpi
.
y
+=
dr
.
y
*
psA
->
InvMassI
;
xpi
.
z
+=
dr
.
z
*
psA
->
InvMassI
;
xpj1
.
x
-=
dr
.
x
*
invMassJ
;
xpj1
.
y
-=
dr
.
y
*
invMassJ
;
xpj1
.
z
-=
dr
.
z
*
invMassJ
;
converged
=
false
;
}
if
(
atomID
.
z
!=
-
1
)
{
rpij
.
x
=
xpi
.
x
-
xpj2
.
x
;
rpij
.
y
=
xpi
.
y
-
xpj2
.
y
;
rpij
.
z
=
xpi
.
z
-
xpj2
.
z
;
rpsqij
=
rpij
.
x
*
rpij
.
x
+
rpij
.
y
*
rpij
.
y
+
rpij
.
z
*
rpij
.
z
;
rrpr
=
psA
->
rij2
.
x
*
rpij
.
x
+
psA
->
rij2
.
y
*
rpij
.
y
+
psA
->
rij2
.
z
*
rpij
.
z
;
diff
=
fabs
(
ld2
-
2.0
f
*
rrpr
-
rpsqij
)
/
(
psA
->
d2
*
cSim
.
shakeTolerance
);
if
(
diff
>=
1.0
f
)
{
float
acor
=
(
ld2
-
2.0
f
*
rrpr
-
rpsqij
)
*
psA
->
M
/
(
rrpr
+
psA
->
rij2sq
);
float3
dr
;
dr
.
x
=
psA
->
rij2
.
x
*
acor
;
dr
.
y
=
psA
->
rij2
.
y
*
acor
;
dr
.
z
=
psA
->
rij2
.
z
*
acor
;
xpi
.
x
+=
dr
.
x
*
psA
->
InvMassI
;
xpi
.
y
+=
dr
.
y
*
psA
->
InvMassI
;
xpi
.
z
+=
dr
.
z
*
psA
->
InvMassI
;
xpj2
.
x
-=
dr
.
x
*
invMassJ
;
xpj2
.
y
-=
dr
.
y
*
invMassJ
;
xpj2
.
z
-=
dr
.
z
*
invMassJ
;
converged
=
false
;
}
}
if
(
atomID
.
w
!=
-
1
)
{
rpij
.
x
=
xpi
.
x
-
xpj3
.
x
;
rpij
.
y
=
xpi
.
y
-
xpj3
.
y
;
rpij
.
z
=
xpi
.
z
-
xpj3
.
z
;
rpsqij
=
rpij
.
x
*
rpij
.
x
+
rpij
.
y
*
rpij
.
y
+
rpij
.
z
*
rpij
.
z
;
rrpr
=
psA
->
rij3
.
x
*
rpij
.
x
+
psA
->
rij3
.
y
*
rpij
.
y
+
psA
->
rij3
.
z
*
rpij
.
z
;
diff
=
fabs
(
ld3
-
2.0
f
*
rrpr
-
rpsqij
)
/
(
psA
->
d2
*
cSim
.
shakeTolerance
);
if
(
diff
>=
1.0
f
)
{
float
acor
=
(
ld3
-
2.0
f
*
rrpr
-
rpsqij
)
*
psA
->
M
/
(
rrpr
+
psA
->
rij3sq
);
float3
dr
;
dr
.
x
=
psA
->
rij3
.
x
*
acor
;
dr
.
y
=
psA
->
rij3
.
y
*
acor
;
dr
.
z
=
psA
->
rij3
.
z
*
acor
;
xpi
.
x
+=
dr
.
x
*
psA
->
InvMassI
;
xpi
.
y
+=
dr
.
y
*
psA
->
InvMassI
;
xpi
.
z
+=
dr
.
z
*
psA
->
InvMassI
;
xpj3
.
x
-=
dr
.
x
*
invMassJ
;
xpj3
.
y
-=
dr
.
y
*
invMassJ
;
xpj3
.
z
-=
dr
.
z
*
invMassJ
;
converged
=
false
;
}
}
iteration
++
;
}
cSim
.
pPosqP
[
atomID
.
x
]
=
xpi
;
cSim
.
pPosqP
[
atomID
.
y
]
=
xpj1
;
if
(
atomID
.
z
!=
-
1
)
cSim
.
pPosqP
[
atomID
.
z
]
=
xpj2
;
if
(
atomID
.
w
!=
-
1
)
cSim
.
pPosqP
[
atomID
.
w
]
=
xpj3
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
void
kApplyShake
(
gpuContext
gpu
)
{
// printf("kApplyShake\n");
if
(
gpu
->
sim
.
ShakeConstraints
>
0
)
{
kApplyShake_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
shake_threads_per_block
,
sizeof
(
Atom
)
*
gpu
->
sim
.
shake_threads_per_block
>>>
();
LAUNCHERROR
(
"kApplyShake"
);
}
}
platforms/cuda-old/src/kernels/kVerletUpdate.cu
deleted
100755 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
using
namespace
std
;
#include "gputypes.h"
static
__constant__
cudaGmxSimulation
cSim
;
void
SetVerletUpdateSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
}
void
GetVerletUpdateSim
(
gpuContext
gpu
)
{
cudaError_t
status
;
status
=
cudaMemcpyFromSymbol
(
&
gpu
->
sim
,
cSim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"cudaMemcpyFromSymbol: SetSim copy from cSim failed"
);
}
// Include versions of the kernels with and with center of mass motion removal.
#include "kVerletUpdate.h"
#define REMOVE_CM
#include "kVerletUpdate.h"
void
kVerletUpdatePart1
(
gpuContext
gpu
)
{
// printf("kVerletUpdatePart1\n");
if
(
gpu
->
bRemoveCM
)
{
kVerletUpdatePart1CM_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
,
gpu
->
sim
.
update_threads_per_block
*
sizeof
(
float3
)
>>>
();
LAUNCHERROR
(
"kVerletUpdatePart1CM"
);
gpu
->
bRemoveCM
=
false
;
}
else
{
kVerletUpdatePart1_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kVerletUpdatePart1"
);
}
}
void
kVerletUpdatePart2
(
gpuContext
gpu
)
{
// printf("kVerletUpdatePart2\n");
if
(
gpu
->
bCalculateCM
)
{
kVerletUpdatePart2CM_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
,
gpu
->
sim
.
update_threads_per_block
*
sizeof
(
float3
)
>>>
();
LAUNCHERROR
(
"kVerletUpdatePart2CM"
);
gpu
->
bCalculateCM
=
false
;
gpu
->
bRemoveCM
=
true
;
}
else
{
kVerletUpdatePart2_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kVerletUpdatePart2"
);
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
void
kSelectVerletStepSize_kernel
(
float
maxStepSize
)
{
// Calculate the error.
extern
__shared__
float
error
[];
error
[
threadIdx
.
x
]
=
0.0
f
;
unsigned
int
pos
=
threadIdx
.
x
;
while
(
pos
<
cSim
.
atoms
)
{
float4
force
=
cSim
.
pForce4
[
pos
];
float
invMass
=
cSim
.
pVelm4
[
pos
].
w
;
error
[
threadIdx
.
x
]
+=
(
force
.
x
*
force
.
x
+
force
.
y
*
force
.
y
+
force
.
z
*
force
.
z
)
*
invMass
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
__syncthreads
();
// Sum the errors from all threads.
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
if
(
threadIdx
.
x
+
offset
<
blockDim
.
x
&&
(
threadIdx
.
x
&
(
2
*
offset
-
1
))
==
0
)
error
[
threadIdx
.
x
]
+=
error
[
threadIdx
.
x
+
offset
];
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
{
float
totalError
=
sqrt
(
error
[
0
]
/
(
cSim
.
atoms
*
3
));
float
newStepSize
=
sqrt
(
cSim
.
errorTol
/
totalError
);
float
oldStepSize
=
cSim
.
pStepSize
[
0
].
y
;
if
(
oldStepSize
>
0.0
f
)
newStepSize
=
min
(
newStepSize
,
oldStepSize
*
2.0
f
);
// For safety, limit how quickly dt can increase.
if
(
newStepSize
>
oldStepSize
&&
newStepSize
<
1.1
f
*
oldStepSize
)
newStepSize
=
oldStepSize
;
// Keeping dt constant between steps improves the behavior of the integrator.
if
(
newStepSize
>
maxStepSize
)
newStepSize
=
maxStepSize
;
cSim
.
pStepSize
[
0
].
y
=
newStepSize
;
}
}
void
kSelectVerletStepSize
(
gpuContext
gpu
,
float
maxTimeStep
)
{
// printf("kSelectVerletStepSize\n");
kSelectVerletStepSize_kernel
<<<
1
,
gpu
->
sim
.
update_threads_per_block
,
sizeof
(
float
)
*
gpu
->
sim
.
update_threads_per_block
>>>
(
maxTimeStep
);
LAUNCHERROR
(
"kSelectVerletStepSize"
);
}
platforms/cuda-old/src/kernels/kVerletUpdate.h
deleted
100644 → 0
View file @
352e2fc7
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for Verlet integration. It is included
* several times in kVerletUpdate.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
#ifdef REMOVE_CM
void
kVerletUpdatePart1CM_kernel
()
#else
void
kVerletUpdatePart1_kernel
()
#endif
{
// Load the step size to take.
__shared__
volatile
float
dtPos
;
__shared__
volatile
float
dtVel
;
if
(
threadIdx
.
x
==
0
)
{
float2
stepSize
=
cSim
.
pStepSize
[
0
];
dtPos
=
stepSize
.
y
;
dtVel
=
0
.
5
f
*
(
stepSize
.
x
+
stepSize
.
y
);
}
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
#ifdef REMOVE_CM
extern
__shared__
float3
sCM
[];
float3
CM
=
{
0
.
0
f
,
0
.
0
f
,
0
.
0
f
};
float4
CM1
=
{
0
.
0
f
,
0
.
0
f
,
0
.
0
f
,
0
.
0
f
};
// Read CM outputs from previous step
unsigned
int
cpos
=
threadIdx
.
x
;
while
(
cpos
<
gridDim
.
x
)
{
CM1
=
cSim
.
pLinearMomentum
[
cpos
];
CM
.
x
+=
CM1
.
x
;
CM
.
y
+=
CM1
.
y
;
CM
.
z
+=
CM1
.
z
;
cpos
+=
blockDim
.
x
;
}
sCM
[
threadIdx
.
x
].
x
=
CM
.
x
;
sCM
[
threadIdx
.
x
].
y
=
CM
.
y
;
sCM
[
threadIdx
.
x
].
z
=
CM
.
z
;
__syncthreads
();
// Reduce CM
unsigned
int
offset
=
1
;
unsigned
int
mask
=
1
;
while
(
offset
<
blockDim
.
x
)
{
if
(((
threadIdx
.
x
&
mask
)
==
0
)
&&
(
threadIdx
.
x
+
offset
<
blockDim
.
x
))
{
sCM
[
threadIdx
.
x
].
x
+=
sCM
[
threadIdx
.
x
+
offset
].
x
;
sCM
[
threadIdx
.
x
].
y
+=
sCM
[
threadIdx
.
x
+
offset
].
y
;
sCM
[
threadIdx
.
x
].
z
+=
sCM
[
threadIdx
.
x
+
offset
].
z
;
}
mask
=
2
*
mask
+
1
;
offset
*=
2
;
__syncthreads
();
}
#else
__syncthreads
();
#endif
while
(
pos
<
cSim
.
atoms
)
{
float4
apos
=
cSim
.
pPosq
[
pos
];
float4
velocity
=
cSim
.
pVelm4
[
pos
];
float4
force
=
cSim
.
pForce4
[
pos
];
float
dtOverMass
=
dtVel
*
velocity
.
w
;
cSim
.
pOldPosq
[
pos
]
=
apos
;
velocity
.
x
+=
dtOverMass
*
force
.
x
;
velocity
.
y
+=
dtOverMass
*
force
.
y
;
velocity
.
z
+=
dtOverMass
*
force
.
z
;
#ifdef REMOVE_CM
velocity
.
x
-=
sCM
[
0
].
x
;
velocity
.
y
-=
sCM
[
0
].
y
;
velocity
.
z
-=
sCM
[
0
].
z
;
#endif
apos
.
x
=
velocity
.
x
*
dtPos
;
apos
.
y
=
velocity
.
y
*
dtPos
;
apos
.
z
=
velocity
.
z
*
dtPos
;
cSim
.
pPosqP
[
pos
]
=
apos
;
cSim
.
pVelm4
[
pos
]
=
velocity
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
GT2XX_UPDATE_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_UPDATE_THREADS_PER_BLOCK
,
1
)
#endif
#ifdef REMOVE_CM
void
kVerletUpdatePart2CM_kernel
()
#else
void
kVerletUpdatePart2_kernel
()
#endif
{
// Load the step size to take.
unsigned
int
pos
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
__shared__
double
oneOverDeltaT
;
if
(
threadIdx
.
x
==
0
)
{
float
dt
=
cSim
.
pStepSize
[
0
].
y
;
oneOverDeltaT
=
1
.
0
/
dt
;
if
(
pos
==
0
)
cSim
.
pStepSize
[
0
].
x
=
dt
;
}
__syncthreads
();
#ifdef REMOVE_CM
extern
__shared__
float3
sCM
[];
float3
CM
=
{
0
.
0
f
,
0
.
0
f
,
0
.
0
f
};
#endif
while
(
pos
<
cSim
.
atoms
)
{
float4
velocity
=
cSim
.
pVelm4
[
pos
];
float4
apos
=
cSim
.
pPosq
[
pos
];
float4
xPrime
=
cSim
.
pPosqP
[
pos
];
velocity
.
x
=
(
float
)
(
oneOverDeltaT
*
(
double
)
xPrime
.
x
);
velocity
.
y
=
(
float
)
(
oneOverDeltaT
*
(
double
)
xPrime
.
y
);
velocity
.
z
=
(
float
)
(
oneOverDeltaT
*
(
double
)
xPrime
.
z
);
xPrime
.
x
+=
apos
.
x
;
xPrime
.
y
+=
apos
.
y
;
xPrime
.
z
+=
apos
.
z
;
#ifdef REMOVE_CM
float
mass
=
1
.
0
f
/
velocity
.
w
;
CM
.
x
+=
mass
*
velocity
.
x
;
CM
.
y
+=
mass
*
velocity
.
y
;
CM
.
z
+=
mass
*
velocity
.
z
;
#endif
cSim
.
pPosq
[
pos
]
=
xPrime
;
cSim
.
pVelm4
[
pos
]
=
velocity
;
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
#ifdef REMOVE_CM
// Scale CM
CM
.
x
*=
cSim
.
inverseTotalMass
;
CM
.
y
*=
cSim
.
inverseTotalMass
;
CM
.
z
*=
cSim
.
inverseTotalMass
;
sCM
[
threadIdx
.
x
]
=
CM
;
__syncthreads
();
// Reduce CM for CTA
unsigned
int
offset
=
1
;
unsigned
int
mask
=
1
;
while
(
offset
<
blockDim
.
x
)
{
if
(((
threadIdx
.
x
&
mask
)
==
0
)
&&
(
threadIdx
.
x
+
offset
<
blockDim
.
x
))
{
sCM
[
threadIdx
.
x
].
x
+=
sCM
[
threadIdx
.
x
+
offset
].
x
;
sCM
[
threadIdx
.
x
].
y
+=
sCM
[
threadIdx
.
x
+
offset
].
y
;
sCM
[
threadIdx
.
x
].
z
+=
sCM
[
threadIdx
.
x
+
offset
].
z
;
}
mask
=
2
*
mask
+
1
;
offset
*=
2
;
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
{
float4
CM
;
CM
.
x
=
sCM
[
0
].
x
;
CM
.
y
=
sCM
[
0
].
y
;
CM
.
z
=
sCM
[
0
].
z
;
CM
.
w
=
0
.
0
f
;
cSim
.
pLinearMomentum
[
blockIdx
.
x
]
=
CM
;
}
#endif
}
platforms/cuda-old/src/kernels/rng.cpp
deleted
100644 → 0
View file @
352e2fc7
//________________________________________________________________________
// See the header file rng.h for a description of the contents of this
// file as well as references and credits.
#include "rng.h"
#include <cmath>
#include <iostream>
inline
ulong32
ULONG32
(
slong
x
)
{
return
(
ulong32
(
x
));
}
inline
ulong32
ULONG32
(
ulong32
x
)
{
return
(
ulong32
(
x
));
}
inline
ulong32
ULONG32
(
double
x
)
{
return
(
ulong32
(
x
));
}
static
const
double
PI
=
3.1415926535897932
;
//________________________________________________________________________
// Initialize the static component of RNG
ulong32
RNG
::
tm
=
1234567
;
ulong32
RNG
::
kn
[
128
],
RNG
::
ke
[
256
];
double
RNG
::
wn
[
128
],
RNG
::
fn
[
128
],
RNG
::
we
[
256
],
RNG
::
fe
[
256
];
//________________________________________________________________________
// RNG::RNOR generates normal variates with rejection.
// nfix() generates variates after rejection in RNOR.
// Despite rejection, this method is faster than Box-Muller.
double
RNG
::
nfix
(
slong
h
,
ulong32
i
)
{
const
double
r
=
3.442620
;
// The starting of the right tail
double
x
,
y
;
for
(;;)
{
x
=
h
*
wn
[
i
];
// If i == 0, handle the base strip
if
(
i
==
0
)
{
do
{
x
=
-
log
(
rand_open01
())
*
0.2904764
;
// .2904764 is 1/r
y
=
-
log
(
rand_open01
());
}
while
(
y
+
y
<
x
*
x
);
return
((
h
>
0
)
?
r
+
x
:
-
r
-
x
);
}
// If i > 0, handle the wedges of other strips
if
(
fn
[
i
]
+
rand_open01
()
*
(
fn
[
i
-
1
]
-
fn
[
i
])
<
exp
(
-
.5
*
x
*
x
))
return
x
;
// start all over
h
=
UL32toSL32
(
rand_int32
());
i
=
h
&
127
;
if
(
ULONG32
(
std
::
abs
(
h
))
<
kn
[
i
])
return
(
h
*
wn
[
i
]);
}
}
// RNG::nfix
// __________________________________________________________________________
// RNG::REXP generates exponential variates with rejection.
// efix() generates variates after rejection in REXP.
double
RNG
::
efix
(
ulong32
j
,
ulong32
i
)
{
for
(;;)
{
if
(
i
==
0
)
return
(
7.69711
-
log
(
rand_open01
()));
const
double
x
=
j
*
we
[
i
];
if
(
fe
[
i
]
+
rand_open01
()
*
(
fe
[
i
-
1
]
-
fe
[
i
])
<
exp
(
-
x
))
return
x
;
j
=
rand_int32
();
i
=
(
j
&
255
);
if
(
j
<
ke
[
i
])
return
(
j
*
we
[
i
]);
}
}
// RNG::efix
// __________________________________________________________________________
// This procedure creates the tables used by RNOR and REXP
void
RNG
::
zigset
()
{
static
bool
inited
=
0
;
if
(
inited
)
return
;
inited
=
1
;
// Set up tables for RNOR
const
double
m1
=
2147483648.0
;
// 2^31
const
double
vn
=
9.91256303526217e-3
;
double
tn
=
3.442619855899
;
double
q
=
vn
/
exp
(
-
.5
*
tn
*
tn
);
kn
[
0
]
=
ULONG32
((
tn
/
q
)
*
m1
);
kn
[
1
]
=
0
;
wn
[
0
]
=
q
/
m1
;
wn
[
127
]
=
tn
/
m1
;
fn
[
0
]
=
1.
;
fn
[
127
]
=
exp
(
-
.5
*
tn
*
tn
);
for
(
uint
i
=
126
;
i
>
0
;
i
--
)
{
const
double
dn
=
sqrtf
(
-
2
*
log
(
vn
/
tn
+
exp
(
-
.5
*
tn
*
tn
)));
kn
[
i
+
1
]
=
ULONG32
((
dn
/
tn
)
*
m1
);
fn
[
i
]
=
exp
(
-
.5
*
dn
*
dn
);
wn
[
i
]
=
dn
/
m1
;
tn
=
dn
;
}
// Set up tables for REXP
const
double
m2
=
4294967296.0
;
// 2^32
const
double
ve
=
3.949659822581572e-3
;
double
te
=
7.697117470131487
;
q
=
ve
/
exp
(
-
te
);
ke
[
0
]
=
ULONG32
((
te
/
q
)
*
m2
);
ke
[
1
]
=
0
;
we
[
0
]
=
q
/
m2
;
we
[
255
]
=
te
/
m2
;
fe
[
0
]
=
1.
;
fe
[
255
]
=
exp
(
-
te
);
for
(
uint
i
=
254
;
i
>
0
;
i
--
)
{
const
double
de
=
-
log
(
ve
/
te
+
exp
(
-
te
));
ke
[
i
+
1
]
=
ULONG32
((
de
/
te
)
*
m2
);
fe
[
i
]
=
exp
(
-
de
);
we
[
i
]
=
de
/
m2
;
te
=
de
;
}
}
// RNG::zigset
// __________________________________________________________________________
// Generate a gamma variate with parameters 'shape' and 'scale'
double
RNG
::
gamma
(
double
shape
,
double
scale
)
{
if
(
shape
<
1.
)
return
gamma
(
shape
+
1.
,
scale
)
*
pow
(
rand_open01
(),
1.0
/
shape
);
const
double
d
=
shape
-
1.
/
3.
;
const
double
c
=
1.
/
sqrtf
(
9.
*
d
);
double
x
,
v
;
for
(;;)
{
do
{
x
=
RNOR
();
v
=
1.0
+
c
*
x
;
}
while
(
v
<=
0.0
);
v
=
v
*
v
*
v
;
const
double
u
=
rand_open01
();
const
double
x2
=
x
*
x
;
if
(
u
<
1.0
-
0.0331
*
x2
*
x2
)
return
(
d
*
v
/
scale
);
if
(
log
(
u
)
<
0.5
*
x2
+
d
*
(
1.0
-
v
+
log
(
v
)))
return
(
d
*
v
/
scale
);
}
}
// RNG::gamma
// __________________________________________________________________________
// Generate a Poisson variate
// Code essentially copied from R source that is essentially
// ACM Algorithm 599 KPOISS converted to C.
int
RNG
::
poisson
(
double
mu
)
{
const
double
a0
=-
0.5
,
a1
=
0.3333333
,
a2
=-
0.2500068
,
a3
=
0.2000118
,
a4
=-
0.1661269
,
a5
=
0.1421878
,
a6
=-
0.1384794
,
a7
=
0.1250060
;
const
double
one_7
=
1.0
/
7.0
,
one_12
=
1.0
/
12.0
,
one_24
=
1.0
/
24.0
;
const
double
fact
[
10
]
=
{
1.
,
1.
,
2.
,
6.
,
24.
,
120.
,
720.
,
5040.
,
40320.
,
362880.
};
static
int
l
,
m
;
static
double
b1
,
b2
,
c
,
c0
,
c1
,
c2
,
c3
;
static
double
pp
[
36
],
p0
,
p
,
q
,
s
,
d
,
omega
;
static
double
big_l
;
/* integer "w/o overflow" */
static
double
muprev
=
0.
,
muprev2
=
0.
;
/*, muold = 0.*/
double
del
,
difmuk
=
0.
,
E
=
0.
,
fk
=
0.
,
fx
,
fy
,
g
,
px
,
py
,
t
,
u
=
0.
,
v
,
x
;
int
pois
=
-
1
;
int
k
,
kflag
,
big_mu
,
new_big_mu
=
0
;
if
(
mu
<=
0.
)
return
0
;
big_mu
=
mu
>=
10.
;
if
(
big_mu
)
new_big_mu
=
0
;
if
(
!
(
big_mu
&&
mu
==
muprev
))
{
if
(
big_mu
)
{
new_big_mu
=
1
;
muprev
=
mu
;
s
=
sqrtf
(
mu
);
d
=
6.
*
mu
*
mu
;
big_l
=
floor
(
mu
-
1.1484
);
}
else
{
if
(
mu
!=
muprev
)
{
muprev
=
mu
;
m
=
(
int
)
((
1.0
<
mu
)
?
mu
:
1.0
);
l
=
0
;
/* pp[] is already ok up to pp[l] */
q
=
p0
=
p
=
exp
(
-
mu
);
}
for
(;;)
{
u
=
rand_open01
();
if
(
u
<=
p0
)
return
0
;
if
(
l
!=
0
)
{
for
(
k
=
(
u
<=
0.458
)
?
1
:
((
l
<
m
)
?
l
:
m
);
k
<=
l
;
k
++
)
if
(
u
<=
pp
[
k
])
return
k
;
if
(
l
==
35
)
/* u > pp[35] */
continue
;
}
l
++
;
for
(
k
=
l
;
k
<=
35
;
k
++
)
{
p
*=
mu
/
k
;
q
+=
p
;
pp
[
k
]
=
q
;
if
(
u
<=
q
)
{
l
=
k
;
return
k
;
}
}
l
=
35
;
}
}
}
g
=
mu
+
s
*
RNOR
();
if
(
g
>=
0.
)
{
pois
=
int
(
g
);
if
(
pois
>=
big_l
)
return
pois
;
fk
=
pois
;
difmuk
=
mu
-
fk
;
u
=
rand_open01
();
if
(
d
*
u
>=
difmuk
*
difmuk
*
difmuk
)
return
pois
;
}
if
(
new_big_mu
||
mu
!=
muprev2
)
{
muprev2
=
mu
;
omega
=
1.0
/
(
sqrtf
(
2.0
*
PI
)
*
s
);
b1
=
one_24
/
mu
;
b2
=
0.3
*
b1
*
b1
;
c3
=
one_7
*
b1
*
b2
;
c2
=
b2
-
15.
*
c3
;
c1
=
b1
-
6.
*
b2
+
45.
*
c3
;
c0
=
1.
-
b1
+
3.
*
b2
-
15.
*
c3
;
c
=
0.1069
/
mu
;
}
if
(
g
>=
0.
)
{
kflag
=
0
;
goto
Step_F
;
}
for
(;;)
{
E
=
REXP
();
u
=
2
*
rand_open01
()
-
1.
;
t
=
1.8
+
((
u
>
0
)
?
std
::
abs
(
E
)
:
-
std
::
abs
(
E
));
if
(
t
>
-
0.6744
)
{
pois
=
int
(
mu
+
s
*
t
);
fk
=
pois
;
difmuk
=
mu
-
fk
;
kflag
=
1
;
Step_F:
if
(
pois
<
10
)
{
px
=
-
mu
;
py
=
pow
(
mu
,
pois
)
/
fact
[
pois
];
}
else
{
del
=
one_12
/
fk
;
del
=
del
*
(
1.
-
4.8
*
del
*
del
);
v
=
difmuk
/
fk
;
if
(
std
::
abs
(
v
)
<=
0.25
)
px
=
fk
*
v
*
v
*
(((((((
a7
*
v
+
a6
)
*
v
+
a5
)
*
v
+
a4
)
*
v
+
a3
)
*
v
+
a2
)
*
v
+
a1
)
*
v
+
a0
)
-
del
;
else
px
=
fk
*
log
(
1.
+
v
)
-
difmuk
-
del
;
py
=
1.0
/
(
sqrtf
(
2.0
*
PI
)
*
sqrtf
(
fk
));
}
x
=
(
0.5
-
difmuk
)
/
s
;
x
*=
x
;
/* x^2 */
fx
=
-
0.5
*
x
;
fy
=
omega
*
(((
c3
*
x
+
c2
)
*
x
+
c1
)
*
x
+
c0
);
if
(
kflag
>
0
)
{
if
(
c
*
std
::
abs
(
u
)
<=
py
*
exp
(
px
+
E
)
-
fy
*
exp
(
fx
+
E
))
break
;
}
else
if
(
fy
-
u
*
fy
<=
py
*
exp
(
px
-
fx
))
break
;
}
}
return
pois
;
}
// RNG::poisson
// __________________________________________________________________________
// Generate a binomial variate
// Code essentially copied from R source that is essentially
// ACM Algorithm 678 BTPEC converted to C.
int
RNG
::
binomial
(
double
pp
,
int
n
)
{
static
double
c
,
fm
,
npq
,
p1
,
p2
,
p3
,
p4
,
qn
,
xl
,
xll
,
xlr
,
xm
,
xr
;
static
double
psave
=
-
1.0
;
static
int
nsave
=
-
1
,
m
=
0
;
double
f
,
x
;
if
(
n
<=
0
||
pp
<=
0.
)
return
0
;
if
(
pp
>=
1.
)
return
n
;
const
double
p
=
(
pp
<
1.
-
pp
)
?
pp
:
1.
-
pp
;
const
double
q
=
1.
-
p
;
const
double
np
=
n
*
p
;
const
double
r
=
p
/
q
;
const
double
g
=
r
*
(
n
+
1
);
if
(
pp
!=
psave
||
n
!=
nsave
)
{
psave
=
pp
;
nsave
=
n
;
if
(
np
<
30.0
)
{
qn
=
pow
(
q
,
(
double
)
n
);
goto
L_np_small
;
}
else
{
double
ffm
=
np
+
p
;
m
=
(
int
)
ffm
;
fm
=
m
;
npq
=
np
*
q
;
p1
=
(
int
)(
2.195
*
sqrtf
(
npq
)
-
4.6
*
q
)
+
0.5
;
xm
=
fm
+
0.5
;
xl
=
xm
-
p1
;
xr
=
xm
+
p1
;
c
=
0.134
+
20.5
/
(
15.3
+
fm
);
double
al
=
(
ffm
-
xl
)
/
(
ffm
-
xl
*
p
);
xll
=
al
*
(
1.0
+
0.5
*
al
);
al
=
(
xr
-
ffm
)
/
(
xr
*
q
);
xlr
=
al
*
(
1.0
+
0.5
*
al
);
p2
=
p1
*
(
1.0
+
c
+
c
);
p3
=
p2
+
c
/
xll
;
p4
=
p3
+
c
/
xlr
;
}
}
else
if
(
n
==
nsave
)
{
if
(
np
<
30.0
)
goto
L_np_small
;
}
int
i
,
ix
;
for
(;;)
{
double
u
=
rand_open01
()
*
p4
;
double
v
=
rand_open01
();
if
(
u
<=
p1
)
{
ix
=
(
int
)
(
xm
-
p1
*
v
+
u
);
goto
finish
;
}
if
(
u
<=
p2
)
{
x
=
xl
+
(
u
-
p1
)
/
c
;
v
=
v
*
c
+
1.0
-
std
::
abs
(
xm
-
x
)
/
p1
;
if
(
v
>
1.0
||
v
<=
0.
)
continue
;
ix
=
(
int
)
x
;
}
else
{
if
(
u
>
p3
)
{
ix
=
(
int
)
(
xr
-
log
(
v
)
/
xlr
);
if
(
ix
>
n
)
continue
;
v
*=
(
u
-
p3
)
*
xlr
;
}
else
{
/* left tail */
ix
=
(
int
)
(
xl
+
log
(
v
)
/
xll
);
if
(
ix
<
0
)
continue
;
v
*=
(
u
-
p2
)
*
xll
;
}
}
int
k
=
std
::
abs
(
ix
-
m
);
if
(
k
<=
20
||
k
>=
npq
/
2
-
1
)
{
f
=
1.0
;
if
(
m
<
ix
)
{
for
(
i
=
m
+
1
;
i
<=
ix
;
i
++
)
f
*=
(
g
/
i
-
r
);
}
else
if
(
m
!=
ix
)
{
for
(
i
=
ix
+
1
;
i
<=
m
;
i
++
)
f
/=
(
g
/
i
-
r
);
}
if
(
v
<=
f
)
goto
finish
;
}
else
{
double
amaxp
,
ynorm
,
alv
;
amaxp
=
(
k
/
npq
)
*
((
k
*
(
k
/
3.
+
0.625
)
+
0.1666666666666
)
/
npq
+
0.5
);
ynorm
=
-
k
*
k
/
(
2.0
*
npq
);
alv
=
log
(
v
);
if
(
alv
<
ynorm
-
amaxp
)
goto
finish
;
if
(
alv
<=
ynorm
+
amaxp
)
{
double
x1
,
f1
,
z
,
w
,
z2
,
x2
,
f2
,
w2
;
x1
=
ix
+
1
;
f1
=
fm
+
1.0
;
z
=
n
+
1
-
fm
;
w
=
n
-
ix
+
1.0
;
z2
=
z
*
z
;
x2
=
x1
*
x1
;
f2
=
f1
*
f1
;
w2
=
w
*
w
;
if
(
alv
<=
xm
*
log
(
f1
/
x1
)
+
(
n
-
m
+
0.5
)
*
log
(
z
/
w
)
+
(
ix
-
m
)
*
log
(
w
*
p
/
(
x1
*
q
))
+
(
13860.0
-
(
462.0
-
(
132.0
-
(
99.0
-
140.0
/
f2
)
/
f2
)
/
f2
)
/
f2
)
/
f1
/
166320.0
+
(
13860.0
-
(
462.0
-
(
132.0
-
(
99.0
-
140.0
/
z2
)
/
z2
)
/
z2
)
/
z2
)
/
z
/
166320.0
+
(
13860.0
-
(
462.0
-
(
132.0
-
(
99.0
-
140.0
/
x2
)
/
x2
)
/
x2
)
/
x2
)
/
x1
/
166320.0
+
(
13860.0
-
(
462.0
-
(
132.0
-
(
99.0
-
140.0
/
w2
)
/
w2
)
/
w2
)
/
w2
)
/
w
/
166320.
)
goto
finish
;
}
}
}
L_np_small:
for
(;;)
{
ix
=
0
;
f
=
qn
;
double
u
=
rand_open01
();
for
(;;)
{
if
(
u
<
f
)
goto
finish
;
if
(
ix
>
110
)
break
;
u
-=
f
;
ix
++
;
f
*=
(
g
/
ix
-
r
);
}
}
finish:
if
(
psave
>
0.5
)
ix
=
n
-
ix
;
return
ix
;
}
// RNG::binomial
// __________________________________________________________________________
// Generate a sample of size 'n' from a multinomial distribution with
// probabilities given in 'probs'. Inspired by R source code.
void
RNG
::
multinom
(
uint
n
,
const
vector
<
double
>&
probs
,
vector
<
uint
>&
samp
)
{
samp
.
resize
(
probs
.
size
());
RNG
::
multinom
(
n
,
&
probs
[
0
],
(
uint
)
probs
.
size
(),
&
samp
[
0
]);
}
void
RNG
::
multinom
(
uint
size
,
const
double
*
probs
,
uint
num_probs
,
uint
*
samp
)
{
if
(
num_probs
==
0
)
return
;
for
(
uint
i
=
0
;
i
<
num_probs
;
i
++
)
samp
[
i
]
=
0
;
if
(
size
==
0
)
return
;
vector
<
double
>
fixed_probs
(
num_probs
);
double
total_prob
=
0.
;
for
(
uint
i
=
0
;
i
<
num_probs
;
i
++
)
{
const
double
pp
=
probs
[
i
];
//if (std::isfinite(pp) && pp >= 0)
if
((
pp
==
pp
)
&&
pp
>=
0
)
total_prob
+=
(
fixed_probs
[
i
]
=
pp
);
}
if
(
total_prob
==
0.
)
return
;
for
(
uint
i
=
0
;
i
<
num_probs
-
1
;
i
++
)
{
if
(
fixed_probs
[
i
]
>
0.
)
{
samp
[
i
]
=
binomial
(
fixed_probs
[
i
]
/
total_prob
,
size
);
size
-=
samp
[
i
];
}
if
(
size
==
0
)
return
;
total_prob
-=
fixed_probs
[
i
];
}
samp
[
num_probs
-
1
]
=
size
;
}
// RNG::multinomial
// __________________________________________________________________________
// rng.C
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment