Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
38f6c8f8
"platforms/cuda/vscode:/vscode.git/clone" did not exist on "80be998eaeef192761e07d15aec7b68a7d3b2e9a"
Commit
38f6c8f8
authored
Jan 27, 2009
by
Peter Eastman
Browse files
Checked in Cuda code
parent
95d79181
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
9174 additions
and
0 deletions
+9174
-0
platforms/cuda/src/kernels/cudaKernels.h
platforms/cuda/src/kernels/cudaKernels.h
+98
-0
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+381
-0
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+2707
-0
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+277
-0
platforms/cuda/src/kernels/kBrownianUpdate.cu
platforms/cuda/src/kernels/kBrownianUpdate.cu
+152
-0
platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
+105
-0
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+388
-0
platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
+375
-0
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+454
-0
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
...forms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
+422
-0
platforms/cuda/src/kernels/kCalculateLocalForces.cu
platforms/cuda/src/kernels/kCalculateLocalForces.cu
+495
-0
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+301
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
+399
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
+225
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+362
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
+336
-0
platforms/cuda/src/kernels/kForces.cu
platforms/cuda/src/kernels/kForces.cu
+261
-0
platforms/cuda/src/kernels/kRandom.cu
platforms/cuda/src/kernels/kRandom.cu
+185
-0
platforms/cuda/src/kernels/kUpdateShakeH.cu
platforms/cuda/src/kernels/kUpdateShakeH.cu
+959
-0
platforms/cuda/src/kernels/kVerletUpdate.cu
platforms/cuda/src/kernels/kVerletUpdate.cu
+292
-0
No files found.
platforms/cuda/src/kernels/cudaKernels.h
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
// Host-side entry points implemented in the .cu kernel sources.
// Every function takes the gpuContext declared in gputypes.h and returns nothing.

// Initialization
extern void kClearForces(gpuContext gpu);
extern void kCalculateObcGbsaBornSum(gpuContext gpu);
extern void kReduceObcGbsaBornSum(gpuContext gpu);
extern void kGenerateRandoms(gpuContext gpu);

// Main loop
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCDLJForces_12(gpuContext gpu);
extern void kCalculateObcGbsaForces1(gpuContext gpu);
extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kUpdatePart1(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu);
extern void kUpdatePart2(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu);
extern void kBrownianUpdatePart1(gpuContext gpu);
extern void kBrownianUpdatePart2(gpuContext gpu);

// Extras
extern void kReduceForces(gpuContext gpu);
extern void kClearBornForces(gpuContext gpu);

// Initializers: one Set.../Get... pair per kernel source file.
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu);
extern void SetUpdateShakeHSim(gpuContext gpu);
extern void GetUpdateShakeHSim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu);
extern void GetBrownianUpdateSim(gpuContext gpu);
extern void SetRandomSim(gpuContext gpu);
extern void GetRandomSim(gpuContext gpu);
platforms/cuda/src/kernels/cudatypes.h
0 → 100755
View file @
38f6c8f8
#ifndef CUDATYPES_H
#define CUDATYPES_H
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdarg.h>
#include <limits>
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <vector_functions.h>
using
namespace
std
;
// Abort the process if a CUDA runtime call failed.
//   status - the cudaError_t returned by the call (evaluated more than once)
//   s      - message printed in front of the CUDA error string
// Prints "<s> <cuda error string>" to stdout and calls exit(-1) when status is
// not cudaSuccess. The do/while(0) wrapper makes the macro behave as a single
// statement, so `if (c) RTERROR(st, "x"); else ...` parses as intended.
#define RTERROR(status, s) \
    do { \
        if ((status) != cudaSuccess) { \
            printf("%s %s\n", (s), cudaGetErrorString(status)); \
            exit(-1); \
        } \
    } while (0)
// Abort the process if the most recent kernel launch reported an error.
//   s - kernel name included in the message
// Reads cudaGetLastError(); on failure prints
// "Error: <cuda error string> launching kernel <s>" to stdout and calls exit(-1).
// The do/while(0) wrapper makes the macro a single statement, so it can be
// followed by `;` inside an if/else without detaching the else branch.
#define LAUNCHERROR(s) \
    do { \
        cudaError_t status = cudaGetLastError(); \
        if (status != cudaSuccess) { \
            printf("Error: %s launching kernel %s\n", cudaGetErrorString(status), (s)); \
            exit(-1); \
        } \
    } while (0)
// Pure virtual class to define an interface for objects resident both on GPU and CPU.
// Implementations provide buffer allocation/release and host<->device transfers.
struct SoADeviceObject
{
    // Virtual so that deleting an implementation through a SoADeviceObject*
    // runs the derived destructor instead of being undefined behaviour.
    virtual ~SoADeviceObject() {}

    virtual void Allocate() = 0;
    virtual void Deallocate() = 0;
    virtual void Upload() = 0;
    virtual void Download() = 0;
};
// Host buffer plus matching device buffer, split into equally sized sub-streams.
// The constructors call Allocate(); the destructor calls Deallocate().
// NOTE(review): no copy constructor or assignment operator is declared, so a
// copied CUDAStream would share these pointers and release them twice.
template <typename T>
struct CUDAStream : public SoADeviceObject
{
    unsigned int    _length;        // Element count per sub-stream, as passed to the constructor
    unsigned int    _subStreams;    // Number of sub-streams
    unsigned int    _stride;        // Elements between sub-stream starts (length rounded up to a multiple of 16)
    T**             _pSysStream;    // Per-sub-stream pointers into _pSysData
    T**             _pDevStream;    // Per-sub-stream pointers into _pDevData
    T*              _pSysData;      // Host storage for all sub-streams
    T*              _pDevData;      // Device storage for all sub-streams

    // One overload per signed/unsigned combination of the two arguments.
    CUDAStream(int length, int subStreams = 1);
    CUDAStream(unsigned int length, unsigned int subStreams = 1);
    CUDAStream(unsigned int length, int subStreams = 1);
    CUDAStream(int length, unsigned int subStreams = 1);
    virtual ~CUDAStream();

    void Allocate();
    void Deallocate();
    void Upload();
    void Download();
    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
};
// Compares two float streams; only the declaration is present here.
// NOTE(review): the meaning of the return value, of tolerance and of
// maxindex == 0 is defined by the implementation, which is not in this header.
float CompareStreams(CUDAStream<float>& s1,
                     CUDAStream<float>& s2,
                     float tolerance,
                     unsigned int maxindex = 0);
// Constructor overload (int length, unsigned int subStreams).
// Records the sizes, rounds the stride up to the next multiple of 16 elements
// and allocates the host and device buffers.
template <typename T>
CUDAStream<T>::CUDAStream(int length, unsigned int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    Allocate();
}
// Constructor overload (unsigned int length, int subStreams).
// Records the sizes, rounds the stride up to the next multiple of 16 elements
// and allocates the host and device buffers.
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    Allocate();
}
// Constructor overload (unsigned int length, unsigned int subStreams).
// Records the sizes, rounds the stride up to the next multiple of 16 elements
// and allocates the host and device buffers.
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    Allocate();
}
// Constructor overload (int length, int subStreams).
// Records the sizes, rounds the stride up to the next multiple of 16 elements
// and allocates the host and device buffers.
template <typename T>
CUDAStream<T>::CUDAStream(int length, int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    Allocate();
}
// Destructor: releases the host and device buffers through Deallocate().
template <typename T>
CUDAStream<T>::~CUDAStream()
{
    Deallocate();
}
// Allocates the pointer tables, the host buffer and the device buffer, then
// points each sub-stream entry at its slice (_stride elements apart).
// A cudaMalloc failure is reported through RTERROR, which terminates the process.
template <typename T>
void CUDAStream<T>::Allocate()
{
    cudaError_t status;

    _pSysStream = new T*[_subStreams];
    _pDevStream = new T*[_subStreams];
    _pSysData   = new T[_subStreams * _stride];

    status = cudaMalloc((void**) &_pDevData, _stride * _subStreams * sizeof(T));
    RTERROR(status, "cudaMalloc CUDAStream::Allocate failed");

    // Walk the sub-streams with a running element offset.
    unsigned int offset = 0;
    for (unsigned int s = 0; s < _subStreams; s++)
    {
        _pSysStream[s] = _pSysData + offset;
        _pDevStream[s] = _pDevData + offset;
        offset += _stride;
    }
}
// Releases the pointer tables, the host buffer and the device buffer.
// Every pointer is reset to NULL afterwards, so calling Deallocate() again
// (for example explicitly and then from the destructor) is harmless:
// delete[] NULL and cudaFree(NULL) are both no-ops.
// A cudaFree failure is reported through RTERROR, which terminates the process.
template <typename T>
void CUDAStream<T>::Deallocate()
{
    cudaError_t status;

    delete[] _pSysStream;
    _pSysStream = NULL;
    delete[] _pDevStream;
    _pDevStream = NULL;
    delete[] _pSysData;
    _pSysData = NULL;

    status = cudaFree(_pDevData);
    // Drop the stale device address so a repeated call does not free it twice.
    _pDevData = NULL;
    RTERROR(status, "cudaFree CUDAStream::Deallocate failed");
}
// Copies the whole host buffer (all sub-streams, padding included) to the device.
// A cudaMemcpy failure is reported through RTERROR, which terminates the process.
template <typename T>
void CUDAStream<T>::Upload()
{
    cudaError_t status = cudaMemcpy(_pDevData,
                                    _pSysData,
                                    _stride * _subStreams * sizeof(T),
                                    cudaMemcpyHostToDevice);
    RTERROR(status, "cudaMemcpy CUDAStream::Upload failed");
}
// Copies the whole device buffer (all sub-streams, padding included) back to the host.
// A cudaMemcpy failure is reported through RTERROR, which terminates the process.
template <typename T>
void CUDAStream<T>::Download()
{
    cudaError_t status = cudaMemcpy(_pSysData,
                                    _pDevData,
                                    _stride * _subStreams * sizeof(T),
                                    cudaMemcpyDeviceToHost);
    RTERROR(status, "cudaMemcpy CUDAStream::Download failed");
}
// Re-partitions the host data into `newstreams` sub-streams without changing
// the total storage: elements are read in (element, sub-stream) order and dealt
// round-robin across the new sub-streams. Afterwards _stride, _length and
// _subStreams describe the new layout and the pointer tables are remapped.
// Only the host copy is rearranged; nothing is uploaded here.
//   newstreams - sub-stream count after the call; must be non-zero
//   interleave - accepted but not used by this implementation
template <typename T>
void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
{
    T* pTemp                = new T[_subStreams * _stride];
    unsigned int stream     = 0;
    unsigned int pos        = 0;
    unsigned int newstride  = _stride * _subStreams / newstreams;
    unsigned int newlength  = _length * _subStreams / newstreams;

    // Copy data into new format
    for (unsigned int i = 0; i < _length; i++)
    {
        for (unsigned int j = 0; j < _subStreams; j++)
        {
            pTemp[stream * newstride + pos] = _pSysStream[j][i];
            stream++;
            if (stream == newstreams)
            {
                stream = 0;
                pos++;
            }
        }
    }

    // The pointer tables were sized for the old sub-stream count; grow them
    // first when the new count is larger, otherwise the remap below would write
    // past their end. All old entries have already been consumed above.
    if (newstreams > _subStreams)
    {
        T** pNewSysStream = new T*[newstreams];
        T** pNewDevStream = new T*[newstreams];
        delete[] _pSysStream;
        delete[] _pDevStream;
        _pSysStream = pNewSysStream;
        _pDevStream = pNewDevStream;
    }

    // Remap stream pointers
    for (unsigned int i = 0; i < newstreams; i++)
    {
        _pSysStream[i] = _pSysData + i * newstride;
        _pDevStream[i] = _pDevData + i * newstride;
    }

    // Copy data back into original stream
    for (unsigned int i = 0; i < newlength; i++)
        for (unsigned int j = 0; j < newstreams; j++)
            _pSysStream[j][i] = pTemp[j * newstride + i];

    _stride     = newstride;
    _length     = newlength;
    _subStreams = newstreams;
    delete[] pTemp;
}
// Tile edge and its log2 (GRID == 1 << GRIDBITS).
static const int GRID                                   = 32;
static const int GRIDBITS                               = 5;

// Threads per block for each kernel family, with one value per G8X/GT2XX prefix.
static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK        = 320;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK       = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK     = 320;
static const int G8X_SHAKE_THREADS_PER_BLOCK            = 128;
static const int GT2XX_SHAKE_THREADS_PER_BLOCK          = 256;
static const int G8X_UPDATE_THREADS_PER_BLOCK           = 192;
static const int GT2XX_UPDATE_THREADS_PER_BLOCK         = 384;
static const int G8X_LOCALFORCES_THREADS_PER_BLOCK      = 192;
static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK    = 384;
static const int G8X_THREADS_PER_BLOCK                  = 256;
static const int GT2XX_THREADS_PER_BLOCK                = 256;
static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;

// Nonbond work units per SM.
static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
// Flat parameter block describing one simulation: launch geometry, physical
// constants and pointers to the per-atom / per-interaction arrays.
// Field order is significant and is kept exactly as declared originally.
struct cudaGmxSimulation {
    // Constants
    unsigned int    atoms;                              // Number of atoms
    unsigned int    paddedNumberOfAtoms;                // Padded number of atoms
    unsigned int    blocks;                             // Number of blocks to launch across linear kernels
    unsigned int    nonbond_blocks;                     // Number of blocks to launch across CDLJ and Born Force Part1
    unsigned int    bornForce2_blocks;                  // Number of blocks to launch across Born Force 2
    unsigned int    threads_per_block;                  // Threads per block to launch
    unsigned int    nonbond_threads_per_block;          // Threads per block in nonbond kernel calls
    unsigned int    bornForce2_threads_per_block;       // Threads per block in Born Force 2 kernel calls
    unsigned int    max_update_threads_per_block;       // Maximum threads per block in update kernel calls
    unsigned int    update_threads_per_block;           // Threads per block in update kernel calls
    unsigned int    bf_reduce_threads_per_block;        // Threads per block in Born Force reduction calls
    unsigned int    bsf_reduce_threads_per_block;       // Threads per block in Born Sum And Forces reduction calls
    unsigned int    max_shake_threads_per_block;        // Maximum threads per block in shake kernel calls
    unsigned int    shake_threads_per_block;            // Threads per block in shake kernel calls
    unsigned int    nonshake_threads_per_block;         // Threads per block in nonshaking kernel call
    unsigned int    max_localForces_threads_per_block;  // Maximum threads per block in local forces kernel calls
    unsigned int    localForces_threads_per_block;      // Threads per block in local forces kernel calls
    unsigned int    random_threads_per_block;           // Threads per block in RNG kernel calls
    unsigned int    workUnits;                          // Number of work units
    unsigned int*   pWorkUnit;                          // Pointer to work units
    unsigned int    nonbond_workBlock;                  // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
    unsigned int    bornForce2_workBlock;               // Number of work units running second half of Born Forces calculation
    unsigned int    workUnitsPerSM;                     // Number of workblocks per SM
    unsigned int    nbWorkUnitsPerBlock;                // Number of work units assigned to each nonbond block
    unsigned int    nbWorkUnitsPerBlockRemainder;       // Remainder of work units to assign across lower numbered nonbond blocks
    unsigned int    bf2WorkUnitsPerBlock;               // Number of work units assigned to each bornForce2 block
    unsigned int    bf2WorkUnitsPerBlockRemainder;      // Remainder of work units to assign across lower numbered bornForce2 blocks
    unsigned int    stride;                             // Atomic attributes stride
    unsigned int    stride2;                            // Atomic attributes stride x 2
    unsigned int    stride3;                            // Atomic attributes stride x 3
    unsigned int    stride4;                            // Atomic attributes stride x 4
    unsigned int    exclusionStride;                    // Exclusion list stride = stride / GRID
    unsigned int    nonbondOutputBuffers;               // Nonbond output buffers per nonbond call
    unsigned int    totalNonbondOutputBuffers;          // Total nonbond output buffers
    unsigned int    outputBuffers;                      // Number of output buffers
    float           bigFloat;                           // Floating point value used as a flag for Shaken atoms
    float           epsfac;                             // Epsilon factor for CDLJ calculations
    float           probeRadius;                        // SASA probe radius
    float           surfaceAreaFactor;                  // ACE approximation surface area factor
    float           electricConstant;                   // ACE approximation electric constant
    float           forceConversionFactor;              // kJ to kcal force conversion factor
    float           preFactor;                          // Born electrostatic pre-factor
    float           dielectricOffset;                   // Born dielectric offset
    float           alphaOBC;                           // OBC alpha factor
    float           betaOBC;                            // OBC beta factor
    float           gammaOBC;                           // OBC gamma factor
    float           deltaT;                             // Molecular dynamics deltaT constant
    float           oneOverDeltaT;                      // 1/deltaT
    float           B;                                  // Molecular dynamics B constant
    float           C;                                  // Molecular dynamics C constant
    float           D;                                  // Molecular dynamics D constant
    float           EPH;                                // Molecular dynamics EPH constant
    float           EMH;                                // Molecular dynamics EMH constant
    float           EM;                                 // Molecular dynamics EM constant
    float           EP;                                 // Molecular dynamics EP constant
    float           GDT;                                // Molecular dynamics GDT constant
    float           OneMinusEM;                         // Molecular dynamics OneMinusEM constant
    float           TauOneMinusEM;                      // Molecular dynamics TauOneMinusEM constant
    float           TauDOverEMMinusOne;                 // Molecular dynamics TauDOverEMMinusOne constant
    float           T;                                  // Molecular dynamics T constant
    float           kT;                                 // Boltzmann's constant times T
    float           V;                                  // Molecular dynamics V constant
    float           X;                                  // Molecular dynamics X constant
    float           Yv;                                 // Molecular dynamics Yv constant
    float           Yx;                                 // Molecular dynamics Yx constant
    float           tau;                                // Molecular dynamics tau constant
    float           fix1;                               // Molecular dynamics fix1 constant
    float           oneOverFix1;                        // Molecular dynamics reciprocal of fix1 constant
    float           DOverTauC;                          // Molecular dynamics DOverTauC constant
    float           collisionProbability;               // Collision probability for Andersen thermostat
    float2*         pObcData;                           // Pointer to fixed Born data
    float2*         pAttr;                              // Pointer to additional atom attributes (sig, eps)
    unsigned int    bonds;                              // Number of bonds
    int4*           pBondID;                            // Bond atom and output buffer IDs
    float2*         pBondParameter;                     // Bond parameters
    unsigned int    bond_angles;                        // Number of bond angles
    int4*           pBondAngleID1;                      // Bond angle atom and first output buffer IDs
    int2*           pBondAngleID2;                      // Bond angle output buffer IDs
    float2*         pBondAngleParameter;                // Bond angle parameters
    unsigned int    dihedrals;                          // Number of dihedrals
    int4*           pDihedralID1;                       // Dihedral IDs
    int4*           pDihedralID2;                       // Dihedral output buffer IDs
    float4*         pDihedralParameter;                 // Dihedral parameters
    unsigned int    rb_dihedrals;                       // Number of Ryckaert Bellemans dihedrals
    int4*           pRbDihedralID1;                     // Ryckaert Bellemans Dihedral IDs
    int4*           pRbDihedralID2;                     // Ryckaert Bellemans Dihedral output buffer IDs
    float4*         pRbDihedralParameter1;              // Ryckaert Bellemans Dihedral parameters
    float2*         pRbDihedralParameter2;              // Ryckaert Bellemans Dihedral parameters
    unsigned int    LJ14s;                              // Number of Lennard Jones 1-4 interactions
    int4*           pLJ14ID;                            // Lennard Jones 1-4 atom and output buffer IDs
    float4*         pLJ14Parameter;                     // Lennard Jones 1-4 parameters
    float           inverseTotalMass;                   // Used in linear momentum removal
    unsigned int    ShakeConstraints;                   // Total number of Shake constraints
    unsigned int    NonShakeConstraints;                // Total number of NonShake atoms
    unsigned int    maxShakeIterations;                 // Maximum shake iterations
    unsigned int    degreesOfFreedom;                   // Number of degrees of freedom in system
    float           shakeTolerance;                     // Shake tolerance
    float           InvMassJ;                           // Shake inverse mass for hydrogens
    int*            pNonShakeID;                        // Not Shaking atoms
    int4*           pShakeID;                           // Shake atoms and phase
    float4*         pShakeParameter;                    // Shake parameters
    unsigned int*   pExclusion;                         // Nonbond exclusion data
    unsigned int    bond_offset;                        // Offset to end of bonds
    unsigned int    bond_angle_offset;                  // Offset to end of bond angles
    unsigned int    dihedral_offset;                    // Offset to end of dihedrals
    unsigned int    rb_dihedral_offset;                 // Offset to end of Ryckaert Bellemans dihedrals
    unsigned int    LJ14_offset;                        // Offset to end of Lennard Jones 1-4 parameters

    // Mutable stuff
    float4*         pPosq;                              // Pointer to atom positions and charges
    float4*         pPosqP;                             // Pointer to mid-integration atom positions
    float4*         pOldPosq;                           // Pointer to old atom positions
    float4*         pVelm4;                             // Pointer to atom velocity and inverse mass
    float4*         pvVector4;                          // Pointer to atom v Vector
    float4*         pxVector4;                          // Pointer to atom x Vector
    float4*         pForce4;                            // Pointer to all force4 data
    float4*         pForce4a;                           // Pointer to first set of force4 data
    float4*         pForce4b;                           // Pointer to second set of force4 data
    float4*         pOutForce4;                         // Pointer to output float4 force
    float*          pBornForce;                         // Pointer to Born force data
    float*          pBornSum;                           // Pointer to Born Radii calculation output buffers
    float*          pBornRadii;                         // Pointer to Born Radii
    float*          pObcChain;                          // Pointer to OBC chain data
    float4*         pLinearMomentum;                    // Pointer to linear momentum

    // Random numbers
    float4*         pRandom4a;                          // Pointer to first set of 4 random numbers
    float4*         pRandom4b;                          // Pointer to second set of 4 random numbers
    float2*         pRandom2a;                          // Pointer to first set of 2 random numbers
    float2*         pRandom2b;                          // Pointer to second set of 2 random numbers
    uint4*          pRandomSeed;                        // Pointer to random seeds
    int*            pRandomPosition;                    // Pointer to random number positions
    unsigned int    randoms;                            // Number of randoms
    unsigned int    totalRandoms;                       // Number of randoms plus overflow.
    unsigned int    totalRandomsTimesTwo;               // Used for generating randoms
    unsigned int    randomIterations;                   // Number of iterations before regenerating randoms
    unsigned int    randomFrames;                       // Number of frames of random numbers
};
// Three float3 values grouped together.
struct Vectors {
    float3 v0;
    float3 v1;
    float3 v2;
};
#endif
platforms/cuda/src/kernels/gpu.cpp
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <ctime>
#include <cmath>
#include <map>
#ifdef WIN32
#include <windows.h>
#else
#include <stdint.h>
#endif
using
namespace
std
;
#include "gputypes.h"
#include "cudaKernels.h"
#include "OpenMMException.h"
using
OpenMM
::
OpenMMException
;
// Short aliases for scalar types.
// The 64-bit aliases come from the compiler extension when WIN32 is defined and
// from <stdint.h> otherwise.
#ifdef WIN32
typedef unsigned __int64    u64;
typedef signed __int64      s64;
#else
typedef uint64_t            u64;
typedef int64_t             s64;
#endif

typedef unsigned int        u32;
typedef float               f32;
typedef double              f64;
typedef char                ascii;
typedef char                utf8;
typedef unsigned char       u8;
typedef signed char         s8;
typedef unsigned short      u16;
typedef signed short        s16;
// Plain record types prefixed FAH_, plus PROTEIN which aggregates them.
// Field order and types are unchanged.

// Per-atom record: 4-byte type tag, charge and radius.
typedef struct {
    u8  type[4];
    f32 charge;
    f32 radius;
} FAH_ATOM;

// Pair of atom indices.
typedef struct {
    u32 a;                  /* rule: a < b */
    u32 b;
} FAH_BOND;

// Three-component float coordinate.
typedef struct {
    f32 x, y, z;
} FAH_XYZ;

// Header record.
typedef struct {
    u32  magic;
    u32  version;
    utf8 name[64];
    s64  timestamp;
    u64  iterations;
    u32  frames;
    u32  atom_count;
    u32  bond_count;
    /* v2 */
    utf8 user_name[64];
    utf8 user_team[16];
    utf8 user_done[16];
} FAH_INFO;

// Progress record.
typedef struct {
    u32 magic;
    u32 version;
    s64 timestamp;
    u64 iterations_done;
    u32 frames_done;
    f32 energy;
    f32 temperature;
} FAH_CURRENT;

// Header, progress and pointers to the atom, bond and coordinate arrays.
typedef struct {
    FAH_INFO    info;
    FAH_CURRENT current;
    FAH_ATOM*   atoms;
    FAH_BOND*   bonds;
    FAH_XYZ*    xyz;
} PROTEIN;
// A central atom together with up to three peripheral atoms constrained to it.
// All constraints in one cluster must share the same distance and the same
// peripheral inverse mass.
struct ShakeCluster {
    int centralID;              // ID of the central atom (-1 until assigned)
    int peripheralID[3];        // IDs of the constrained atoms; only the first `size` entries are in use
    int size;                   // Number of peripheral atoms added so far
    float distance;             // Constraint distance shared by every peripheral atom
    float centralInvMass, peripheralInvMass;

    // Creates an empty cluster with every member in a defined state, so that a
    // default-constructed cluster can be inspected or filled safely.
    ShakeCluster() : centralID(-1), size(0), distance(0.0f), centralInvMass(0.0f), peripheralInvMass(0.0f) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    // Creates an empty cluster around the given central atom.
    // Initialisers are listed in member declaration order.
    ShakeCluster(int centralID, float invMass) : centralID(centralID), size(0), distance(0.0f), centralInvMass(invMass), peripheralInvMass(0.0f) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    // Adds one constrained atom.
    //   id      - ID of the peripheral atom
    //   dist    - constraint distance; must equal that of earlier atoms (exact comparison)
    //   invMass - inverse mass of the peripheral atom; must equal that of earlier atoms
    // Throws OpenMMException if the cluster already holds three atoms or if
    // dist / invMass differ from the values recorded for this cluster.
    void addAtom(int id, float dist, float invMass) {
        if (size == 3)
            throw OpenMMException("A single atom may only have three constraints");
        if (size > 0 && dist != distance)
            throw OpenMMException("All constraints for a central atom must have the same distance");
        if (size > 0 && invMass != peripheralInvMass)
            throw OpenMMException("All constraints for a central atom must have the same mass");
        peripheralID[size++] = id;
        distance = dist;
        peripheralInvMass = invMass;
    }
};
// File-local physical and model constants.
// Each float literal carries its `f` suffix directly (e.g. 0.009f); a suffix
// separated from its number does not compile.
static const float dielectricOffset         =    0.009f;
static const float PI                       =    3.1415926535f;
static const float probeRadius              =    0.14f;
static const float forceConversionFactor    =    0.4184f;

//static const float surfaceAreaFactor        =   -6.0f * 0.06786f * forceConversionFactor * 1000.0f; // PI * 4.0f * 0.0049f * 1000.0f;
//static const float surfaceAreaFactor        =   -6.0f * PI * 4.0f * 0.0049f * 1000.0f;
static const float surfaceAreaFactor        =   -6.0f * PI * 0.0216f * 1000.0f * 0.4184f;
//static const float surfaceAreaFactor        =   -1.7035573959e+001;
//static const float surfaceAreaFactor        =   -166.02691f;
//static const float surfaceAreaFactor        =   1.0f;

static const float alphaOBC                 =    1.0f;
static const float betaOBC                  =    0.8f;
static const float gammaOBC                 =    4.85f;
static const float kcalMolTokJNM            =   -0.4184f;
static const float electricConstant         = -166.02691f;
static const float defaultInnerDielectric   =    1.0f;
static const float defaultSolventDielectric =   78.3f;
static const float KILO                     =    1e3f;                      // Thousand
static const float BOLTZMANN                =    1.380658e-23f;             // (J/K)
static const float AVOGADRO                 =    6.0221367e23f;             // ()
static const float RGAS                     =    BOLTZMANN * AVOGADRO;      // (J/(mol K))
static const float BOLTZ                    =    (RGAS / KILO);             // (kJ/(mol K))
// Set to 1 to make the parameter-upload routines print each record to cout as it is stored.
#define DUMP_PARAMETERS 0
// NOTE(review): defined with no value; where it is tested is not visible in this part of the file.
#define DeltaShake
// Reads harmonic bond parameters from a text file and hands them to
// gpuSetBondParameters().
//
// File layout as consumed here: the first line starts with the bond count (the
// rest of that line is discarded); each following record is
//   <ignored int> <atom1> <atom2> <length> <k>
//
//   gpu   - context passed through to gpuSetBondParameters()
//   fname - path of the parameter file
// Returns the number of bonds read.
// If the file cannot be opened, or the count or any record cannot be parsed, a
// message is printed to cout and the process exits with status -1.
extern "C"
int gpuReadBondParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening harmonic bond parameter file " << fname << endl;
        exit(-1);
    }

    char buff[512];
    int bonds = 0;
    infile >> bonds;
    // A failed extraction or a negative count must not reach the vector
    // constructors below.
    if (infile.fail() || bonds < 0)
    {
        cout << "Error reading bond count from harmonic bond parameter file " << fname << endl;
        exit(-1);
    }
    infile.getline(buff, 512);      // skip the remainder of the header line

    vector<int> atom1(bonds);
    vector<int> atom2(bonds);
    vector<float> length(bonds);
    vector<float> k(bonds);
    for (int i = 0; i < bonds; i++)
    {
        int junk;                   // leading column of each record is not used
        infile >> junk >> atom1[i] >> atom2[i] >> length[i] >> k[i];
        if (infile.fail())
        {
            cout << "Error reading bond " << i << " from harmonic bond parameter file " << fname << endl;
            exit(-1);
        }
    }
    gpuSetBondParameters(gpu, atom1, atom2, length, k);
    return bonds;
}
// Stores the harmonic bond list in the context and uploads it to the device.
//
//   gpu    - context that receives the new streams and device pointers
//   atom1  - first atom index of each bond; its size is the bond count
//   atom2  - second atom index of each bond
//   length - value stored in the .x component of each bond parameter
//   k      - value stored in the .y component of each bond parameter
//
// For each bond, the .z and .w components of the ID record take the current
// value of gpu->pOutputBufferCounter for atom1 and atom2 respectively, and each
// counter is then incremented (atom1 first).
// NOTE(review): atom2, length and k are indexed up to atom1.size() with no size
// check, and the atom indices are used unchecked as counter indices.
extern "C"
void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
{
    int bonds = atom1.size();
    gpu->sim.bonds = bonds;

    CUDAStream<int4>* psBondID = new CUDAStream<int4>(bonds, 1);
    gpu->psBondID = psBondID;
    gpu->sim.pBondID = psBondID->_pDevStream[0];

    CUDAStream<float2>* psBondParameter = new CUDAStream<float2>(bonds, 1);
    gpu->psBondParameter = psBondParameter;
    gpu->sim.pBondParameter = psBondParameter->_pDevStream[0];

    // Host-side views of the single sub-stream in each buffer.
    int4* hostID = psBondID->_pSysStream[0];
    float2* hostParameter = psBondParameter->_pSysStream[0];

    for (int i = 0; i < bonds; i++)
    {
        int4& id = hostID[i];
        float2& parameter = hostParameter[i];

        id.x = atom1[i];
        id.y = atom2[i];
        parameter.x = length[i];
        parameter.y = k[i];
        id.z = gpu->pOutputBufferCounter[id.x]++;
        id.w = gpu->pOutputBufferCounter[id.y]++;
#if (DUMP_PARAMETERS == 1)
        cout <<
            i << " " <<
            id.x << " " <<
            id.y << " " <<
            id.z << " " <<
            id.w << " " <<
            parameter.x << " " <<
            parameter.y <<
            endl;
#endif
    }

    psBondID->Upload();
    psBondParameter->Upload();
}
// Read bond-angle parameters from the text file fname and pass them to
// gpuSetBondAngleParameters.
//
// Input consumed: an integer record count, the remainder of that line
// (discarded), then for each record
//     <ignored int> atom1 atom2 atom3 angle k
//
// Returns the record count read from the file. If the file cannot be opened,
// an error is written to stdout and the process exits with status -1.
extern "C"
int gpuReadBondAngleParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening harmonic bond angle parameter file " << fname << endl;
        exit(-1);
    }

    char restOfLine[512];
    int numAngles;
    infile >> numAngles;
    infile.getline(restOfLine, 512);

    vector<int> atom1(numAngles);
    vector<int> atom2(numAngles);
    vector<int> atom3(numAngles);
    vector<float> angle(numAngles);
    vector<float> k(numAngles);

    for (int n = 0; n < numAngles; ++n)
    {
        int recordNumber;   // leading field of each record; read and dropped
        infile >> recordNumber >> atom1[n] >> atom2[n] >> atom3[n] >> angle[n] >> k[n];
    }

    gpuSetBondAngleParameters(gpu, atom1, atom2, atom3, angle, k);
    return numAngles;
}
// Build the bond-angle tables for the given context and upload them.
//
// One entry is written per element of atom1; the other vectors are indexed in
// parallel. The entry count goes to gpu->sim.bond_angles and three new streams
// are stored in gpu->psBondAngleID1, gpu->psBondAngleID2 and
// gpu->psBondAngleParameter (device pointers in gpu->sim).
//
// Per entry:
//   atoms.x/y/z   atom indices from atom1 / atom2 / atom3
//   atoms.w       gpu->pOutputBufferCounter value for atom x (post-incremented)
//   slots.x/y     gpu->pOutputBufferCounter value for atom y / atom z
//   param.x/y     angle[i] / k[i]
// NOTE(review): the counter values presumably pick the output buffer used for
// each atom of the angle -- confirm against the kernels.
extern "C"
void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<float>& angle, const vector<float>& k)
{
    const int numAngles = atom1.size();
    gpu->sim.bond_angles = numAngles;

    CUDAStream<int4>* angleAtoms = new CUDAStream<int4>(numAngles, 1);
    gpu->psBondAngleID1 = angleAtoms;
    gpu->sim.pBondAngleID1 = angleAtoms->_pDevStream[0];

    CUDAStream<int2>* angleSlots = new CUDAStream<int2>(numAngles, 1);
    gpu->psBondAngleID2 = angleSlots;
    gpu->sim.pBondAngleID2 = angleSlots->_pDevStream[0];

    CUDAStream<float2>* angleParams = new CUDAStream<float2>(numAngles, 1);
    gpu->psBondAngleParameter = angleParams;
    gpu->sim.pBondAngleParameter = angleParams->_pDevStream[0];

    for (int n = 0; n < numAngles; ++n)
    {
        int4& atoms = angleAtoms->_pSysStream[0][n];
        int2& slots = angleSlots->_pSysStream[0][n];
        float2& param = angleParams->_pSysStream[0][n];

        atoms.x = atom1[n];
        atoms.y = atom2[n];
        atoms.z = atom3[n];
        param.x = angle[n];
        param.y = k[n];

        // Counters are consumed in x, y, z order.
        atoms.w = gpu->pOutputBufferCounter[atoms.x]++;
        slots.x = gpu->pOutputBufferCounter[atoms.y]++;
        slots.y = gpu->pOutputBufferCounter[atoms.z]++;
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " " << slots.x << " " << slots.y << " " << param.x << " " << param.y << endl;
#endif
    }

    angleAtoms->Upload();
    angleSlots->Upload();
    angleParams->Upload();
}
// Read dihedral parameters from the text file fname and pass them to
// gpuSetDihedralParameters.
//
// Input consumed: an integer record count, the remainder of that line
// (discarded), then for each record
//     <ignored int> atom1 atom2 atom3 atom4 k phase periodicity
//
// Returns the record count read from the file. If the file cannot be opened,
// an error is written to stdout and the process exits with status -1.
extern "C"
int gpuReadDihedralParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening dihedral parameter file " << fname << endl;
        exit(-1);
    }

    char restOfLine[512];
    int numDihedrals;
    infile >> numDihedrals;
    infile.getline(restOfLine, 512);

    vector<int> atom1(numDihedrals);
    vector<int> atom2(numDihedrals);
    vector<int> atom3(numDihedrals);
    vector<int> atom4(numDihedrals);
    vector<float> k(numDihedrals);
    vector<float> phase(numDihedrals);
    vector<int> periodicity(numDihedrals);

    for (int n = 0; n < numDihedrals; ++n)
    {
        int recordNumber;   // leading field of each record; read and dropped
        infile >> recordNumber >> atom1[n] >> atom2[n] >> atom3[n] >> atom4[n] >> k[n] >> phase[n] >> periodicity[n];
    }

    gpuSetDihedralParameters(gpu, atom1, atom2, atom3, atom4, k, phase, periodicity);
    return numDihedrals;
}
// Build the dihedral tables for the given context and upload them.
//
// One entry is written per element of atom1; the other vectors are indexed in
// parallel. The entry count goes to gpu->sim.dihedrals and three new streams
// are stored in gpu->psDihedralID1, gpu->psDihedralID2 and
// gpu->psDihedralParameter (device pointers in gpu->sim).
//
// Per entry:
//   atoms.x/y/z/w  atom indices from atom1 .. atom4
//   slots.x/y/z/w  gpu->pOutputBufferCounter value for the matching atom,
//                  taken before the counter is post-incremented
//   param.x/y/z    k[i], phase[i], periodicity[i] converted to float
// The .w component of the parameter entry is not written here.
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4, const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
{
    const int numDihedrals = atom1.size();
    gpu->sim.dihedrals = numDihedrals;

    CUDAStream<int4>* dihedralAtoms = new CUDAStream<int4>(numDihedrals, 1);
    gpu->psDihedralID1 = dihedralAtoms;
    gpu->sim.pDihedralID1 = dihedralAtoms->_pDevStream[0];

    CUDAStream<int4>* dihedralSlots = new CUDAStream<int4>(numDihedrals, 1);
    gpu->psDihedralID2 = dihedralSlots;
    gpu->sim.pDihedralID2 = dihedralSlots->_pDevStream[0];

    CUDAStream<float4>* dihedralParams = new CUDAStream<float4>(numDihedrals, 1);
    gpu->psDihedralParameter = dihedralParams;
    gpu->sim.pDihedralParameter = dihedralParams->_pDevStream[0];

    for (int n = 0; n < numDihedrals; ++n)
    {
        int4& atoms = dihedralAtoms->_pSysStream[0][n];
        int4& slots = dihedralSlots->_pSysStream[0][n];
        float4& param = dihedralParams->_pSysStream[0][n];

        atoms.x = atom1[n];
        atoms.y = atom2[n];
        atoms.z = atom3[n];
        atoms.w = atom4[n];
        param.x = k[n];
        param.y = phase[n];
        param.z = (float) periodicity[n];

        // Counters are consumed in x, y, z, w order.
        slots.x = gpu->pOutputBufferCounter[atoms.x]++;
        slots.y = gpu->pOutputBufferCounter[atoms.y]++;
        slots.z = gpu->pOutputBufferCounter[atoms.z]++;
        slots.w = gpu->pOutputBufferCounter[atoms.w]++;
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " " << slots.x << " " << slots.y << " " << slots.z << " " << slots.w << " " << param.x << " " << param.y << " " << param.z << endl;
#endif
    }

    dihedralAtoms->Upload();
    dihedralSlots->Upload();
    dihedralParams->Upload();
}
// Read Ryckaert-Bellemans dihedral parameters from the text file fname and
// pass them to gpuSetRbDihedralParameters.
//
// Input consumed: an integer record count, the remainder of that line
// (discarded), then for each record
//     <ignored int> atom1 atom2 atom3 atom4 c0 c1 c2 c3 c4 c5
//
// All stream allocation and the gpu->sim.rb_dihedrals assignment are left to
// gpuSetRbDihedralParameters. Allocating the streams here as well would only
// create a first set that is overwritten, and therefore leaked, by that call.
//
// Returns the record count read from the file. If the file cannot be opened,
// an error is written to stdout and the process exits with status -1.
extern "C"
int gpuReadRbDihedralParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (!infile.fail())
    {
        char buff[512];
        int rb_dihedrals;
        infile >> rb_dihedrals;
        infile.getline(buff, 512);
        vector<int> atom1(rb_dihedrals);
        vector<int> atom2(rb_dihedrals);
        vector<int> atom3(rb_dihedrals);
        vector<int> atom4(rb_dihedrals);
        vector<float> c0(rb_dihedrals);
        vector<float> c1(rb_dihedrals);
        vector<float> c2(rb_dihedrals);
        vector<float> c3(rb_dihedrals);
        vector<float> c4(rb_dihedrals);
        vector<float> c5(rb_dihedrals);

        for (int i = 0; i < rb_dihedrals; i++)
        {
            int junk;   // leading field of each record; read and dropped
            infile >> junk >> atom1[i] >> atom2[i] >> atom3[i] >> atom4[i] >> c0[i] >> c1[i] >> c2[i] >> c3[i] >> c4[i] >> c5[i];
        }
        gpuSetRbDihedralParameters(gpu, atom1, atom2, atom3, atom4, c0, c1, c2, c3, c4, c5);
        return rb_dihedrals;
    }
    else
    {
        cout << "Error opening Ryckaert-Bellemans dihedral parameter file " << fname << endl;
        exit(-1);
    }
    return 0;
}
// Build the Ryckaert-Bellemans dihedral tables for the given context and
// upload them.
//
// One entry is written per element of atom1; the other vectors are indexed in
// parallel. The entry count goes to gpu->sim.rb_dihedrals and four new streams
// are stored in gpu->psRbDihedralID1, gpu->psRbDihedralID2,
// gpu->psRbDihedralParameter1 and gpu->psRbDihedralParameter2 (device pointers
// in gpu->sim).
//
// Per entry:
//   atoms.x/y/z/w   atom indices from atom1 .. atom4
//   slots.x/y/z/w   gpu->pOutputBufferCounter value for the matching atom,
//                   taken before the counter is post-incremented
//   coeffA.x/y/z/w  c0, c1, c2, c3
//   coeffB.x/y      c4, c5
extern "C"
void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4, const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
{
    const int numDihedrals = atom1.size();
    gpu->sim.rb_dihedrals = numDihedrals;

    CUDAStream<int4>* rbAtoms = new CUDAStream<int4>(numDihedrals, 1);
    gpu->psRbDihedralID1 = rbAtoms;
    gpu->sim.pRbDihedralID1 = rbAtoms->_pDevStream[0];

    CUDAStream<int4>* rbSlots = new CUDAStream<int4>(numDihedrals, 1);
    gpu->psRbDihedralID2 = rbSlots;
    gpu->sim.pRbDihedralID2 = rbSlots->_pDevStream[0];

    CUDAStream<float4>* rbCoeffA = new CUDAStream<float4>(numDihedrals, 1);
    gpu->psRbDihedralParameter1 = rbCoeffA;
    gpu->sim.pRbDihedralParameter1 = rbCoeffA->_pDevStream[0];

    CUDAStream<float2>* rbCoeffB = new CUDAStream<float2>(numDihedrals, 1);
    gpu->psRbDihedralParameter2 = rbCoeffB;
    gpu->sim.pRbDihedralParameter2 = rbCoeffB->_pDevStream[0];

    for (int n = 0; n < numDihedrals; ++n)
    {
        int4& atoms = rbAtoms->_pSysStream[0][n];
        int4& slots = rbSlots->_pSysStream[0][n];
        float4& coeffA = rbCoeffA->_pSysStream[0][n];
        float2& coeffB = rbCoeffB->_pSysStream[0][n];

        atoms.x = atom1[n];
        atoms.y = atom2[n];
        atoms.z = atom3[n];
        atoms.w = atom4[n];
        coeffA.x = c0[n];
        coeffA.y = c1[n];
        coeffA.z = c2[n];
        coeffA.w = c3[n];
        coeffB.x = c4[n];
        coeffB.y = c5[n];

        // Counters are consumed in x, y, z, w order.
        slots.x = gpu->pOutputBufferCounter[atoms.x]++;
        slots.y = gpu->pOutputBufferCounter[atoms.y]++;
        slots.z = gpu->pOutputBufferCounter[atoms.z]++;
        slots.w = gpu->pOutputBufferCounter[atoms.w]++;
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " " << slots.x << " " << slots.y << " " << slots.z << " " << slots.w << " " << coeffA.x << " " << coeffA.y << " " << coeffA.z << " " << coeffA.w << " " << coeffB.x << " " << coeffB.y << endl;
#endif
    }

    rbAtoms->Upload();
    rbSlots->Upload();
    rbCoeffA->Upload();
    rbCoeffB->Upload();
}
// Read Lennard-Jones 1-4 parameters from the text file fname and pass them to
// gpuSetLJ14Parameters.
//
// Header consumed, in order: an integer record count, up to 60 characters of
// text, a float (epsfac), up to 7 characters of text, a float (fudge), and
// the remainder of the line. Each record that follows is
//     <ignored int> atom1 atom2 c6 c12 q1 q2
//
// Returns the record count read from the file. If the file cannot be opened,
// an error is written to stdout and the process exits with status -1.
extern "C"
int gpuReadLJ14Parameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening Lennard-Jones 1-4 parameter file " << fname << endl;
        exit(-1);
    }

    char scratch[1024];
    float epsfac = 0.0f;
    float fudge = 0.0f;
    int numPairs;

    // Header: count, label text, epsfac, label text, fudge, rest of line.
    infile >> numPairs;
    infile.get(scratch, 61);
    infile >> epsfac;
    infile.get(scratch, 8);
    infile >> fudge;
    infile.getline(scratch, 512);

    vector<int> atom1(numPairs);
    vector<int> atom2(numPairs);
    vector<float> c6(numPairs);
    vector<float> c12(numPairs);
    vector<float> q1(numPairs);
    vector<float> q2(numPairs);

    for (int n = 0; n < numPairs; ++n)
    {
        int recordNumber;   // leading field of each record; read and dropped
        infile >> recordNumber >> atom1[n] >> atom2[n] >> c6[n] >> c12[n] >> q1[n] >> q2[n];
    }

    gpuSetLJ14Parameters(gpu, epsfac, fudge, atom1, atom2, c6, c12, q1, q2);
    return numPairs;
}
// Build the Lennard-Jones 1-4 tables for the given context and upload them.
//
// One entry is written per element of atom1; the other vectors are indexed in
// parallel. The entry count goes to gpu->sim.LJ14s and two new streams are
// stored in gpu->psLJ14ID and gpu->psLJ14Parameter (device pointers in
// gpu->sim).
//
// Per entry:
//   id.x / id.y   atom indices from atom1 / atom2
//   id.z / id.w   gpu->pOutputBufferCounter value for atom x / atom y, taken
//                 before the counter is post-incremented
//   param.x       c6*c6/c12, or 0 when c12 is exactly 0
//   param.y       (c12/c6)^(1/6), or 1 when c12 is exactly 0
//   param.z       epsfac * fudge * q1 * q2
// The .w component of the parameter entry is not written here.
// NOTE(review): if c6 = 4*eps*sigma^6 and c12 = 4*eps*sigma^12, then param.x
// is 4*eps and param.y is sigma -- confirm the convention used by the input.
// NOTE(review): c6 == 0 with c12 != 0 divides by zero in param.y; left as is.
//
// The DUMP_PARAMETERS trace is emitted per entry from inside the loop. It
// refers to i, p0, p1 and p2, which only exist there; placed after the loop
// it cannot compile when the dump is enabled.
extern "C"
void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
{
    int LJ14s = atom1.size();
    float scale = epsfac * fudge;

    gpu->sim.LJ14s = LJ14s;
    CUDAStream<int4>* psLJ14ID = new CUDAStream<int4>(LJ14s, 1);
    gpu->psLJ14ID = psLJ14ID;
    gpu->sim.pLJ14ID = psLJ14ID->_pDevStream[0];
    CUDAStream<float4>* psLJ14Parameter = new CUDAStream<float4>(LJ14s, 1);
    gpu->psLJ14Parameter = psLJ14Parameter;
    gpu->sim.pLJ14Parameter = psLJ14Parameter->_pDevStream[0];

    for (int i = 0; i < LJ14s; i++)
    {
        int4& id = psLJ14ID->_pSysStream[0][i];
        float4& param = psLJ14Parameter->_pSysStream[0][i];

        id.x = atom1[i];
        id.y = atom2[i];
        id.z = gpu->pOutputBufferCounter[id.x]++;
        id.w = gpu->pOutputBufferCounter[id.y]++;

        float p0, p1, p2;
        if (c12[i] == 0.0f)
        {
            p0 = 0.0f;
            p1 = 1.0f;
        }
        else
        {
            p0 = c6[i] * c6[i] / c12[i];
            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
        }
        p2 = scale * q1[i] * q2[i];
        param.x = p0;
        param.y = p1;
        param.z = p2;
#if (DUMP_PARAMETERS == 1)
        cout << i << " " << id.x << " " << id.y << " " << id.z << " " << id.w << " " << param.x << " " << param.y << " " << param.z << " " << p0 << " " << p1 << " " << p2 << " " << endl;
#endif
    }

    psLJ14ID->Upload();
    psLJ14Parameter->Upload();
}
// Look up the radius stored for atom type name s.
//
// Scans the first gpu->gAtomTypes entries of gpu->gpAtomTable in order and
// returns the r field of the first entry whose name compares equal to s.
// Returns 0.0f when no entry matches.
extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s)
{
    int idx = 0;
    while (idx < gpu->gAtomTypes)
    {
        if (s == gpu->gpAtomTable[idx].name)
            return gpu->gpAtomTable[idx].r;
        ++idx;
    }
    return 0.0f;
}
// Look up the one-character symbol stored for atom type name s.
//
// Scans the first gpu->gAtomTypes entries of gpu->gpAtomTable in order and
// returns the symbol field of the first entry whose name compares equal to s.
// Returns a space character when no entry matches.
extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s)
{
    int idx = 0;
    while (idx < gpu->gAtomTypes)
    {
        if (s == gpu->gpAtomTable[idx].name)
            return gpu->gpAtomTable[idx].symbol;
        ++idx;
    }
    return ' ';
}
// Rebuild gpu->gpAtomTable from the text file fname.
//
// The file is read twice. The first pass counts lines whose first character is
// a space (one atom type each) and counts the lines that precede the first
// such line. The second pass skips those leading lines and then reads, for
// each atom type, a name and a radius followed by the rest of the line
// (discarded). The symbol of each entry is derived from its radius:
//     r < 1.3 -> 'H',  r < 1.6 -> 'O',  r < 1.7 -> 'N',  otherwise 'C'.
//
// Any previously allocated table is released first. Returns the number of atom
// types. If the file cannot be opened, an error is written to stdout and the
// process exits with status -1.
extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname)
{
    gpu->gAtomTypes = 0;
    delete[] gpu->gpAtomTable;   // no-op when the pointer is null

    // Pass 1: count atom types and leading lines.
    ifstream counter(fname);
    if (counter.fail())
    {
        cout << "Error opening atom parameter file " << fname << endl;
        exit(-1);
    }

    char line[1024];
    int leadingLines = 0;
    bool inPreamble = true;
    while (counter.getline(line, 512))
    {
        if (line[0] != ' ')
        {
            if (inPreamble)
                leadingLines++;
            continue;
        }
        inPreamble = false;
        gpu->gAtomTypes++;
    }
    counter.close();

    // Pass 2: fill the table.
    gpu->gpAtomTable = new gpuAtomType[gpu->gAtomTypes];
    ifstream reader(fname);
    for (int skipped = 0; skipped < leadingLines; ++skipped)
        reader.getline(line, 512);

    for (int t = 0; t < gpu->gAtomTypes; ++t)
    {
        gpuAtomType& entry = gpu->gpAtomTable[t];
        reader >> entry.name >> entry.r;
        reader.getline(line, 512);

        // Pick the symbol from the radius band.
        entry.symbol = (entry.r < 1.3f) ? 'H'
                     : (entry.r < 1.6f) ? 'O'
                     : (entry.r < 1.7f) ? 'N'
                     :                    'C';
#if (DUMP_PARAMETERS == 1)
        cout << t << " " << entry.name << " " << entry.symbol << " " << entry.r << endl;
#endif
    }
    return gpu->gAtomTypes;
}
// Read per-atom nonbonded parameters from the text file fname and pass them to
// gpuSetCoulombParameters and gpuSetObcParameters.
//
// Header consumed, in order: an unsigned record count, up to 8 characters of
// text, a float (epsfac), up to 7 characters of text, a float (fudge, read but
// not used afterwards), and the remainder of the line. Each record is
//     <ignored int> c6 c12 q atomTypeName scale numExclusions exclusion...
// The atom type name is resolved through gpuGetAtomicRadius and
// gpuGetAtomicSymbol.
//
// The atom type name is read into a std::string so that an over-long token in
// the file cannot overrun a fixed-size buffer.
//
// The "total exclusions" line reports the number of exclusion entries whose
// partner index does not exceed the atom's own index, so a pair listed on both
// of its atoms is counted once.
//
// The atom vector handed to the setters is sized to the record count but its
// elements are left at their default value.
//
// Returns the record count. If the file cannot be opened, an error is written
// to stdout and the process exits with status -1.
extern "C"
int gpuReadCoulombParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (!infile.fail())
    {
        char buff[1024];
        unsigned int coulombs;
        float fudge = 0.0f;
        float epsfac = 1.0f;
        infile >> coulombs;
        infile.get(buff, 9);
        infile >> epsfac;
        infile.get(buff, 8);
        infile >> fudge;
        infile.getline(buff, 512);
        vector<int> atom(coulombs);
        vector<float> c6(coulombs);
        vector<float> c12(coulombs);
        vector<float> q(coulombs);
        vector<float> radius(coulombs);
        vector<float> scale(coulombs);
        vector<char> symbol(coulombs);
        vector<vector<int> > exclusions(coulombs);
        unsigned int total_exclusions = 0;

        for (unsigned int i = 0; i < coulombs; i++)
        {
            int junk, numExclusions;
            string atype;
            infile >> junk >> c6[i] >> c12[i] >> q[i] >> atype >> scale[i] >> numExclusions;
            radius[i] = gpuGetAtomicRadius(gpu, atype);
            symbol[i] = gpuGetAtomicSymbol(gpu, atype);
            for (int j = 0; j < numExclusions; j++)
            {
                int exclusion;
                infile >> exclusion;
                exclusions[i].push_back(exclusion);
                if (exclusion >= 0 && (unsigned int) exclusion <= i)
                    total_exclusions++;
            }
        }
        cout << total_exclusions << " total exclusions.\n";
        gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions);
        gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
        return coulombs;
    }
    else
    {
        cout << "Error opening Coulomb parameter file " << fname << endl;
        exit(-1);
    }
    return 0;
}
// Fill the per-atom charge, sigma/epsilon and exclusion data for the given
// context and upload the charge and sigma/epsilon streams.
//
// The number of atoms processed is atom.size(); the contents of atom are not
// read. c6, c12, q and exclusions are indexed in parallel. symbol may be
// empty, in which case gpu->pAtomSymbol is left untouched.
//
// For each atom i:
//   psPosq4[i].w     q[i]
//   psSigEps2[i].x   0.5 * (c12/c6)^(1/6) when both c6 and c12 are positive,
//                    otherwise 0.5
//   psSigEps2[i].y   c6 * sqrt(1/c12) when both are positive, otherwise 0
//   pExclusion[i * paddedNumberOfAtoms + e] = 0 for every e in exclusions[i]
//
// Entries from atom.size() up to paddedNumberOfAtoms are padding: they get
// distant positions (100000 + 10*i on each axis), zero charge and zero
// sigma/epsilon, and their whole row and column of pExclusion are zeroed.
//
// Finally the exclusion table is checked for symmetry over the real atoms and
// a warning is printed to stdout for every mismatched pair.
//
// NOTE(review): exclusion indices are used unchecked as offsets into
// gpu->pExclusion; callers must supply values in
// [0, paddedNumberOfAtoms).
//
// The DUMP_PARAMETERS trace prints the number of exclusions of atom i before
// listing them; the container itself has no stream insertion operator.
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q, const vector<char>& symbol, const vector<vector<int> >& exclusions)
{
    unsigned int coulombs = atom.size();
    gpu->sim.epsfac = epsfac;

    for (unsigned int i = 0; i < coulombs; i++)
    {
        float p0 = q[i];
        float p1 = 0.5f, p2 = 0.0f;
        if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
        {
            p1 = 0.5f * pow(c12[i] / c6[i], 1.0f / 6.0f);
            p2 = c6[i] * sqrt(1.0f / c12[i]);
        }
        if (!symbol.empty())
            gpu->pAtomSymbol[i] = symbol[i];
        gpu->psPosq4->_pSysStream[0][i].w = p0;
        gpu->psSigEps2->_pSysStream[0][i].x = p1;
        gpu->psSigEps2->_pSysStream[0][i].y = p2;
#if (DUMP_PARAMETERS == 1)
        cout << i << " " << gpu->psPosq4->_pSysStream[0][i].w << " " << gpu->psSigEps2->_pSysStream[0][i].x << " " << gpu->psSigEps2->_pSysStream[0][i].y << " " << p0 << " " << p1 << " " << p2 << " " << exclusions[i].size();
#endif
        for (int j = 0; j < (int) exclusions[i].size(); j++)
        {
#if (DUMP_PARAMETERS == 1)
            cout << " " << exclusions[i][j];
#endif
            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
        }
#if (DUMP_PARAMETERS == 1)
        cout << endl;
#endif
    }

    // Dummy out extra atom data
    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
        gpu->psPosq4->_pSysStream[0][i].x = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].y = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].z = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].w = 0.0f;
        gpu->psSigEps2->_pSysStream[0][i].x = 0.0f;
        gpu->psSigEps2->_pSysStream[0][i].y = 0.0f;
    }

    // Add in remaining exclusions: every pair involving a padding atom.
    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
        for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
        {
            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
            gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
        }
    }

    gpu->psPosq4->Upload();
    gpu->psSigEps2->Upload();

    // Check for exclusion consistency
    for (unsigned int i = 0; i < coulombs; i++)
    {
        for (unsigned int j = i; j < coulombs; j++)
        {
            if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i])
                cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl;
        }
    }
}
// Fill the per-atom OBC data for the given context, upload it, and set
// gpu->sim.preFactor.
//
// The number of atoms processed is atom.size(); the contents of atom are not
// read. For each atom i:
//   psObcData[i].x   radius[i] - dielectricOffset
//   psObcData[i].y   scale[i] * psObcData[i].x
// Entries from atom.size() up to paddedNumberOfAtoms are padding and receive
// fixed values (Born radius 0.2, OBC data 0.01 / 0.01). Born radii of the real
// atoms are not written here.
//
// preFactor = 2 * electricConstant * (1/innerDielectric - 1/solventDielectric)
//             * gpu->sim.forceConversionFactor
extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, const vector<float>& scale)
{
    const unsigned int numAtoms = atom.size();

    // Real atoms.
    for (unsigned int a = 0; a < numAtoms; ++a)
    {
        gpu->psObcData->_pSysStream[0][a].x = radius[a] - dielectricOffset;
        gpu->psObcData->_pSysStream[0][a].y = scale[a] * gpu->psObcData->_pSysStream[0][a].x;
#if (DUMP_PARAMETERS == 1)
        cout << a << " " << gpu->psObcData->_pSysStream[0][a].x << " " << gpu->psObcData->_pSysStream[0][a].y;
#endif
    }

    // Padding entries.
    for (unsigned int a = numAtoms; a < gpu->sim.paddedNumberOfAtoms; ++a)
    {
        gpu->psBornRadii->_pSysStream[0][a] = 0.2f;
        gpu->psObcData->_pSysStream[0][a].x = 0.01f;
        gpu->psObcData->_pSysStream[0][a].y = 0.01f;
    }

    gpu->psBornRadii->Upload();
    gpu->psObcData->Upload();

    gpu->sim.preFactor = 2.0f * electricConstant * ((1.0f / innerDielectric) - (1.0f / solventDielectric)) * gpu->sim.forceConversionFactor;
}
// Read Shake constraint parameters from the text file fname and pass them to
// gpuSetShakeParameters with a tolerance of 1e-4.
//
// Input consumed: one whitespace-delimited word (discarded), an integer record
// count, the remainder of that line (discarded), then for each record
//     <ignored int> atom1 atom2 distance invMass1 invMass2
//
// Returns gpu->sim.ShakeConstraints as left by gpuSetShakeParameters, which
// need not equal the record count in the file. If the file cannot be opened,
// an error is written to stdout and the process exits with status -1.
extern "C"
int gpuReadShakeParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening Shake parameter file " << fname << endl;
        exit(-1);
    }

    char scratch[512];
    int numConstraints;
    infile >> scratch >> numConstraints;
    infile.getline(scratch, 512);

    vector<int> atom1(numConstraints);
    vector<int> atom2(numConstraints);
    vector<float> distance(numConstraints);
    vector<float> invMass1(numConstraints);
    vector<float> invMass2(numConstraints);

    for (int n = 0; n < numConstraints; ++n)
    {
        int recordNumber;   // leading field of each record; read and dropped
        infile >> recordNumber >> atom1[n] >> atom2[n] >> distance[n] >> invMass1[n] >> invMass2[n];
    }

    gpuSetShakeParameters(gpu, atom1, atom2, distance, invMass1, invMass2, 1e-4f);
    return gpu->sim.ShakeConstraints;
}
// Group pairwise distance constraints into Shake clusters, upload them, and
// derive the Shake launch sizes for the given context.
//
// Each constraint i links atom1[i] and atom2[i] at distance[i]; invMass1[i]
// and invMass2[i] are the inverse masses passed along for the two atoms. For
// every constraint one atom is chosen as the central atom: the one taking part
// in more than one constraint, or, when neither does, the one with the lower
// index. The other (peripheral) atom must take part in exactly one constraint,
// otherwise an OpenMMException is thrown. Constraints sharing a central atom
// form one cluster.
//
// One stream entry is written per cluster, in ascending central-atom order:
//   id.x          central atom
//   id.y/z/w      first, second, third peripheral atom (-1 when absent)
//   param.x       central inverse mass
//   param.y       0.5 / (central inverse mass + peripheral inverse mass)
//   param.z       distance squared
//   param.w       peripheral inverse mass
//
// gpu->sim.ShakeConstraints is set to the number of clusters and
// gpu->sim.shake_threads_per_block to ceil(clusters / blocks), clamped to
// [1, max_shake_threads_per_block]. gpu->sim.shakeTolerance is set to
// tolerance.
//
// When DeltaShake is defined, the indices of atoms that take part in no
// constraint are uploaded as well; that stream is allocated even when there
// are no such atoms.
extern "C"
void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance, const vector<float>& invMass1, const vector<float>& invMass2, float tolerance)
{
    const int numConstraints = (int) atom1.size();

    // Number of constraints each atom takes part in.
    vector<int> constraintCount(gpu->natoms, 0);
    for (int c = 0; c < numConstraints; ++c)
    {
        constraintCount[atom1[c]]++;
        constraintCount[atom2[c]]++;
    }

    // Collect clusters keyed by their central atom.
    map<int, ShakeCluster> clusters;
    for (int c = 0; c < numConstraints; ++c)
    {
        const int first = atom1[c];
        const int second = atom2[c];

        // The first atom is central if it is shared between constraints; if
        // neither atom is shared, the lower index wins.
        const bool firstIsCentral = (constraintCount[first] > 1) || (!(constraintCount[second] > 1) && first < second);

        const int centralID = firstIsCentral ? first : second;
        const int peripheralID = firstIsCentral ? second : first;
        const float centralInvMass = firstIsCentral ? invMass1[c] : invMass2[c];
        const float peripheralInvMass = firstIsCentral ? invMass2[c] : invMass1[c];

        if (constraintCount[peripheralID] != 1)
            throw OpenMMException("Only bonds to hydrogens may be constrained");

        // Create the cluster on first sight of this central atom, then add.
        if (clusters.find(centralID) == clusters.end())
            clusters[centralID] = ShakeCluster(centralID, centralInvMass);
        clusters[centralID].addAtom(peripheralID, distance[c], peripheralInvMass);
    }

    // Allocate the streams, one entry per cluster.
    CUDAStream<int4>* shakeIds = new CUDAStream<int4>((int) clusters.size(), 1);
    gpu->psShakeID = shakeIds;
    gpu->sim.pShakeID = shakeIds->_pDevStream[0];

    CUDAStream<float4>* shakeParams = new CUDAStream<float4>((int) clusters.size(), 1);
    gpu->psShakeParameter = shakeParams;
    gpu->sim.pShakeParameter = shakeParams->_pDevStream[0];

    gpu->sim.ShakeConstraints = clusters.size();

    int slot = 0;
    map<int, ShakeCluster>::const_iterator it = clusters.begin();
    while (it != clusters.end())
    {
        const ShakeCluster& cluster = it->second;
        int4& id = shakeIds->_pSysStream[0][slot];
        float4& param = shakeParams->_pSysStream[0][slot];

        id.x = cluster.centralID;
        id.y = cluster.peripheralID[0];
        id.z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
        id.w = cluster.size > 2 ? cluster.peripheralID[2] : -1;

        param.x = cluster.centralInvMass;
        param.y = 0.5f / (cluster.centralInvMass + cluster.peripheralInvMass);
        param.z = cluster.distance * cluster.distance;
        param.w = cluster.peripheralInvMass;

        ++slot;
        ++it;
    }
    shakeIds->Upload();
    shakeParams->Upload();

    gpu->sim.shakeTolerance = tolerance;

    // Threads per block: ceil(clusters / blocks), clamped to [1, max].
    gpu->sim.shake_threads_per_block = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
    if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
        gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
    if (gpu->sim.shake_threads_per_block < 1)
        gpu->sim.shake_threads_per_block = 1;

#ifdef DeltaShake
    // Count atoms that take part in no constraint.
    int count = 0;
    for (int a = 0; a < gpu->natoms; ++a)
    {
        if (constraintCount[a] == 0)
            count++;
    }

    // Allocate and size the unconstrained-atom list (also when count is 0).
    gpu->sim.NonShakeConstraints = count;
    CUDAStream<int>* psNonShakeID = new CUDAStream<int>(count, 1);
    gpu->psNonShakeID = psNonShakeID;
    gpu->sim.pNonShakeID = psNonShakeID->_pDevStream[0];

    gpu->sim.nonshake_threads_per_block = (count + gpu->sim.blocks - 1) / gpu->sim.blocks;
    if (gpu->sim.nonshake_threads_per_block > gpu->sim.max_shake_threads_per_block)
        gpu->sim.nonshake_threads_per_block = gpu->sim.max_shake_threads_per_block;
    if (gpu->sim.nonshake_threads_per_block < 1)
        gpu->sim.nonshake_threads_per_block = 1;

    // Store the indices of the unconstrained atoms.
    count = 0;
    for (int a = 0; a < gpu->natoms; ++a)
    {
        if (constraintCount[a] == 0)
            psNonShakeID->_pSysStream[0][count++] = a;
    }
    psNonShakeID->Upload();
#endif
}
// Allocate the per-atom, random-number and linear-momentum streams for the
// given context and initialise the size fields in gpu->sim that depend on the
// atom count.
//
// Reads gpu->natoms, gpu->sim.blocks and gpu->sim.random_threads_per_block;
// these must be set before the call. paddedNumberOfAtoms is natoms rounded up
// to a multiple of GRID (via GRIDBITS). gpu->gpAtomTable and gpu->gAtomTypes
// are reset to NULL / 0 without releasing any existing table.
//
// The host random number generator is seeded from the current time. The
// per-thread seeds are filled from rand(); the random value buffers, the
// random positions and the linear momentum buffer are zero-filled. All of
// these are uploaded. The per-atom streams are only allocated here.
//
// Always returns 1.
extern "C"
int gpuAllocateInitialBuffers(gpuContext gpu)
{
    // Sizes derived from the atom count.
    gpu->sim.atoms = gpu->natoms;
    gpu->sim.paddedNumberOfAtoms = ((gpu->sim.atoms + GRID - 1) >> GRIDBITS) << GRIDBITS;
    gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6;
    gpu->gpAtomTable = NULL;
    gpu->gAtomTypes = 0;
    gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
    gpu->sim.totalNonbondOutputBuffers = 2 * gpu->sim.nonbondOutputBuffers;
    gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;

    // Positions/charges; the stride fields follow this stream's stride.
    gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pPosq = gpu->psPosq4->_pDevStream[0];
    gpu->sim.stride = gpu->psPosq4->_stride;
    gpu->sim.stride2 = 2 * gpu->sim.stride;
    gpu->sim.stride3 = 3 * gpu->sim.stride;
    gpu->sim.stride4 = 4 * gpu->sim.stride;
    gpu->sim.exclusionStride = gpu->sim.stride / GRID;

    // Remaining per-atom streams, each padded to paddedNumberOfAtoms.
    gpu->psPosqP4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pPosqP = gpu->psPosqP4->_pDevStream[0];

    gpu->psOldPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pOldPosq = gpu->psOldPosq4->_pDevStream[0];

    gpu->psVelm4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pVelm4 = gpu->psVelm4->_pDevStream[0];

    gpu->psvVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pvVector4 = gpu->psvVector4->_pDevStream[0];

    gpu->psxVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pxVector4 = gpu->psxVector4->_pDevStream[0];

    gpu->psBornRadii = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pBornRadii = gpu->psBornRadii->_pDevStream[0];

    gpu->psObcChain = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0];

    gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0];

    gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];

    gpu->pAtomSymbol = new unsigned char[gpu->natoms];

    // Random number bookkeeping: the value buffers hold two halves of
    // totalRandoms entries each; the "b" pointers address the second half.
    gpu->seed = (unsigned long) time(NULL) & 0x000fffff;
    gpu->sim.randomFrames = 995;
    gpu->sim.randomIterations = gpu->sim.randomFrames;
    gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
    gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
    gpu->sim.totalRandomsTimesTwo = gpu->sim.totalRandoms * 2;

    gpu->psRandom4 = new CUDAStream<float4>(gpu->sim.totalRandomsTimesTwo, 1);
    gpu->psRandom2 = new CUDAStream<float2>(gpu->sim.totalRandomsTimesTwo, 1);
    gpu->psRandomPosition = new CUDAStream<int>(gpu->sim.blocks, 1);
    gpu->psRandomSeed = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1);

    gpu->sim.pRandom4a = gpu->psRandom4->_pDevStream[0];
    gpu->sim.pRandom2a = gpu->psRandom2->_pDevStream[0];
    gpu->sim.pRandom4b = gpu->psRandom4->_pDevStream[0] + gpu->sim.totalRandoms;
    gpu->sim.pRandom2b = gpu->psRandom2->_pDevStream[0] + gpu->sim.totalRandoms;
    gpu->sim.pRandomPosition = gpu->psRandomPosition->_pDevStream[0];
    gpu->sim.pRandomSeed = gpu->psRandomSeed->_pDevStream[0];

    const int numBlocks = (int) gpu->sim.blocks;
    for (int b = 0; b < numBlocks; ++b)
        gpu->psRandomPosition->_pSysStream[0][b] = 0;

    // Seed the host generator and draw four values per random thread.
    int hostSeed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
    srand(hostSeed);
    const int numSeeds = (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block);
    for (int s = 0; s < numSeeds; ++s)
    {
        uint4& threadSeed = gpu->psRandomSeed->_pSysStream[0][s];
        threadSeed.x = rand();
        threadSeed.y = rand();
        threadSeed.z = rand();
        threadSeed.w = rand();
    }

    // Zero-fill both random value buffers.
    const float fill = 0.0f;
    const int numRandoms = (int) gpu->sim.totalRandomsTimesTwo;
    for (int r = 0; r < numRandoms; ++r)
    {
        float4& quad = gpu->psRandom4->_pSysStream[0][r];
        quad.x = fill;
        quad.y = fill;
        quad.z = fill;
        quad.w = fill;

        float2& pair = gpu->psRandom2->_pSysStream[0][r];
        pair.x = fill;
        pair.y = fill;
    }

    gpu->psRandomSeed->Upload();
    gpu->psRandom4->Upload();
    gpu->psRandom2->Upload();
    gpu->psRandomPosition->Upload();

    // Linear momentum buffer: one zeroed entry per block.
    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1);
    gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
    for (int b = 0; b < numBlocks; ++b)
    {
        float4& momentum = gpu->psLinearMomentum->_pSysStream[0][b];
        momentum.x = 0.0f;
        momentum.y = 0.0f;
        momentum.z = 0.0f;
        momentum.w = 0.0f;
    }
    gpu->psLinearMomentum->Upload();

    return 1;
}
// Reads the atom count and per-atom data from a whitespace-delimited text file
// into the host-side position and velocity streams, then uploads them.
//
// As consumed below: the first line holds one leading token followed by the
// atom count; every following record is an integer (ignored) followed by the
// four components of the position stream entry and the four components of the
// velocity stream entry. The fourth velocity component is treated as an
// inverse mass: the total mass is accumulated as the sum of 1/w.
//
// gpu   - context whose natoms, psPosq4, psVelm4, psxVector4 and
//         sim.inverseTotalMass are written.
// fname - path of the file to read.
extern "C"
void gpuReadCoordinates(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    gpu->natoms = 0;
    char buff[512];

    // Bound the token extraction: operator>> into a char array keeps writing
    // past the end of the buffer on an over-long token unless a field width is
    // set. The width applies to the next extraction only.
    infile.width(sizeof(buff));
    infile >> buff >> gpu->natoms;
    infile.getline(buff, 511);      // discard the remainder of the header line
    float totalMass = 0.0f;

    // Buffers are sized from gpu->natoms, so allocate only after reading it.
    gpuAllocateInitialBuffers(gpu);
    for (int i = 0; i < gpu->natoms; i++)
    {
        int junk;                   // leading integer of each record, unused
        infile >> junk >>
            gpu->psPosq4->_pSysStream[0][i].x >>
            gpu->psPosq4->_pSysStream[0][i].y >>
            gpu->psPosq4->_pSysStream[0][i].z >>
            gpu->psPosq4->_pSysStream[0][i].w >>
            gpu->psVelm4->_pSysStream[0][i].x >>
            gpu->psVelm4->_pSysStream[0][i].y >>
            gpu->psVelm4->_pSysStream[0][i].z >>
            gpu->psVelm4->_pSysStream[0][i].w;

        gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].w = 0.0f;

        // Accumulate mass
        totalMass += 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
    }
    gpu->sim.inverseTotalMass = 1.0f / totalMass;
    gpu->psPosq4->Upload();
    gpu->psVelm4->Upload();
    gpu->psxVector4->Upload();
}
// Copies per-atom x/y/z coordinates into the host position stream, uploads it,
// and flags the Born radii for recalculation. The w component of each position
// entry is left untouched.
//
// x, y, z - coordinate arrays; indexed 0 .. gpu->natoms-1 without bounds checks.
extern "C"
void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    int atom = 0;
    while (atom < gpu->natoms)
    {
        gpu->psPosq4->_pSysStream[0][atom].x = x[atom];
        gpu->psPosq4->_pSysStream[0][atom].y = y[atom];
        gpu->psPosq4->_pSysStream[0][atom].z = z[atom];
        ++atom;
    }
    gpu->psPosq4->Upload();

    // New positions invalidate the Born radii
    gpu->bRecalculateBornRadii = true;
}
// Copies per-atom x/y/z velocities into the host velocity stream and uploads
// it. The w component of each velocity entry is left untouched.
//
// x, y, z - velocity component arrays; indexed 0 .. gpu->natoms-1 without
//           bounds checks.
extern "C"
void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    int atom = 0;
    while (atom < gpu->natoms)
    {
        gpu->psVelm4->_pSysStream[0][atom].x = x[atom];
        gpu->psVelm4->_pSysStream[0][atom].y = y[atom];
        gpu->psVelm4->_pSysStream[0][atom].z = z[atom];
        ++atom;
    }
    gpu->psVelm4->Upload();
}
// Stores the inverse of each atom's mass in the w component of the host
// velocity stream, records the inverse of the summed mass in
// sim.inverseTotalMass, and uploads the velocity stream.
//
// mass - per-atom masses; indexed 0 .. gpu->natoms-1 without bounds checks.
extern "C"
void gpuSetMass(gpuContext gpu, const vector<float>& mass)
{
    float massSum = 0.0f;
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        gpu->psVelm4->_pSysStream[0][atom].w = 1.0f / mass[atom];
        massSum += mass[atom];
    }
    gpu->sim.inverseTotalMass = 1.0f / massSum;
    gpu->psVelm4->Upload();
}
// Resets the per-block random positions to zero, reseeds the C library
// generator from gpu->seed, fills every per-thread seed with four rand()
// values, uploads both streams, refreshes the constants and runs
// kGenerateRandoms.
extern "C"
void gpuInitializeRandoms(gpuContext gpu)
{
    const int blockCount = (int) gpu->sim.blocks;
    for (int block = 0; block < blockCount; ++block)
    {
        gpu->psRandomPosition->_pSysStream[0][block] = 0;
    }

    // Combine the seed with its shifted complement before handing it to srand()
    int mixedSeed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
    srand(mixedSeed);

    // One seed entry per random thread; rand() is drawn in x, y, z, w order
    const int seedCount = (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block);
    for (int entry = 0; entry < seedCount; ++entry)
    {
        gpu->psRandomSeed->_pSysStream[0][entry].x = rand();
        gpu->psRandomSeed->_pSysStream[0][entry].y = rand();
        gpu->psRandomSeed->_pSysStream[0][entry].z = rand();
        gpu->psRandomSeed->_pSysStream[0][entry].w = rand();
    }

    gpu->psRandomPosition->Upload();
    gpu->psRandomSeed->Upload();
    gpuSetConstants(gpu);
    kGenerateRandoms(gpu);
}
extern
"C"
bool
gpuIsAvailable
()
{
int
deviceCount
;
cudaGetDeviceCount
(
&
deviceCount
);
return
(
deviceCount
>
0
);
}
// Creates a context sized from a coordinate file and loads positions,
// velocities and masses from it.
//
// As consumed below: the first line holds one leading token followed by the
// atom count; every following record is an integer (ignored) followed by
// x y z charge vx vy vz and a final value whose reciprocal is passed to
// gpuSetMass as the atom's mass.
//
// NOTE(review): the charge column is read but not stored anywhere by this
// function - confirm it is supplied through another path.
//
// fname - path of the file to read.
// Returns the new context as an opaque pointer.
extern "C"
void* gpuInitFromFile(char* fname)
{
    ifstream infile(fname);
    int numAtoms = 0;
    char buff[512];

    // Bound the token extraction: operator>> into a char array keeps writing
    // past the end of the buffer on an over-long token unless a field width is
    // set. The width applies to the next extraction only.
    infile.width(sizeof(buff));
    infile >> buff >> numAtoms;

    gpuContext gpu = (gpuContext) gpuInit(numAtoms);
    vector<float> x(numAtoms), y(numAtoms), z(numAtoms), charge(numAtoms);
    vector<float> vx(numAtoms), vy(numAtoms), vz(numAtoms), mass(numAtoms);
    infile.getline(buff, 511);      // discard the remainder of the header line

    // Bounded by numAtoms, the size the vectors were created with.
    for (int i = 0; i < numAtoms; i++)
    {
        int junk;                   // leading integer of each record, unused
        infile >> junk >>
            x[i] >> y[i] >> z[i] >> charge[i] >>
            vx[i] >> vy[i] >> vz[i] >> mass[i];
        mass[i] = 1.0f / mass[i];
    }

    gpuSetPositions(gpu, x, y, z);
    gpuSetVelocities(gpu, vx, vy, vz);
    gpuSetMass(gpu, mass);
    return (void*) gpu;
}
extern
"C"
void
*
gpuInit
(
int
numAtoms
)
{
gpuContext
gpu
=
new
_gpuContext
;
int
LRFSize
=
0
;
int
SMCount
=
0
;
int
SMMajor
=
0
;
int
SMMinor
=
0
;
// Get adapter
unsigned
int
device
=
0
;
char
*
pAdapter
;
pAdapter
=
getenv
(
"NV_FAH_DEVICE"
);
if
(
pAdapter
!=
NULL
)
{
sscanf
(
pAdapter
,
"%d"
,
&
device
);
}
cudaError_t
status
=
cudaSetDevice
(
device
);
RTERROR
(
status
,
"Error setting CUDA device"
)
// Determine which core to run on
#if 0
SYSTEM_INFO info;
GetSystemInfo(&info);
unsigned int cores = info.dwNumberOfProcessors;
if (cores > 1)
{
HANDLE hproc = GetCurrentProcess();
unsigned int core = (cores - 1) - (device % (cores - 1));
unsigned int mask = 1 << core;
SetProcessAffinityMask(hproc, mask);
}
#endif
// Determine kernel call configuration
cudaDeviceProp
deviceProp
;
cudaGetDeviceProperties
(
&
deviceProp
,
0
);
// Determine SM version
if
(
deviceProp
.
major
==
1
)
{
switch
(
deviceProp
.
minor
)
{
case
0
:
case
1
:
gpu
->
sm_version
=
SM_10
;
gpu
->
sim
.
workUnitsPerSM
=
G8X_NONBOND_WORKUNITS_PER_SM
;
break
;
default:
gpu
->
sm_version
=
SM_12
;
gpu
->
sim
.
workUnitsPerSM
=
GT2XX_NONBOND_WORKUNITS_PER_SM
;
break
;
}
}
gpu
->
sim
.
nonbond_blocks
=
deviceProp
.
multiProcessorCount
;
gpu
->
sim
.
bornForce2_blocks
=
deviceProp
.
multiProcessorCount
;
gpu
->
sim
.
blocks
=
deviceProp
.
multiProcessorCount
;
if
(
deviceProp
.
regsPerBlock
==
8192
)
{
gpu
->
sim
.
nonbond_threads_per_block
=
G8X_NONBOND_THREADS_PER_BLOCK
;
gpu
->
sim
.
bornForce2_threads_per_block
=
G8X_BORNFORCE2_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_shake_threads_per_block
=
G8X_SHAKE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_update_threads_per_block
=
G8X_UPDATE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_localForces_threads_per_block
=
G8X_LOCALFORCES_THREADS_PER_BLOCK
;
gpu
->
sim
.
threads_per_block
=
G8X_THREADS_PER_BLOCK
;
gpu
->
sim
.
random_threads_per_block
=
G8X_RANDOM_THREADS_PER_BLOCK
;
}
else
{
gpu
->
sim
.
nonbond_threads_per_block
=
GT2XX_NONBOND_THREADS_PER_BLOCK
;
gpu
->
sim
.
bornForce2_threads_per_block
=
GT2XX_BORNFORCE2_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_shake_threads_per_block
=
GT2XX_SHAKE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_update_threads_per_block
=
GT2XX_UPDATE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_localForces_threads_per_block
=
GT2XX_LOCALFORCES_THREADS_PER_BLOCK
;
gpu
->
sim
.
threads_per_block
=
GT2XX_NONBOND_THREADS_PER_BLOCK
;
gpu
->
sim
.
random_threads_per_block
=
GT2XX_RANDOM_THREADS_PER_BLOCK
;
}
gpu
->
sim
.
shake_threads_per_block
=
gpu
->
sim
.
max_shake_threads_per_block
;
gpu
->
sim
.
localForces_threads_per_block
=
gpu
->
sim
.
max_localForces_threads_per_block
;
gpu
->
natoms
=
numAtoms
;
gpuAllocateInitialBuffers
(
gpu
);
for
(
int
i
=
0
;
i
<
gpu
->
natoms
;
i
++
)
{
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
x
=
0.0
f
;
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
y
=
0.0
f
;
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
z
=
0.0
f
;
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
w
=
0.0
f
;
}
gpu
->
psxVector4
->
Upload
();
gpu
->
iterations
=
0
;
gpu
->
sim
.
update_threads_per_block
=
(
gpu
->
natoms
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
if
(
gpu
->
sim
.
update_threads_per_block
>
gpu
->
sim
.
max_update_threads_per_block
)
gpu
->
sim
.
update_threads_per_block
=
gpu
->
sim
.
max_update_threads_per_block
;
if
(
gpu
->
sim
.
update_threads_per_block
<
1
)
gpu
->
sim
.
update_threads_per_block
=
1
;
gpu
->
sim
.
bf_reduce_threads_per_block
=
gpu
->
sim
.
update_threads_per_block
;
gpu
->
sim
.
bsf_reduce_threads_per_block
=
(
gpu
->
sim
.
stride4
+
gpu
->
natoms
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
gpu
->
sim
.
bsf_reduce_threads_per_block
=
((
gpu
->
sim
.
bsf_reduce_threads_per_block
+
(
GRID
-
1
))
/
GRID
)
*
GRID
;
if
(
gpu
->
sim
.
bsf_reduce_threads_per_block
>
gpu
->
sim
.
threads_per_block
)
gpu
->
sim
.
bsf_reduce_threads_per_block
=
gpu
->
sim
.
threads_per_block
;
if
(
gpu
->
sim
.
bsf_reduce_threads_per_block
<
1
)
gpu
->
sim
.
bsf_reduce_threads_per_block
=
1
;
// Initialize constants to reasonable values
gpu
->
sim
.
probeRadius
=
probeRadius
;
gpu
->
sim
.
surfaceAreaFactor
=
surfaceAreaFactor
;
gpu
->
sim
.
electricConstant
=
electricConstant
;
gpu
->
sim
.
bigFloat
=
99999999.0
f
;
gpu
->
sim
.
forceConversionFactor
=
forceConversionFactor
;
gpu
->
sim
.
preFactor
=
2.0
f
*
electricConstant
*
((
1.0
f
/
defaultInnerDielectric
)
-
(
1.0
f
/
defaultSolventDielectric
))
*
gpu
->
sim
.
forceConversionFactor
;
gpu
->
sim
.
dielectricOffset
=
dielectricOffset
;
gpu
->
sim
.
alphaOBC
=
alphaOBC
;
gpu
->
sim
.
betaOBC
=
betaOBC
;
gpu
->
sim
.
gammaOBC
=
gammaOBC
;
gpuSetIntegrationParameters
(
gpu
,
1.0
f
,
2.0e-3
f
,
300.0
f
);
gpu
->
sim
.
maxShakeIterations
=
15
;
gpu
->
sim
.
shakeTolerance
=
1.0e-04
f
*
2.0
f
;
gpu
->
sim
.
InvMassJ
=
9.920635e-001
f
;
gpu
->
grid
=
GRID
;
gpu
->
bCalculateCM
=
false
;
gpu
->
bRemoveCM
=
false
;
gpu
->
bRecalculateBornRadii
=
true
;
gpuInitializeRandoms
(
gpu
);
// To be determined later
gpu
->
psLJ14ID
=
NULL
;
gpu
->
psForce4
=
NULL
;
gpu
->
sim
.
pForce4
=
NULL
;
gpu
->
sim
.
pForce4a
=
NULL
;
gpu
->
sim
.
pForce4b
=
NULL
;
gpu
->
psBornForce
=
NULL
;
gpu
->
sim
.
pBornForce
=
NULL
;
gpu
->
psBornSum
=
NULL
;
gpu
->
sim
.
pBornSum
=
NULL
;
gpu
->
psBondID
=
NULL
;
gpu
->
psBondParameter
=
NULL
;
gpu
->
psBondAngleID1
=
NULL
;
gpu
->
psBondAngleID2
=
NULL
;
gpu
->
psBondAngleParameter
=
NULL
;
gpu
->
psDihedralID1
=
NULL
;
gpu
->
psDihedralID2
=
NULL
;
gpu
->
psDihedralParameter
=
NULL
;
gpu
->
psRbDihedralID1
=
NULL
;
gpu
->
psRbDihedralID2
=
NULL
;
gpu
->
psRbDihedralParameter1
=
NULL
;
gpu
->
psRbDihedralParameter2
=
NULL
;
gpu
->
psLJ14ID
=
NULL
;
gpu
->
psLJ14Parameter
=
NULL
;
gpu
->
psShakeID
=
NULL
;
gpu
->
psShakeParameter
=
NULL
;
gpu
->
psExclusion
=
NULL
;
gpu
->
psWorkUnit
=
NULL
;
// Initialize output buffer before reading parameters
gpu
->
pOutputBufferCounter
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
];
memset
(
gpu
->
pOutputBufferCounter
,
0
,
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
unsigned
int
));
// Initialize exclusion array
gpu
->
pExclusion
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
paddedNumberOfAtoms
];
for
(
unsigned
int
i
=
0
;
i
<
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
gpu
->
pExclusion
[
i
]
=
1
;
return
(
void
*
)
gpu
;
}
// Derives the integrator constants stored in gpu->sim from the time constant
// tau, the step size deltaT and the temperature.
//
// GDT is deltaT / tau. For GDT >= 0.1 the B, C and D coefficients are computed
// from the exponentials directly; otherwise polynomial expansions in GDT / 2
// are used instead.
extern "C"
void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature)
{
    // Step size, time constant and their ratio
    gpu->sim.deltaT = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f / deltaT;
    gpu->sim.tau = tau;
    gpu->sim.GDT = gpu->sim.deltaT / gpu->sim.tau;

    // Exponentials of +/- GDT and +/- GDT / 2
    gpu->sim.EPH = exp(0.5f * gpu->sim.GDT);
    gpu->sim.EMH = exp(-0.5f * gpu->sim.GDT);
    gpu->sim.EP = exp(gpu->sim.GDT);
    gpu->sim.EM = exp(-gpu->sim.GDT);
    gpu->sim.OneMinusEM = 1.0f - gpu->sim.EM;
    gpu->sim.TauOneMinusEM = gpu->sim.tau * gpu->sim.OneMinusEM;

    if (gpu->sim.GDT >= 0.1f)
    {
        // Closed-form coefficients
        const float ephMinusOne = gpu->sim.EPH - 1.0f;
        const float ephMinusOneSq = ephMinusOne * ephMinusOne;
        gpu->sim.B = gpu->sim.GDT * (gpu->sim.EP - 1.0f) - 4.0f * ephMinusOneSq;
        gpu->sim.C = gpu->sim.GDT - 3.0f + 4.0f * gpu->sim.EMH - gpu->sim.EM;
        gpu->sim.D = 2.0f - gpu->sim.EPH - gpu->sim.EMH;
    }
    else
    {
        // Polynomial expansions in h = GDT / 2
        const float h = 0.5f * gpu->sim.GDT;
        const float h2 = h * h;
        const float h4 = h2 * h2;

        const float third = 1.0f / 3.0f;
        const float o7_9 = 7.0f / 9.0f;
        const float o1_12 = 1.0f / 12.0f;
        const float o17_90 = 17.0f / 90.0f;
        const float o7_30 = 7.0f / 30.0f;
        const float o31_1260 = 31.0f / 1260.0f;
        const float o_360 = 1.0f / 360.0f;

        gpu->sim.B = h4 * (third + h * (third + h * (o17_90 + h * o7_9)));
        gpu->sim.C = h2 * h * (2.0f * third + h * (-0.5f + h * (o7_30 + h * (-o1_12 + h * o31_1260))));
        gpu->sim.D = h2 * (-1.0f + h2 * (-o1_12 - h2 * o_360));
    }

    // Combinations of the coefficients
    gpu->sim.TauDOverEMMinusOne = gpu->sim.tau * gpu->sim.D / (gpu->sim.EM - 1.0f);
    gpu->sim.DOverTauC = gpu->sim.D / (gpu->sim.tau * gpu->sim.C);
    gpu->sim.fix1 = gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH);
    gpu->sim.oneOverFix1 = 1.0f / (gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH));

    // Temperature-dependent scale factors
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;
    gpu->sim.V = sqrt(gpu->sim.kT * (1.0f - gpu->sim.EM));
    gpu->sim.X = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.C);
    gpu->sim.Yv = sqrt(gpu->sim.kT * gpu->sim.B / gpu->sim.C);
    gpu->sim.Yx = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.B / (1.0f - gpu->sim.EM));
}
// Stores the step size and its reciprocal in gpu->sim.
extern "C"
void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT)
{
    gpu->sim.oneOverDeltaT = 1.0f / deltaT;
    gpu->sim.deltaT = deltaT;
}
// Stores the Brownian integrator constants in gpu->sim.
//
// Here GDT is the product deltaT * tau, and Yx and Yv both receive
// sqrt(2 * kT * deltaT * tau).
extern "C"
void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature)
{
    // Step size and tau
    gpu->sim.deltaT = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f / deltaT;
    gpu->sim.tau = tau;
    gpu->sim.GDT = gpu->sim.deltaT * gpu->sim.tau;

    // Temperature
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;

    // Yx is written first, then copied to Yv
    gpu->sim.Yx = sqrt(2.0f * gpu->sim.kT * deltaT * tau);
    gpu->sim.Yv = gpu->sim.Yx;
}
// Stores the Andersen thermostat temperature and collision probability in
// gpu->sim and sets the Yx, Yv, X and V scale factors to 1.
extern "C"
void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability)
{
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;
    gpu->sim.collisionProbability = collisionProbability;

    // Unit scale factors
    gpu->sim.Yx = 1.0f;
    gpu->sim.Yv = gpu->sim.Yx;
    gpu->sim.X = 1.0f;
    gpu->sim.V = gpu->sim.X;
}
// Releases everything owned by the context: the host arrays, every stream
// object, and finally the context itself. The context must not be used after
// this call.
extern "C"
void gpuShutDown(gpuContext gpu)
{
    // Host-side arrays
    delete[] gpu->pOutputBufferCounter;
    delete[] gpu->pExclusion;
    delete[] gpu->gpAtomTable;
    delete[] gpu->pAtomSymbol;

    // Per-atom streams
    delete gpu->psPosq4;
    delete gpu->psPosqP4;
    delete gpu->psOldPosq4;
    delete gpu->psVelm4;
    delete gpu->psForce4;
    delete gpu->psxVector4;
    delete gpu->psvVector4;
    delete gpu->psSigEps2;
    delete gpu->psObcData;
    delete gpu->psObcChain;
    delete gpu->psBornForce;
    delete gpu->psBornRadii;
    delete gpu->psBornSum;

    // Bonded-interaction streams
    delete gpu->psBondID;
    delete gpu->psBondParameter;
    delete gpu->psBondAngleID1;
    delete gpu->psBondAngleID2;
    delete gpu->psBondAngleParameter;
    delete gpu->psDihedralID1;
    delete gpu->psDihedralID2;
    delete gpu->psDihedralParameter;
    delete gpu->psRbDihedralID1;
    delete gpu->psRbDihedralID2;
    delete gpu->psRbDihedralParameter1;
    delete gpu->psRbDihedralParameter2;
    delete gpu->psLJ14ID;
    delete gpu->psLJ14Parameter;

    // Constraint, exclusion and work-unit streams
    delete gpu->psShakeID;
    delete gpu->psShakeParameter;
    delete gpu->psExclusion;
    delete gpu->psWorkUnit;

    // Random-number and momentum streams
    delete gpu->psRandom4;
    delete gpu->psRandom2;
    delete gpu->psRandomPosition;
    delete gpu->psRandomSeed;
    delete gpu->psLinearMomentum;

    // The context itself goes last
    delete gpu;
}
// Allocates the force, Born-force and Born-sum output streams, computes the
// parameter offsets and thread count for the local-force kernel, and rewrites
// the output-buffer indices stored in the bonded ID streams as
// (outputBuffers - 1) - index before uploading them.
//
// The number of force output buffers is the larger of
// sim.totalNonbondOutputBuffers and the largest per-atom counter in
// pOutputBufferCounter.
//
// Always returns 1.
extern "C"
int gpuBuildOutputBuffers(gpuContext gpu)
{
    // Largest buffer count required by any atom
    unsigned int outputBuffers = gpu->sim.totalNonbondOutputBuffers;
    for (unsigned int atom = 0; atom < gpu->sim.paddedNumberOfAtoms; ++atom)
    {
        const unsigned int needed = gpu->pOutputBufferCounter[atom];
        if (outputBuffers < needed)
            outputBuffers = needed;
    }
    gpu->sim.outputBuffers = outputBuffers;

    // Allocate the output streams and publish their device pointers
    gpu->psForce4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers);
    gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
    gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
    gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0];
    gpu->sim.pForce4a = gpu->sim.pForce4;
    gpu->sim.pForce4b = gpu->sim.pForce4 + gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
    gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
    gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0];

    // Determine local energy parameter offsets for bonded interactions:
    // each offset is the running sum of the parameter stream strides
    gpu->sim.bond_offset = gpu->psBondParameter->_stride;
    gpu->sim.bond_angle_offset = gpu->sim.bond_offset + gpu->psBondAngleParameter->_stride;
    gpu->sim.dihedral_offset = gpu->sim.bond_angle_offset + gpu->psDihedralParameter->_stride;
    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset + gpu->psRbDihedralParameter1->_stride;
    gpu->sim.LJ14_offset = gpu->sim.rb_dihedral_offset + gpu->psLJ14Parameter->_stride;

    // Threads per block: per-block share rounded up to a multiple of 16,
    // clamped to [1, max_localForces_threads_per_block]
    gpu->sim.localForces_threads_per_block = (gpu->sim.LJ14_offset / gpu->sim.blocks + 15) & 0xfffffff0;
    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
    if (gpu->sim.localForces_threads_per_block < 1)
        gpu->sim.localForces_threads_per_block = 1;

    // Flip local force output buffers
    int flip = outputBuffers - 1;

    const int bondCount = (int) gpu->sim.bonds;
    for (int n = 0; n < bondCount; ++n)
    {
        gpu->psBondID->_pSysStream[0][n].z = flip - gpu->psBondID->_pSysStream[0][n].z;
        gpu->psBondID->_pSysStream[0][n].w = flip - gpu->psBondID->_pSysStream[0][n].w;
    }

    const int angleCount = (int) gpu->sim.bond_angles;
    for (int n = 0; n < angleCount; ++n)
    {
        gpu->psBondAngleID1->_pSysStream[0][n].w = flip - gpu->psBondAngleID1->_pSysStream[0][n].w;
        gpu->psBondAngleID2->_pSysStream[0][n].x = flip - gpu->psBondAngleID2->_pSysStream[0][n].x;
        gpu->psBondAngleID2->_pSysStream[0][n].y = flip - gpu->psBondAngleID2->_pSysStream[0][n].y;
    }

    const int dihedralCount = (int) gpu->sim.dihedrals;
    for (int n = 0; n < dihedralCount; ++n)
    {
        gpu->psDihedralID2->_pSysStream[0][n].x = flip - gpu->psDihedralID2->_pSysStream[0][n].x;
        gpu->psDihedralID2->_pSysStream[0][n].y = flip - gpu->psDihedralID2->_pSysStream[0][n].y;
        gpu->psDihedralID2->_pSysStream[0][n].z = flip - gpu->psDihedralID2->_pSysStream[0][n].z;
        gpu->psDihedralID2->_pSysStream[0][n].w = flip - gpu->psDihedralID2->_pSysStream[0][n].w;
    }

    const int rbDihedralCount = (int) gpu->sim.rb_dihedrals;
    for (int n = 0; n < rbDihedralCount; ++n)
    {
        gpu->psRbDihedralID2->_pSysStream[0][n].x = flip - gpu->psRbDihedralID2->_pSysStream[0][n].x;
        gpu->psRbDihedralID2->_pSysStream[0][n].y = flip - gpu->psRbDihedralID2->_pSysStream[0][n].y;
        gpu->psRbDihedralID2->_pSysStream[0][n].z = flip - gpu->psRbDihedralID2->_pSysStream[0][n].z;
        gpu->psRbDihedralID2->_pSysStream[0][n].w = flip - gpu->psRbDihedralID2->_pSysStream[0][n].w;
    }

    const int lj14Count = (int) gpu->sim.LJ14s;
    for (int n = 0; n < lj14Count; ++n)
    {
        gpu->psLJ14ID->_pSysStream[0][n].z = flip - gpu->psLJ14ID->_pSysStream[0][n].z;
        gpu->psLJ14ID->_pSysStream[0][n].w = flip - gpu->psLJ14ID->_pSysStream[0][n].w;
    }

    // Push the rewritten ID streams to the device
    gpu->psBondID->Upload();
    gpu->psBondAngleID1->Upload();
    gpu->psBondAngleID2->Upload();
    gpu->psDihedralID2->Upload();
    gpu->psRbDihedralID2->Upload();
    gpu->psLJ14ID->Upload();
    return 1;
}
// Builds the list of GRID x GRID work units covering the upper triangle
// (x >= y) of the atom-pair matrix, sizes the nonbond and bornForce2 launches
// to match, uploads the list and refreshes the constants.
//
// Each work unit is encoded as (x << 17) | (y << 2), with bit 0 set when at
// least one pair inside the cell has a zero entry in gpu->pExclusion.
//
// Returns the number of work units.
extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu)
{
    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int grid = gpu->grid;
    const unsigned int dim = (atoms + (grid - 1)) / grid;
    const unsigned int cells = dim * (dim + 1) / 2;
    const unsigned int* pExclusion = gpu->pExclusion;

    // Allocate the work unit stream and publish it
    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
    unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
    gpu->psWorkUnit = psWorkUnit;
    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
    gpu->sim.nonbond_workBlock = gpu->sim.nonbond_threads_per_block / GRID;
    gpu->sim.bornForce2_workBlock = gpu->sim.bornForce2_threads_per_block / GRID;
    gpu->sim.workUnits = cells;

    // Increase block count if necessary for extra large molecules that would
    // otherwise overflow the SM workunit buffers
    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
    {
        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
    }
    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
    {
        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
    }

    // Work units per block, plus the remainder left over after even division
    gpu->sim.nbWorkUnitsPerBlock = cells / gpu->sim.nonbond_blocks;
    gpu->sim.nbWorkUnitsPerBlockRemainder = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
    gpu->sim.bf2WorkUnitsPerBlock = cells / gpu->sim.bornForce2_blocks;
    gpu->sim.bf2WorkUnitsPerBlockRemainder = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;

    // Decrease thread count for extra small molecules to spread computation
    // across entire chip
    const int nonbondActive = gpu->sim.nonbond_blocks * gpu->sim.nonbond_workBlock;
    if (nonbondActive > (int) cells)
    {
        int balancedWorkBlock = (cells + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks;
        gpu->sim.nonbond_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.nonbond_workBlock = balancedWorkBlock;
    }
    const int bornForce2Active = gpu->sim.bornForce2_blocks * gpu->sim.bornForce2_workBlock;
    if (bornForce2Active > (int) cells)
    {
        int balancedWorkBlock = (cells + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks;
        gpu->sim.bornForce2_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.bornForce2_workBlock = balancedWorkBlock;
    }

    // Emit one entry per cell of the upper triangle
    unsigned int count = 0;
    for (unsigned int y = 0; y < dim; y++)
    {
        const unsigned int rowBegin = y * grid;
        const unsigned int rowEnd = y * grid + grid;
        for (unsigned int x = y; x < dim; x++)
        {
            const unsigned int colBegin = x * grid;
            const unsigned int colEnd = x * grid + grid;
            unsigned int entry = (x << 17) | (y << 2);

            // Check for exclusions: one zero entry is enough to flag the cell
            bool hasExclusion = false;
            for (unsigned int i = rowBegin; (i < rowEnd) && !hasExclusion; i++)
            {
                for (unsigned int j = colBegin; j < colEnd; j++)
                {
                    if (!pExclusion[i * atoms + j])
                    {
                        hasExclusion = true;
                        break;
                    }
                }
            }

            // Signal exclusions if they exist
            if (hasExclusion)
                entry |= 0x1;

            pWorkList[count] = entry;
            count++;
        }
    }

    psWorkUnit->Upload();
    gpuSetConstants(gpu);
    return cells;
}
// Packs the host exclusion array into one bit mask per row of every
// grid x grid tile, uploads the packed stream and refreshes the constants.
//
// Within a mask, the entry for the first column of the tile ends up in the
// lowest of the bits shifted in; a set bit means the array entry is non-zero.
//
// Returns the number of zero entries found with row index >= column index.
extern "C"
int gpuBuildExclusionList(gpuContext gpu)
{
    unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u);
    gpu->psExclusion = psExclusion;
    gpu->sim.pExclusion = psExclusion->_pDevStream[0];
    unsigned int* pExList = psExclusion->_pSysStream[0];

    int exclusions = 0;
    unsigned int pos = 0;
    for (unsigned int x = 0; x < atoms; x += gpu->grid)
    {
        for (unsigned int y = 0; y < atoms; y += gpu->grid)
        {
            // One mask per row of the tile
            for (unsigned x1 = x; x1 < x + gpu->grid; x1++)
            {
                unsigned int mask = 0;
                for (unsigned int y1 = y; y1 < y + gpu->grid; y1++)
                {
                    // Shift first, then place the newest entry in the top bit
                    mask >>= 1;
                    if (gpu->pExclusion[x1 * atoms + y1] != 0)
                    {
                        mask |= 0x80000000;
                    }
                    else if (x1 >= y1)
                    {
                        exclusions++;
                    }
                }
                pExList[pos] = mask;
                pos++;
            }
        }
    }

    psExclusion->Upload();
    gpuSetConstants(gpu);
    return exclusions;
}
// Calls the Set...Sim routine of every kernel module with this context, plus
// the _12 variants when gpu->sm_version is SM_12 or higher.
//
// Always returns 1.
extern "C"
int gpuSetConstants(gpuContext gpu)
{
    // Force kernels
    SetCalculateCDLJForcesSim(gpu);
    SetCalculateCDLJObcGbsaForces1Sim(gpu);
    SetCalculateLocalForcesSim(gpu);
    SetCalculateObcGbsaBornSumSim(gpu);
    SetCalculateObcGbsaForces1Sim(gpu);
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
    SetForcesSim(gpu);

    // Update and random kernels
    SetUpdateShakeHSim(gpu);
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
    SetRandomSim(gpu);

    // SM 1.2 specific kernel variants
    if (gpu->sm_version >= SM_12)
    {
        SetCalculateCDLJForces_12Sim(gpu);
        SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
        SetCalculateObcGbsaForces1_12Sim(gpu);
        SetCalculateObcGbsaForces2_12Sim(gpu);
    }
    return 1;
}
// Downloads the position and velocity streams and prints one line per atom:
// the atom index followed by the four position components and the four
// velocity components.
extern "C"
void gpuDumpCoordinates(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psVelm4->Download();
    printf("\n\nCoordinates and velocities\n");

    int atom = 0;
    while (atom < gpu->natoms)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               atom,
               gpu->psPosq4->_pSysStream[0][atom].x,
               gpu->psPosq4->_pSysStream[0][atom].y,
               gpu->psPosq4->_pSysStream[0][atom].z,
               gpu->psPosq4->_pSysStream[0][atom].w,
               gpu->psVelm4->_pSysStream[0][atom].x,
               gpu->psVelm4->_pSysStream[0][atom].y,
               gpu->psVelm4->_pSysStream[0][atom].z,
               gpu->psVelm4->_pSysStream[0][atom].w);
        ++atom;
    }
}
// Returns true when f is NaN: a NaN is the only float value that compares
// unequal to itself.
bool ISNAN(float f)
{
    return f != f;
}
// Downloads positions, velocities, forces and Born forces and checks the first
// gpu->natoms entries for NaNs.
//
// Returns true when none are found. Otherwise prints every offending atom, the
// nearest neighbour of each atom, the atom data, and then recomputes and dumps
// each force contribution in turn before returning false.
extern "C"
bool gpuCheckData(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psVelm4->Download();
    gpu->psForce4->Download();
    gpu->psBornForce->Download();

    // Count and report atoms carrying a NaN
    int violations = 0;
    for (int i = 0; i < gpu->natoms; i++)
    {
        const bool corrupted =
            ISNAN(gpu->psPosq4->_pSysStream[0][i].x) ||
            ISNAN(gpu->psPosq4->_pSysStream[0][i].y) ||
            ISNAN(gpu->psPosq4->_pSysStream[0][i].z) ||
            ISNAN(gpu->psVelm4->_pSysStream[0][i].x) ||
            ISNAN(gpu->psVelm4->_pSysStream[0][i].y) ||
            ISNAN(gpu->psVelm4->_pSysStream[0][i].z) ||
            ISNAN(gpu->psForce4->_pSysStream[0][i].x) ||
            ISNAN(gpu->psForce4->_pSysStream[0][i].y) ||
            ISNAN(gpu->psForce4->_pSysStream[0][i].z) ||
            ISNAN(gpu->psBornForce->_pSysStream[0][i]);
        if (!corrupted)
            continue;

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               i,
               gpu->psPosq4->_pSysStream[0][i].x,
               gpu->psPosq4->_pSysStream[0][i].y,
               gpu->psPosq4->_pSysStream[0][i].z,
               gpu->psVelm4->_pSysStream[0][i].x,
               gpu->psVelm4->_pSysStream[0][i].y,
               gpu->psVelm4->_pSysStream[0][i].z,
               gpu->psForce4->_pSysStream[0][i].x,
               gpu->psForce4->_pSysStream[0][i].y,
               gpu->psForce4->_pSysStream[0][i].z,
               gpu->psBornForce->_pSysStream[0][i]);
        violations++;
    }

    if (violations <= 0)
        return true;

    printf("%d total violations\n", violations);

    // Report the nearest neighbour of every atom
    for (int i = 0; i < gpu->natoms; i++)
    {
        float dmin = 99999999.0f;
        int closest = -9999;
        float x = gpu->psPosq4->_pSysStream[0][i].x;
        float y = gpu->psPosq4->_pSysStream[0][i].y;
        float z = gpu->psPosq4->_pSysStream[0][i].z;
        for (int j = 0; j < gpu->natoms; j++)
        {
            if (j == i)
                continue;

            float dx = gpu->psPosq4->_pSysStream[0][j].x - x;
            float dy = gpu->psPosq4->_pSysStream[0][j].y - y;
            float dz = gpu->psPosq4->_pSysStream[0][j].z - z;
            float r = sqrt(dx * dx + dy * dy + dz * dz);
            if (r < dmin)
            {
                dmin = r;
                closest = j;
            }
        }
        printf("Atom %4d: Closest neighbor is Atom %4d, %11.5e\n", i, closest, dmin);
    }

    gpuDumpAtomData(gpu);
    kClearBornForces(gpu);
    kClearForces(gpu);
    kCPUCalculateLocalForces(gpu);

    // Determine which forces have gone awry: recompute each contribution on
    // cleared buffers and dump the result
    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateCDLJForces(gpu);
    kReduceForces(gpu);
    printf("Nonbond Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateObcGbsaForces1(gpu);
    kReduceObcGbsaBornForces(gpu);
    kCalculateObcGbsaForces2(gpu);
    kReduceForces(gpu);
    printf("OBC Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateLocalForces(gpu);
    kReduceForces(gpu);
    printf("Local Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kReduceForces(gpu);
    printf("Cleared Forces\n");
    gpuDumpForces(gpu);

    return false;
}
// Host-side evaluation of the Lennard-Jones 1-4 interactions.
//
// Downloads positions and forces, then for every entry of the LJ14 list adds
// the pair force to the host copy of the force stream (opposite signs for the
// two atoms, at the buffer offsets given by the ID entry's z and w fields) and
// prints the intermediate values. The ID and parameter streams are read from
// their host copies without a download; the modified forces are not uploaded.
extern "C"
void kCPUCalculate14(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psForce4->Download();

    const int pairCount = (int) gpu->sim.LJ14s;
    for (int pos = 0; pos < pairCount; ++pos)
    {
        int4 atom = gpu->psLJ14ID->_pSysStream[0][pos];
        float4 LJ14 = gpu->psLJ14Parameter->_pSysStream[0][pos];
        float4 a1 = gpu->psPosq4->_pSysStream[0][atom.x];
        float4 a2 = gpu->psPosq4->_pSysStream[0][atom.y];

        // Separation vector and its squared length
        float3 d;
        d.x = a1.x - a2.x;
        d.y = a1.y - a2.y;
        d.z = a1.z - a2.z;
        float r2 = d.x * d.x + d.y * d.y + d.z * d.z;
        float inverseR = 1.0f / sqrt(r2);

        // (LJ14.y / r)^2 and its cube
        float sig2 = inverseR * LJ14.y;
        sig2 *= sig2;
        float sig6 = sig2 * sig2 * sig2;

        // 12-6 term scaled by LJ14.x plus the LJ14.z / r term, divided by r^2
        float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
        dEdR += LJ14.z * inverseR;
        dEdR *= inverseR * inverseR;

        // Both force entries are read before either one is written back
        unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
        unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
        float4 forceA = gpu->psForce4->_pSysStream[0][offsetA];
        float4 forceB = gpu->psForce4->_pSysStream[0][offsetB];

        d.x *= dEdR;
        d.y *= dEdR;
        d.z *= dEdR;

        forceA.x += d.x;
        forceA.y += d.y;
        forceA.z += d.z;
        forceB.x -= d.x;
        forceB.y -= d.y;
        forceB.z -= d.z;

        gpu->psForce4->_pSysStream[0][offsetA] = forceA;
        gpu->psForce4->_pSysStream[0][offsetB] = forceB;

        printf("%4d: %4d - %4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               pos, atom.x, atom.y, r2, dEdR, sig2, sig6, LJ14.x, LJ14.z);
    }
}
// Downloads the psPosqP4 stream and prints one line per atom: the atom index
// followed by the four components of its entry.
extern "C"
void gpuDumpPrimeCoordinates(gpuContext gpu)
{
    gpu->psPosqP4->Download();

    int atom = 0;
    while (atom < gpu->natoms)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n",
               atom,
               gpu->psPosqP4->_pSysStream[0][atom].x,
               gpu->psPosqP4->_pSysStream[0][atom].y,
               gpu->psPosqP4->_pSysStream[0][atom].z,
               gpu->psPosqP4->_pSysStream[0][atom].w);
        ++atom;
    }
}
// Downloads the force and Born-force streams and prints one line per atom:
// the atom index, the x/y/z force components and the Born force.
//
// The line is written to stdout like the other dump routines in this file; it
// was previously formatted into a local buffer that nothing consumed, so the
// routine produced no output at all.
extern "C"
void gpuDumpForces(gpuContext gpu)
{
    gpu->psForce4->Download();
    gpu->psBornForce->Download();
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n",
               i,
               gpu->psForce4->_pSysStream[0][i].x,
               gpu->psForce4->_pSysStream[0][i].y,
               gpu->psForce4->_pSysStream[0][i].z,
               gpu->psBornForce->_pSysStream[0][i]);
    }
}
// Downloads the position, SigEps, Born-radii and OBC-chain streams and prints
// one line per atom: the atom index, the four position components, the two
// SigEps components, the Born radius and the OBC chain value.
//
// The line is written to stdout like the other dump routines in this file; it
// was previously formatted into a local buffer that nothing consumed. The
// format now has one specifier per argument (the OBC chain value had none).
extern "C"
void gpuDumpAtomData(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psSigEps2->Download();
    gpu->psBornRadii->Download();
    gpu->psObcChain->Download();
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               i,
               gpu->psPosq4->_pSysStream[0][i].x,
               gpu->psPosq4->_pSysStream[0][i].y,
               gpu->psPosq4->_pSysStream[0][i].z,
               gpu->psPosq4->_pSysStream[0][i].w,
               gpu->psSigEps2->_pSysStream[0][i].x,
               gpu->psSigEps2->_pSysStream[0][i].y,
               gpu->psBornRadii->_pSysStream[0][i],
               gpu->psObcChain->_pSysStream[0][i]);
    }
}
// Loads every parameter file from the Data/ directory (printing the count each
// reader returns), then builds the work list, exclusion list and output
// buffers, sets the constants, initializes the random streams, computes the
// initial Born sum and clears the force buffers.
//
// pVoid - the context, passed as an opaque pointer.
extern "C"
void gpuSetup(void* pVoid)
{
    gpuContext gpu = static_cast<gpuContext>(pVoid);

    // Read parameters
    cout << gpuReadAtomicParameters(gpu, "Data/atomicradii.txt")
         << " atom types\n";
    cout << gpuReadBondParameters(gpu, "Data/GromacsHarmonicBondParameter.txt")
         << " bond parameters.\n";
    cout << gpuReadBondAngleParameters(gpu, "Data/GromacsAngleBondParameter.txt")
         << " bond angle parameters.\n";
    cout << gpuReadDihedralParameters(gpu, "Data/GromacsProperDihedralParameter.txt")
         << " proper dihedral parameters.\n";
    cout << gpuReadRbDihedralParameters(gpu, "Data/GromacsRbDihedralParameter.txt")
         << " Ryckaert-Bellemans dihedral parameters.\n";
    cout << gpuReadLJ14Parameters(gpu, "Data/GromacsLJ14Parameter.txt")
         << " Lennard-Jones 1-4 parameters.\n";
    cout << gpuReadCoulombParameters(gpu, "Data/GromacsLJCoulombParameter.txt")
         << " Coulomb parameters.\n";
    cout << gpuReadShakeParameters(gpu, "Data/GromacsShakeParameters.txt")
         << " shake parameters.\n";

    // Work list, exclusion list and output buffers, in that order
    gpuBuildThreadBlockWorkList(gpu);
    gpuBuildExclusionList(gpu);
    gpuBuildOutputBuffers(gpu);

    // Constant blocks and random streams
    gpuSetConstants(gpu);
    gpuInitializeRandoms(gpu);

    // Initial Born sum, then start from cleared force buffers
    kCalculateObcGbsaBornSum(gpu);
    kReduceObcGbsaBornSum(gpu);
    kClearForces(gpu);
    kClearBornForces(gpu);
}
// Dot product of two values exposing .x/.y/.z members. Arguments are
// parenthesized so expressions such as *ptr or a->b can be passed safely;
// each argument is evaluated three times, so avoid side effects.
#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)
// Stores in dp the dot product of v1 and v2 divided by sqrt(|v1|^2 * |v2|^2),
// clamped to [-1, 1] so the result is always a valid acos() argument.
// Wrapped in do/while(0) so it behaves as one statement; temporaries carry a
// macro-specific prefix so they cannot capture caller variables.
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
    do { \
        (dp) = DOT3(v1, v2); \
        float gndpNorm1 = DOT3(v1, v1); \
        float gndpNorm2 = DOT3(v2, v2); \
        (dp) /= sqrt(gndpNorm1 * gndpNorm2); \
        (dp) = min((dp), 1.0f); \
        (dp) = max((dp), -1.0f); \
    } while (0)
// c = v1 x v2 for values exposing .x/.y/.z members. c must not alias v1 or v2,
// because components are written one at a time. Wrapped in do/while(0) so the
// three assignments form a single statement (previously only the first one
// was governed by an unbraced if/else); invoke it with a trailing semicolon.
#define CROSS_PRODUCT(v1, v2, c) \
    do { \
        (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
        (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
        (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
    } while (0)
// Given the cosine of an angle, stores in dEdR the product of param.y and the
// deviation of that angle (radians) from param.x, where param.x is converted
// from degrees to radians by the 3.14159265f / 180.0f factor.
// Wrapped in do/while(0) so it behaves as one statement; temporaries carry a
// macro-specific prefix so they cannot capture caller variables.
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
    do { \
        float gpgacAngle = acos(cosine); \
        float gpgacDeltaIdeal = gpgacAngle - ((param).x * (3.14159265f / 180.0f)); \
        (dEdR) = (param).y * gpgacDeltaIdeal; \
    } while (0)
// Stores in angle the acos() of the clamped, normalised dot product of v1 and
// v2 (see GETNORMEDDOTPRODUCT). Wrapped in do/while(0) so it behaves as one
// statement; the temporary is prefixed so it cannot capture a caller variable.
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
    do { \
        float gabtvDot; \
        GETNORMEDDOTPRODUCT(v1, v2, gabtvDot); \
        (angle) = acos(gabtvDot); \
    } while (0)
// Stores in cosine the clamped, normalised dot product of v1 and v2 (see
// GETNORMEDDOTPRODUCT) and in angle its acos(). Wrapped in do/while(0) so it
// behaves as one statement.
#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
    do { \
        GETNORMEDDOTPRODUCT(v1, v2, cosine); \
        (angle) = acos(cosine); \
    } while (0)
// Signed dihedral angle from three consecutive vectors.
//   cp0 = vector1 x vector2, cp1 = vector2 x vector3 (both written as outputs)
//   angle = angle between cp0 and cp1, negated when dot(signVector, cp1) < 0.
// Wrapped in do/while(0) so it behaves as one statement; the temporary is
// prefixed so it cannot capture a caller variable.
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
    do { \
        CROSS_PRODUCT(vector1, vector2, cp0); \
        CROSS_PRODUCT(vector2, vector3, cp1); \
        GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
        float gdabtvSign = DOT3(signVector, cp1); \
        (angle) = (gdabtvSign >= 0) ? (angle) : -(angle); \
    } while (0)
// Signed dihedral angle and its cosine from three consecutive vectors.
//   cp0 = vector1 x vector2, cp1 = vector2 x vector3 (both written as outputs)
//   cosine = clamped normalised dot product of cp0 and cp1
//   angle  = acos(cosine), negated when dot(signVector, cp1) < 0.
// Wrapped in do/while(0) so it behaves as one statement; the temporary is
// prefixed so it cannot capture a caller variable.
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
    do { \
        CROSS_PRODUCT(vector1, vector2, cp0); \
        CROSS_PRODUCT(vector2, vector3, cp1); \
        GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
        float gdacbtvSign = DOT3(signVector, cp1); \
        (angle) = (gdacbtvSign >= 0) ? (angle) : -(angle); \
    } while (0)
// Calculate Local forces on CPU
//
// Host-side check of the bonded ("local") force terms. Downloads positions,
// forces and every bonded parameter stream, then walks the harmonic bonds and
// accumulates their forces into the HOST copy of psForce4 (nothing is uploaded
// back by this function). Any bond whose length deviates from its ideal value
// by more than 1.0 is printed and counted; if at least one such violation was
// seen, coordinates and forces are dumped.
//
// The bond angle, dihedral, Ryckaert-Bellemans dihedral and LJ 1-4 sections
// are compiled out (#if 0) and kept verbatim for reference.
// NOTE(review): several printf calls inside the disabled code pass whole
// float4 elements to "%9.4f" and unsigned values to "%4d"; fix those before
// re-enabling it.
extern "C"
void kCPUCalculateLocalForces(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psForce4->Download();
    gpu->psBondID->Download();
    gpu->psBondParameter->Download();
    gpu->psBondAngleID1->Download();
    gpu->psBondAngleID2->Download();
    gpu->psBondAngleParameter->Download();
    gpu->psDihedralID1->Download();
    gpu->psDihedralID2->Download();
    gpu->psDihedralParameter->Download();
    gpu->psRbDihedralID1->Download();
    gpu->psRbDihedralID2->Download();
    gpu->psRbDihedralParameter1->Download();
    gpu->psRbDihedralParameter2->Download();
    gpu->psLJ14ID->Download();
    gpu->psLJ14Parameter->Download();

    unsigned int pos = 0;
    Vectors V;              // scratch vectors; only referenced by the disabled code below
    Vectors* A = &V;
    int violations = 0;

    // Harmonic bonds. pos runs up to bond_offset, but only entries below
    // sim.bonds are processed.
    while (pos < gpu->sim.bond_offset)
    {
        if (pos < gpu->sim.bonds)
        {
            // atom.x / atom.y index the two bonded atoms; atom.z / atom.w select
            // which stride-sized force output buffer each contribution goes to.
            int4   atom  = gpu->psBondID->_pSysStream[0][pos];
            float4 atomA = gpu->psPosq4->_pSysStream[0][atom.x];
            float4 atomB = gpu->psPosq4->_pSysStream[0][atom.y];
            float2 bond  = gpu->psBondParameter->_pSysStream[0][pos];   // x: ideal length, y: force scale
            float dx = atomB.x - atomA.x;
            float dy = atomB.y - atomA.y;
            float dz = atomB.z - atomA.z;
            float r2 = dx * dx + dy * dy + dz * dz;
            float r  = sqrt(r2);
            float deltaIdeal = r - bond.x;
            float dEdR = bond.y * deltaIdeal;
            dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;      // guard against coincident atoms
            if (fabs(deltaIdeal) > 1.0f)
            {
                // pos is unsigned, so it is printed with %u (was %d).
                printf("Bond %4u: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", pos, dx, dy, dz, r, deltaIdeal, dEdR);
                violations++;
            }
            dx *= dEdR;
            dy *= dEdR;
            dz *= dEdR;
            unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
            unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
            float4 forceA = gpu->psForce4->_pSysStream[0][offsetA];
            float4 forceB = gpu->psForce4->_pSysStream[0][offsetB];
            forceA.x += dx;
            forceA.y += dy;
            forceA.z += dz;
            forceB.x -= dx;
            forceB.y -= dy;
            forceB.z -= dz;
            gpu->psForce4->_pSysStream[0][offsetA] = forceA;
            gpu->psForce4->_pSysStream[0][offsetB] = forceB;
        }
        pos++;
    }

#if 0
    while (pos < gpu->sim.bond_angle_offset)
    {
        unsigned int pos1 = pos - gpu->sim.bond_offset;
        if (pos1 < gpu->sim.bond_angles)
        {
            int4 atom1 = gpu->psBondAngleID1->_pSysStream[0][pos1];
            float2 bond_angle = gpu->psBondAngleParameter->_pSysStream[0][pos1];
            float4 a1 = gpu->psPosq4->_pSysStream[0][atom1.x];
            float4 a2 = gpu->psPosq4->_pSysStream[0][atom1.y];
            float4 a3 = gpu->psPosq4->_pSysStream[0][atom1.z];
            A->v0.x = a2.x - a1.x;
            A->v0.y = a2.y - a1.y;
            A->v0.z = a2.z - a1.z;
            A->v1.x = a2.x - a3.x;
            A->v1.y = a2.y - a3.y;
            A->v1.z = a2.z - a3.z;
            float3 cp;
            CROSS_PRODUCT(A->v0, A->v1, cp);
            float rp = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
            rp = max(sqrt(rp), 1.0e-06f);
            float r21 = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
            float r23 = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
            float dot = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
            float cosine = dot / sqrt(r21 * r23);
            float dEdR;
            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
            printf("Bond angle %4d %11.4f %11.4f\n", pos1, cosine, dEdR);
            float termA = dEdR / (r21 * rp);
            float termC = -dEdR / (r23 * rp);
            float3 c21;
            float3 c23;
            CROSS_PRODUCT(A->v0, cp, c21);
            CROSS_PRODUCT(A->v1, cp, c23);
            c21.x *= termA;
            c21.y *= termA;
            c21.z *= termA;
            c23.x *= termC;
            c23.y *= termC;
            c23.z *= termC;
            int2 atom2 = gpu->psBondAngleID2->_pSysStream[0][pos1];
            unsigned int offset = atom1.x + atom1.w * gpu->sim.stride;
            float4 force = gpu->psForce4->_pSysStream[0][offset];
            force.x += c21.x;
            force.y += c21.y;
            force.z += c21.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            offset = atom1.y + atom2.x * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x -= (c21.x + c23.x);
            force.y -= (c21.y + c23.y);
            force.z -= (c21.z + c23.z);
            gpu->psForce4->_pSysStream[0][offset] = force;
            offset = atom1.z + atom2.y * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += c23.x;
            force.y += c23.y;
            force.z += c23.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
        }
        pos++;
    }
    while (pos < gpu->sim.dihedral_offset)
    {
        unsigned int pos1 = pos - gpu->sim.bond_angle_offset;
        if (pos1 < gpu->sim.dihedrals)
        {
            int4 atom1 = gpu->psDihedralID1->_pSysStream[0][pos1];
            float4 atomA = gpu->psPosq4->_pSysStream[0][atom1.x];
            float4 atomB = gpu->psPosq4->_pSysStream[0][atom1.y];
            float4 atomC = gpu->psPosq4->_pSysStream[0][atom1.z];
            float4 atomD = gpu->psPosq4->_pSysStream[0][atom1.w];
            A->v0.x = atomA.x - atomB.x;
            A->v0.y = atomA.y - atomB.y;
            A->v0.z = atomA.z - atomB.z;
            A->v1.x = atomC.x - atomB.x;
            A->v1.y = atomC.y - atomB.y;
            A->v1.z = atomC.z - atomB.z;
            A->v2.x = atomC.x - atomD.x;
            A->v2.y = atomC.y - atomD.y;
            A->v2.z = atomC.z - atomD.z;
            float3 cp0, cp1;
            float dihedralAngle;
            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
            float4 dihedral = gpu->psDihedralParameter->_pSysStream[0][pos1];
            float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
            float sinDeltaAngle = sin(deltaAngle);
            float dEdAngle = -dihedral.x * dihedral.z * sinDeltaAngle;
            float normCross1 = DOT3(cp0, cp0);
            float normBC = sqrt(DOT3(A->v1, A->v1));
            float4 ff;
            ff.x = (-dEdAngle * normBC) / normCross1;
            float normCross2 = DOT3(cp1, cp1);
            ff.w = (dEdAngle * normBC) / normCross2;
            float dp = 1.0f / DOT3(A->v1, A->v1);
            ff.y = DOT3(A->v0, A->v1) * dp;
            ff.z = DOT3(A->v2, A->v1) * dp;
            int4 atom2 = gpu->psDihedralID2->_pSysStream[0][pos1];
            float3 internalF0;
            float3 internalF3;
            float3 s;
            // printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
            unsigned int offset = atom1.x + atom2.x * gpu->sim.stride;
            float4 force = gpu->psForce4->_pSysStream[0][offset];
            internalF0.x = ff.x * cp0.x;
            force.x += internalF0.x;
            internalF0.y = ff.x * cp0.y;
            force.y += internalF0.y;
            internalF0.z = ff.x * cp0.z;
            force.z += internalF0.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.w + atom2.w * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            internalF3.x = ff.w * cp1.x;
            force.x += internalF3.x;
            internalF3.y = ff.w * cp1.y;
            force.y += internalF3.y;
            internalF3.z = ff.w * cp1.z;
            force.z += internalF3.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            s.x = ff.y * internalF0.x - ff.z * internalF3.x;
            s.y = ff.y * internalF0.y - ff.z * internalF3.y;
            s.z = ff.y * internalF0.z - ff.z * internalF3.z;
            offset = atom1.y + atom2.y * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF0.x + s.x;
            force.y += -internalF0.y + s.y;
            force.z += -internalF0.z + s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.z + atom2.z * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF3.x - s.x;
            force.y += -internalF3.y - s.y;
            force.z += -internalF3.z - s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
        }
        pos++;
    }
    while (pos < gpu->sim.rb_dihedral_offset)
    {
        unsigned int pos1 = pos - gpu->sim.dihedral_offset;
        if (pos1 < gpu->sim.rb_dihedrals)
        {
            int4 atom1 = gpu->psRbDihedralID1->_pSysStream[0][pos1];
            float4 atomA = gpu->psPosq4->_pSysStream[0][atom1.x];
            float4 atomB = gpu->psPosq4->_pSysStream[0][atom1.y];
            float4 atomC = gpu->psPosq4->_pSysStream[0][atom1.z];
            float4 atomD = gpu->psPosq4->_pSysStream[0][atom1.w];
            A->v0.x = atomA.x - atomB.x;
            A->v0.y = atomA.y - atomB.y;
            A->v0.z = atomA.z - atomB.z;
            A->v1.x = atomC.x - atomB.x;
            A->v1.y = atomC.y - atomB.y;
            A->v1.z = atomC.z - atomB.z;
            A->v2.x = atomC.x - atomD.x;
            A->v2.y = atomC.y - atomD.y;
            A->v2.z = atomC.z - atomD.z;
            float3 cp0, cp1;
            float dihedralAngle, cosPhi;
            // printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
            // printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
            // printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
            if (dihedralAngle < 0.0f )
            {
                dihedralAngle += 3.14159265f;
            }
            else
            {
                dihedralAngle -= 3.14159265f;
            }
            cosPhi = -cosPhi;
            // printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
            float4 dihedral1 = gpu->psRbDihedralParameter1->_pSysStream[0][pos1];
            float2 dihedral2 = gpu->psRbDihedralParameter2->_pSysStream[0][pos1];
            float cosFactor = cosPhi;
            float dEdAngle = -dihedral1.y;
            // printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
            dEdAngle -= 2.0f * dihedral1.z * cosFactor;
            // printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            cosFactor *= cosPhi;
            dEdAngle -= 3.0f * dihedral1.w * cosFactor;
            // printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            cosFactor *= cosPhi;
            dEdAngle -= 4.0f * dihedral2.x * cosFactor;
            // printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            cosFactor *= cosPhi;
            dEdAngle -= 5.0f * dihedral2.y * cosFactor;
            // printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            dEdAngle *= sin(dihedralAngle);
            // printf("%4d - f: %9.4f\n", pos1, dEdAngle);
            float normCross1 = DOT3(cp0, cp0);
            float normBC = sqrt(DOT3(A->v1, A->v1));
            float4 ff;
            ff.x = (-dEdAngle * normBC) / normCross1;
            float normCross2 = DOT3(cp1, cp1);
            ff.w = (dEdAngle * normBC) / normCross2;
            float dp = 1.0f / DOT3(A->v1, A->v1);
            ff.y = DOT3(A->v0, A->v1) * dp;
            ff.z = DOT3(A->v2, A->v1) * dp;
            int4 atom2 = gpu->psRbDihedralID2->_pSysStream[0][pos1];
            float3 internalF0;
            float3 internalF3;
            float3 s;
            printf("RB Dihedral %4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
            unsigned int offset = atom1.x + atom2.x * gpu->sim.stride;
            float4 force = gpu->psForce4->_pSysStream[0][offset];
            internalF0.x = ff.x * cp0.x;
            force.x += internalF0.x;
            internalF0.y = ff.x * cp0.y;
            force.y += internalF0.y;
            internalF0.z = ff.x * cp0.z;
            force.z += internalF0.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("RB Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.w + atom2.w * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            internalF3.x = ff.w * cp1.x;
            force.x += internalF3.x;
            internalF3.y = ff.w * cp1.y;
            force.y += internalF3.y;
            internalF3.z = ff.w * cp1.z;
            force.z += internalF3.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("RB Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            s.x = ff.y * internalF0.x - ff.z * internalF3.x;
            s.y = ff.y * internalF0.y - ff.z * internalF3.y;
            s.z = ff.y * internalF0.z - ff.z * internalF3.z;
            offset = atom1.y + atom2.y * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF0.x + s.x;
            force.y += -internalF0.y + s.y;
            force.z += -internalF0.z + s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("RB Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.z + atom2.z * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF3.x - s.x;
            force.y += -internalF3.y - s.y;
            force.z += -internalF3.z - s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            // printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
        }
        pos++;
    }
    while (pos < gpu->sim.LJ14_offset)
    {
        unsigned int pos1 = pos - gpu->sim.rb_dihedral_offset;
        if (pos1 < gpu->sim.LJ14s)
        {
            int4 atom = gpu->psLJ14ID->_pSysStream[0][pos1];
            float4 LJ14 = gpu->psLJ14Parameter->_pSysStream[0][pos1];
            float4 a1 = gpu->psPosq4->_pSysStream[0][atom.x];
            float4 a2 = gpu->psPosq4->_pSysStream[0][atom.y];
            float3 d;
            d.x = a1.x - a2.x;
            d.y = a1.y - a2.y;
            d.z = a1.z - a2.z;
            float r2 = DOT3(d, d);
            float inverseR = 1.0f / sqrt(r2);
            float sig2 = inverseR * LJ14.y;
            sig2 *= sig2;
            float sig6 = sig2 * sig2 * sig2;
            float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
            dEdR += LJ14.z * inverseR;
            dEdR *= inverseR * inverseR;
            unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
            unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
            float4 forceA = gpu->psForce4->_pSysStream[0][offsetA];
            float4 forceB = gpu->psForce4->_pSysStream[0][offsetB];
            d.x *= dEdR;
            d.y *= dEdR;
            d.z *= dEdR;
            forceA.x += d.x;
            forceA.y += d.y;
            forceA.z += d.z;
            forceB.x -= d.x;
            forceB.y -= d.y;
            forceB.z -= d.z;
            printf("LJ14 %d: %11.4f %11.4f %11.4f\n", pos1, d.x, d.y, d.z);
            gpu->psForce4->_pSysStream[0][offsetA] = forceA;
            gpu->psForce4->_pSysStream[0][offsetB] = forceB;
        }
        pos++;
    }
#endif

    // Something looked wrong: dump the state for inspection.
    if (violations > 0)
    {
        gpuDumpCoordinates(gpu);
        gpuDumpForces(gpu);
    }
}
// Opens "<fname>_<step>.txt" for writing and returns the stream; the caller
// owns it and must fclose() it. On failure a diagnostic is written to stderr
// and the process exits with status -1, so the return value is never NULL.
static FILE* getWriteToFilePtr(char* fname, int step)
{
    std::stringstream fileName;
    fileName << fname << "_" << step << ".txt";

    FILE* filePtr = fopen(fileName.str().c_str(), "w");
    if (filePtr == NULL)
    {
        // Message typo fixed ("writitng") and newline added so the diagnostic
        // is terminated before the process exits.
        (void) fprintf(stderr, "Could not open file=<%s> for writing.\n", fileName.str().c_str());
        exit(-1);
    }
    return filePtr;
}
extern "C" {
// Writes one record to filePtr: the index in a 5-wide field followed by
// numberOfValues floats from values, each as " %18.10e", then a newline.
// The stream is flushed after every record.
static void printValues(FILE* filePtr, int index, int numberOfValues, float* values)
{
    fprintf(filePtr, "%5d ", index);

    for (int column = 0; column < numberOfValues; ++column)
    {
        const float value = values[column];
        fprintf(filePtr, " %18.10e", value);
    }

    fprintf(filePtr, "\n");
    fflush(filePtr);
}
}
// Downloads a float stream and writes its first and last numPrint per-atom
// entries to "<fname>_<step>.txt", one "index value" record per line.
//   numPrint <= 0 or >= gpu->natoms selects all atoms.
// The tail range is clipped so that it never revisits rows already written by
// the head range; previously every overlapping row (all rows in the default
// case) appeared twice in the file.
extern "C"
void WriteArrayToFile1(gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint)
{
    static const int numberOfValues = 1;
    FILE* filePtr = getWriteToFilePtr(fname, step);
    float values[numberOfValues];

    psPos->Download();
    numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;

    // Head: atoms [0, numPrint)
    for (int i = 0; i < numPrint; i++)
    {
        values[0] = psPos->_pSysStream[0][i];
        printValues(filePtr, i, numberOfValues, values);
    }

    // Tail: atoms [natoms - numPrint, natoms), minus anything the head covered.
    int tailStart = gpu->natoms - numPrint;
    if (tailStart < numPrint)
    {
        tailStart = numPrint;
    }
    for (int i = tailStart; i < gpu->natoms; i++)
    {
        values[0] = psPos->_pSysStream[0][i];
        printValues(filePtr, i, numberOfValues, values);
    }

    (void) fclose(filePtr);
}
// Downloads a float2 stream and writes its first and last numPrint per-atom
// entries to "<fname>_<step>.txt", one "index x y" record per line.
//   numPrint <= 0 or >= gpu->natoms selects all atoms.
// The tail range is clipped so that it never revisits rows already written by
// the head range; previously every overlapping row (all rows in the default
// case) appeared twice in the file.
extern "C"
void WriteArrayToFile2(gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint)
{
    static const int numberOfValues = 2;
    FILE* filePtr = getWriteToFilePtr(fname, step);
    float values[numberOfValues];

    psPos->Download();
    numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;

    // Head: atoms [0, numPrint)
    for (int i = 0; i < numPrint; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        printValues(filePtr, i, numberOfValues, values);
    }

    // Tail: atoms [natoms - numPrint, natoms), minus anything the head covered.
    int tailStart = gpu->natoms - numPrint;
    if (tailStart < numPrint)
    {
        tailStart = numPrint;
    }
    for (int i = tailStart; i < gpu->natoms; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        printValues(filePtr, i, numberOfValues, values);
    }

    (void) fclose(filePtr);
}
// Downloads a float4 stream and writes its first and last numPrint per-atom
// entries to "<fname>_<step>.txt", one "index x y z w" record per line.
//   numPrint <= 0 or >= gpu->natoms selects all atoms.
// The tail range is clipped so that it never revisits rows already written by
// the head range; previously every overlapping row (all rows in the default
// case) appeared twice in the file.
extern "C"
void WriteArrayToFile4(gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint)
{
    static const int numberOfValues = 4;
    FILE* filePtr = getWriteToFilePtr(fname, step);
    float values[numberOfValues];

    psPos->Download();
    numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;

    // Head: atoms [0, numPrint)
    for (int i = 0; i < numPrint; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        values[2] = psPos->_pSysStream[0][i].z;
        values[3] = psPos->_pSysStream[0][i].w;
        printValues(filePtr, i, numberOfValues, values);
    }

    // Tail: atoms [natoms - numPrint, natoms), minus anything the head covered.
    int tailStart = gpu->natoms - numPrint;
    if (tailStart < numPrint)
    {
        tailStart = numPrint;
    }
    for (int i = tailStart; i < gpu->natoms; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        values[2] = psPos->_pSysStream[0][i].z;
        values[3] = psPos->_pSysStream[0][i].w;
        printValues(filePtr, i, numberOfValues, values);
    }

    (void) fclose(filePtr);
}
// Debug dump of the OBC inputs: refreshes the host copies of psPosq4,
// psBornRadii, psObcData and psBornSum, then prints one line per atom with
// posq x/y/z/w, Born radius, Born sum and the two psObcData components.
// NOTE(review): the header line names seven columns but eight values are
// printed per atom (the Born sum column has no label).
extern "C"
void gpuDumpObcInfo(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psBornRadii->Download();
    gpu->psObcData->Download();
    gpu->psBornSum->Download();

    printf("\n\nObc Info xyzw Brad atomR scaledAtomR\n");

    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        float4 posq       = gpu->psPosq4->_pSysStream[0][atom];
        float  bornRadius = gpu->psBornRadii->_pSysStream[0][atom];
        float  bornSum    = gpu->psBornSum->_pSysStream[0][atom];
        float2 obcData    = gpu->psObcData->_pSysStream[0][atom];

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", atom,
               posq.x, posq.y, posq.z, posq.w,
               bornRadius,
               bornSum,
               obcData.x, obcData.y);
    }
}
// Debug dump after the first OBC force loop: refreshes the host copies of
// psForce4, psBornRadii, psBornForce, psObcChain and psBornSum, then prints
// one line per atom with force x/y/z, Born radius, the Born force divided by
// (radius^2 * chain), the raw Born force and the OBC chain value.
// force.w and the Born sum are downloaded but not printed.
// NOTE(review): the header line has fewer labels than printed columns (the
// derived Born-force column is unlabelled).
extern "C"
void gpuDumpObcLoop1(gpuContext gpu)
{
    gpu->psForce4->Download();
    gpu->psBornRadii->Download();
    gpu->psBornForce->Download();
    gpu->psObcChain->Download();
    gpu->psBornSum->Download();

    printf("\n\nObc F3 BrnR BrnF Chn\n");

    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        float4 force      = gpu->psForce4->_pSysStream[0][atom];
        float  bornRadius = gpu->psBornRadii->_pSysStream[0][atom];
        float  bornForce  = gpu->psBornForce->_pSysStream[0][atom];
        float  obcChain   = gpu->psObcChain->_pSysStream[0][atom];

        // Born force with the radius^2 * chain factor divided back out.
        float compF = bornForce / (bornRadius * bornRadius * obcChain);

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", atom,
               force.x, force.y, force.z,
               bornRadius,
               compF,
               bornForce,
               obcChain);
    }
}
platforms/cuda/src/kernels/gputypes.h
0 → 100755
View file @
38f6c8f8
#ifndef __GPUTYPES_H__
#define __GPUTYPES_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include <string>
#include <vector>
// One entry of the atom type table (see _gpuContext::gpAtomTable).
// The name is spelled std::string so this header no longer depends on the
// including file having issued `using namespace std;` first.
struct gpuAtomType
{
    std::string name;   // atom type name
    char        symbol; // single-character symbol for the type
    float       r;      // radius; presumably the value returned by gpuGetAtomicRadius -- confirm
};
enum
SM_VERSION
{
SM_10
,
SM_11
,
SM_12
};
/* Pointer to this structure will be given
 * to gromacs functions*/
// Host-side state of one GPU simulation. Every ps* member points to a
// CUDAStream; the host code in gpu.cpp refreshes a stream's host copy with
// Download() and reads it through _pSysStream[0][index].
struct _gpuContext
{
    //Cache this here so that it doesn't
    //have to be repeatedly passed around
    int natoms;
    gpuAtomType* gpAtomTable;               // atom type table; presumably gAtomTypes entries -- confirm
    int gAtomTypes;
    cudaGmxSimulation sim;                  // simulation constants; copied to/from the device-side cSim symbol
    unsigned int* pOutputBufferCounter;
    unsigned int* pExclusion;
    unsigned char* pAtomSymbol;
    float iterations;
    float epsfac;
    float solventDielectric;
    float soluteDielectric;
    int grid;
    bool bCalculateCM;
    bool bRemoveCM;
    bool bRecalculateBornRadii;
    unsigned long seed;
    SM_VERSION sm_version;

    // Per-atom state. psPosq4 x/y/z are used as coordinates by the CPU bond
    // check; w is printed alongside them by the dump helpers.
    CUDAStream<float4>* psPosq4;
    CUDAStream<float4>* psPosqP4;
    CUDAStream<float4>* psOldPosq4;
    CUDAStream<float4>* psVelm4;
    CUDAStream<float4>* psForce4;           // force accumulators, addressed as atom + buffer * sim.stride
    CUDAStream<float4>* psxVector4;
    CUDAStream<float4>* psvVector4;
    CUDAStream<float2>* psSigEps2;
    CUDAStream<float2>* psObcData;
    CUDAStream<float>* psObcChain;
    CUDAStream<float>* psBornForce;
    CUDAStream<float>* psBornRadii;
    CUDAStream<float>* psBornSum;

    // Bonded terms. For bonds, the CPU check reads ID x/y as the two atom
    // indices and z/w as their force output buffer indices; parameter x is the
    // ideal length and y scales the deviation.
    CUDAStream<int4>* psBondID;
    CUDAStream<float2>* psBondParameter;
    CUDAStream<int4>* psBondAngleID1;
    CUDAStream<int2>* psBondAngleID2;
    CUDAStream<float2>* psBondAngleParameter;
    CUDAStream<int4>* psDihedralID1;
    CUDAStream<int4>* psDihedralID2;
    CUDAStream<float4>* psDihedralParameter;
    CUDAStream<int4>* psRbDihedralID1;
    CUDAStream<int4>* psRbDihedralID2;
    CUDAStream<float4>* psRbDihedralParameter1;
    CUDAStream<float2>* psRbDihedralParameter2;
    CUDAStream<int4>* psLJ14ID;
    CUDAStream<float4>* psLJ14Parameter;

    // Constraint, exclusion and work-list data.
    CUDAStream<int>* psNonShakeID;
    CUDAStream<int4>* psShakeID;
    CUDAStream<float4>* psShakeParameter;
    CUDAStream<unsigned int>* psExclusion;
    CUDAStream<unsigned int>* psWorkUnit;

    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
};

// Opaque handle used throughout the API: a pointer to the context above.
typedef struct _gpuContext *gpuContext;
// Function prototypes
//
// Every standard-library type is spelled with its std:: qualifier so the
// header no longer depends on the including file having issued
// `using namespace std;` first (previously `string` and the inner `vector`
// of the exclusions parameter were unqualified). The types are unchanged.

// Device discovery
extern "C" bool gpuIsAvailable();

// Parameter loading: gpuRead* take a file name and return a count (printed by
// gpuSetup); gpuSet* take the same kind of data as in-memory vectors.
extern "C" int gpuReadBondParameters(gpuContext gpu, char* fname);
extern "C" void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
extern "C" int gpuReadBondAngleParameters(gpuContext gpu, char* fname);
extern "C" void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
        const std::vector<float>& angle, const std::vector<float>& k);
extern "C" int gpuReadDihedralParameters(gpuContext gpu, char* fname);
extern "C" void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
        const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
extern "C" int gpuReadRbDihedralParameters(gpuContext gpu, char* fname);
extern "C" void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
        const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
extern "C" int gpuReadLJ14Parameters(gpuContext gpu, char* fname);
extern "C" void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
extern "C" float gpuGetAtomicRadius(gpuContext gpu, std::string s);
extern "C" unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
extern "C" int gpuReadAtomicParameters(gpuContext gpu, char* fname);
extern "C" int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern "C" void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions);
extern "C" void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
extern "C" int gpuReadShakeParameters(gpuContext gpu, char* fname);
extern "C" void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float tolerance);

// Context lifetime and per-atom state
extern "C" int gpuAllocateInitialBuffers(gpuContext gpu);
extern "C" void gpuReadCoordinates(gpuContext gpu, char* fname);
extern "C" void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
extern "C" void gpuSetVelocities(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
extern "C" void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);
extern "C" void gpuInitializeRandoms(gpuContext gpu);
extern "C" void* gpuInitFromFile(char* fname);
extern "C" void* gpuInit(int numAtoms);

// Integrator / thermostat configuration
extern "C" void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
extern "C" void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT);
extern "C" void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
extern "C" void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability);
extern "C" void gpuShutDown(gpuContext gpu);

// Derived data structures (called in this order by gpuSetup)
extern "C" int gpuBuildOutputBuffers(gpuContext gpu);
extern "C" int gpuBuildThreadBlockWorkList(gpuContext gpu);
extern "C" int gpuBuildExclusionList(gpuContext gpu);
extern "C" int gpuSetConstants(gpuContext gpu);

// Debugging aids
extern "C" void gpuDumpCoordinates(gpuContext gpu);
extern "C" void gpuDumpPrimeCoordinates(gpuContext gpu);
extern "C" void gpuDumpForces(gpuContext gpu);
extern "C" void gpuDumpAtomData(gpuContext gpu);
extern "C" bool gpuCheckData(gpuContext gpu);
extern "C" void gpuSetup(void* pVoid);
extern "C" void kCPUCalculate14(gpuContext gpu);
extern "C" void kCPUCalculateLocalForces(gpuContext gpu);
extern "C" void WriteArrayToFile1(gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint);
extern "C" void WriteArrayToFile2(gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint);
extern "C" void WriteArrayToFile3(gpuContext gpu, char* fname, int step, CUDAStream<float3>* psPos, int numPrint);
extern "C" void WriteArrayToFile4(gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint);
extern "C" void gpuDumpObcInfo(gpuContext gpu);
extern "C" void gpuDumpObcLoop1(gpuContext gpu);
#endif //__GPUTYPES_H__
platforms/cuda/src/kernels/kBrownianUpdate.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#define DeltaShake
// File-local constant-memory copy of the simulation parameters. It is filled
// from gpu->sim by SetBrownianUpdateSim() and read by the kernels below.
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetBrownianUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Reads this file's constant-memory copy of the simulation parameters (cSim)
// back into the host-side structure gpu->sim. The copy status is handed to
// RTERROR together with a message naming this (Get) direction.
void GetBrownianUpdateSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message previously said "SetSim", which pointed at the wrong routine.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// First half of the Brownian update. For every atom handled by this thread:
//   * the current position is saved to pOldPosq,
//   * a displacement force * GDT + random is formed (DeltaShake build), or
//     that displacement is added to the current position (non-DeltaShake),
//   * the result is stored in pPosqP.
// The .w component of the position is carried through unchanged.
// The per-block random offset is only read here; it is not advanced.
__global__ void kBrownianUpdatePart1_kernel()
{
    const unsigned int gridStep   = blockDim.x * gridDim.x;
    const unsigned int randomBase = cSim.pRandomPosition[blockIdx.x];
    __syncthreads();

    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
         atom < cSim.atoms;
         atom += gridStep)
    {
        float4 noise = cSim.pRandom4a[randomBase + atom];
        float4 p     = cSim.pPosq[atom];
        float4 f     = cSim.pForce4[atom];

        // Keep the pre-update position for later stages.
        cSim.pOldPosq[atom] = p;

#ifndef DeltaShake
        // Absolute new position.
        p.x += f.x * cSim.GDT + noise.x;
        p.y += f.y * cSim.GDT + noise.y;
        p.z += f.z * cSim.GDT + noise.z;
#else
        // Displacement only; the old position is added back in part 2.
        p.x = f.x * cSim.GDT + noise.x;
        p.y = f.y * cSim.GDT + noise.y;
        p.z = f.z * cSim.GDT + noise.z;
#endif

        cSim.pPosqP[atom] = p;
    }
}
// Host-side launcher for kBrownianUpdatePart1_kernel, using the block count
// and update thread count stored in gpu->sim. LAUNCHERROR is invoked after
// the launch with this routine's name.
void kBrownianUpdatePart1(gpuContext gpu)
{
    const dim3 grid(gpu->sim.blocks);
    const dim3 block(gpu->sim.update_threads_per_block);

    kBrownianUpdatePart1_kernel<<<grid, block>>>();
    LAUNCHERROR("kBrownianUpdatePart1");
}
// Second half of the Brownian update. For every atom handled by this thread
// the velocity is recomputed from the value stored in pPosqP and the final
// position is written back to pPosq:
//   * DeltaShake build: pPosqP holds a displacement; velocity is
//     displacement * oneOverDeltaT and the new position is old + displacement.
//   * otherwise: pPosqP holds the new position; velocity is
//     (new - old) * oneOverDeltaT.
// Afterwards thread 0 of each block advances that block's entry in
// pRandomPosition by paddedNumberOfAtoms, wrapping by cSim.randoms.
__global__ void kBrownianUpdatePart2_kernel()
{
    const unsigned int gridStep = blockDim.x * gridDim.x;
    unsigned int randomBase     = cSim.pRandomPosition[blockIdx.x];
    // Every thread has read the offset before thread 0 may overwrite it below.
    __syncthreads();

    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
         atom < cSim.atoms;
         atom += gridStep)
    {
        float4 vel    = cSim.pVelm4[atom];
        float4 oldPos = cSim.pPosq[atom];
        float4 newPos = cSim.pPosqP[atom];

#ifndef DeltaShake
        vel.x = cSim.oneOverDeltaT * (newPos.x - oldPos.x);
        vel.y = cSim.oneOverDeltaT * (newPos.y - oldPos.y);
        vel.z = cSim.oneOverDeltaT * (newPos.z - oldPos.z);
#else
        vel.x = cSim.oneOverDeltaT * (newPos.x);
        vel.y = cSim.oneOverDeltaT * (newPos.y);
        vel.z = cSim.oneOverDeltaT * (newPos.z);
        newPos.x += oldPos.x;
        newPos.y += oldPos.y;
        newPos.z += oldPos.z;
#endif

        cSim.pPosq[atom]  = newPos;
        cSim.pVelm4[atom] = vel;
    }

    // Advance this block's random-number offset for the next step.
    if (threadIdx.x == 0)
    {
        randomBase += cSim.paddedNumberOfAtoms;
        if (randomBase > cSim.randoms)
        {
            randomBase -= cSim.randoms;
        }
        cSim.pRandomPosition[blockIdx.x] = randomBase;
    }
}
extern void kGenerateRandoms(gpuContext gpu);

// Host-side launcher for kBrownianUpdatePart2_kernel. A function-local static
// counter tracks how many times this routine has run since the last refill;
// when it reaches gpu->sim.randomIterations, kGenerateRandoms() is called and
// the counter restarts from zero.
void kBrownianUpdatePart2(gpuContext gpu)
{
    const dim3 grid(gpu->sim.blocks);
    const dim3 block(gpu->sim.update_threads_per_block);

    kBrownianUpdatePart2_kernel<<<grid, block>>>();
    LAUNCHERROR("kBrownianUpdatePart2");

    // Refill the random-number pool once enough steps have consumed it.
    static int stepsSinceRefill = 0;
    if (++stepsSinceRefill != gpu->sim.randomIterations)
    {
        return;
    }
    kGenerateRandoms(gpu);
    stepsSinceRefill = 0;
}
platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// File-local constant-memory copy of the simulation parameters. It is filled
// from gpu->sim by SetCalculateAndersenThermostatSim() and read by the kernel
// below.
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetCalculateAndersenThermostatSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Reads this file's constant-memory copy of the simulation parameters (cSim)
// back into the host-side structure gpu->sim. The copy status is handed to
// RTERROR together with a message naming this (Get) direction.
void GetCalculateAndersenThermostatSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message previously said "SetSim", which pointed at the wrong routine.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Andersen thermostat step. For every atom handled by this thread a random
// value (random4a.w) is compared against cSim.collisionProbability:
//   * below the threshold: the velocity is replaced by
//     sqrt(kT * velocity.w) * random4a.{x,y,z};
//   * otherwise: the velocity is left unchanged.
// NOTE(review): velocity.w is presumably the inverse mass -- confirm.
// Afterwards thread 0 of each block advances that block's entry in
// pRandomPosition by paddedNumberOfAtoms, wrapping by cSim.randoms.
__global__ void kCalculateAndersenThermostat_kernel()
{
    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    // Every thread has read the offset before thread 0 may overwrite it below.
    __syncthreads();

    while (pos < cSim.atoms)
    {
        float4 velocity = cSim.pVelm4[pos];
        float4 random4a = cSim.pRandom4a[rpos + pos];

        // Single-precision literals keep the arithmetic in float. The factor
        // (1 - scale) is exactly 0 or 1, so results match the former
        // double-literal form while avoiding double-precision operations.
        float scale = (random4a.w < cSim.collisionProbability ? 0.0f : 1.0f);
        float add = (1.0f - scale) * sqrt(cSim.kT * velocity.w);

        velocity.x = scale * velocity.x + add * random4a.x;
        velocity.y = scale * velocity.y + add * random4a.y;
        velocity.z = scale * velocity.z + add * random4a.z;
        cSim.pVelm4[pos] = velocity;
        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }
}
extern void kGenerateRandoms(gpuContext gpu);

// Host-side launcher for kCalculateAndersenThermostat_kernel. A function-local
// static counter tracks how many times this routine has run since the last
// refill; when it reaches gpu->sim.randomIterations, kGenerateRandoms() is
// called and the counter restarts from zero.
void kCalculateAndersenThermostat(gpuContext gpu)
{
    const dim3 grid(gpu->sim.blocks);
    const dim3 block(gpu->sim.update_threads_per_block);

    kCalculateAndersenThermostat_kernel<<<grid, block>>>();
    LAUNCHERROR("kCalculateAndersenThermostat");

    // Refill the random-number pool once enough steps have consumed it.
    static int stepsSinceRefill = 0;
    if (++stepsSinceRefill != gpu->sim.randomIterations)
    {
        return;
    }
    kGenerateRandoms(gpu);
    stepsSinceRefill = 0;
}
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread staging record kept in shared memory by the CDLJ force kernel.
struct Atom {
    float x;     // position x (copied from pPosq[...].x)
    float y;     // position y
    float z;     // position z
    float q;     // charge (copied from pPosq[...].w)
    float sig;   // first pAttr component of the staged atom (summed pairwise by the kernel)
    float eps;   // second pAttr component of the staged atom (multiplied pairwise by the kernel)
    float fx;    // force accumulated on the staged atom, x
    float fy;    // force accumulated on the staged atom, y
    float fz;    // force accumulated on the staged atom, z
    float eps2;  // second pAttr component of the thread's own (i) atom
    float sig2;  // first pAttr component of the thread's own (i) atom
};
// One staging record per thread; the kernel indexes it with threadIdx.x.
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
// This block's slice of the global work-unit queue (cSim.pWorkUnit).
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
// Successor table: sNext[k] = (k + 1) & (GRID - 1), filled at kernel start.
__shared__ unsigned int sNext[GRID];

// File-local constant-memory copy of the simulation parameters. It is filled
// from gpu->sim by SetCalculateCDLJForcesSim().
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetCalculateCDLJForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Reads this file's constant-memory copy of the simulation parameters (cSim)
// back into the host-side structure gpu->sim. The copy status is handed to
// RTERROR together with a message naming this (Get) direction.
void GetCalculateCDLJForcesSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message previously said "SetSim", which pointed at the wrong routine.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Coulomb + Lennard-Jones pair-force kernel driven by a queue of work units.
//
// Each work unit is a packed 32-bit word:
//   bit 0       exclusion flag (exclusion masks must be consulted)
//   bits 2..16  y tile index
//   bits 17..   x tile index
// Tile indices are shifted left by GRIDBITS to obtain the first atom of the
// tile. Threads are grouped in runs of GRID (tgx = lane inside the run,
// tbx = first thread of the run); each run processes one work unit per
// iteration of the outer loop.
//
// Results are written to cSim.pForce4a at
//   atom + (partner tile index) * cSim.stride
// so every (tile, partner tile) pair has its own output slot.
//
// NOTE(review): shared-memory slots written by one thread are read by other
// threads of the same GRID-sized run without an intervening barrier; this
// presumably relies on those threads executing in lockstep -- confirm before
// reordering anything in this kernel.
__global__ void kCalculateCDLJForces_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Successor table used to walk the tile in a rotated order.
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;
    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        bool bExclusionFlag = (x & 0x1);
        x = (x >> 17) << GRIDBITS;
        float4 apos;    // Local atom x, y, z, q
        float3 af;      // Local atom fx, fy, fz
        float dx;
        float dy;
        float dz;
        float r2;
        float invR;
        float sig;
        float sig2;
        float sig6;
        float eps;
        float dEdR;
        unsigned int tgx = threadIdx.x & (GRID - 1);   // lane within the run of GRID threads
        unsigned int tbx = threadIdx.x - tgx;          // first thread of that run
        int tj = tgx;                                  // rotating partner index (off-diagonal tiles)
        Atom* psA = &sA[tbx];                          // this run's window into sA
        if (!bExclusionFlag)
        {
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // Read fixed atom data into registers and GRF
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                apos.w *= cSim.epsfac;
                // NOTE(review): j == tgx pairs the atom with itself (r2 == 0);
                // nothing in this branch masks that pair -- confirm diagonal
                // tiles always arrive with the exclusion flag set.
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;    // Lennard-Jones term
                    dEdR += apos.w * psA[j].q * invR;             // Coulomb term
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                // Stage the y-tile atom; the x-tile atom stays in registers.
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                sA[threadIdx.x].sig2 = a.x;
                sA[threadIdx.x].eps2 = a.y;
                apos.w *= cSim.epsfac;
                // tj starts at this thread's own lane and advances through
                // sNext, so each thread starts on a different staged atom.
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    // Equal and opposite contribution for the staged atom.
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                // Force accumulated on the y-tile atom staged by this thread.
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }
        else // bExclusion
        {
            // Read exclusion data
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // Read fixed atom data into registers and GRF
                // Bit j of excl (consumed low bit first) enables the pair with
                // staged atom j; a cleared bit zeroes that pair's force.
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                sA[threadIdx.x].sig2 = a.x;
                sA[threadIdx.x].eps2 = a.y;
                apos.w *= cSim.epsfac;
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    // psA[tgx] is this thread's own slot, so sig2/eps2 read
                    // back the values of a.x/a.y stored above.
                    sig = psA[tgx].sig2 + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = psA[tgx].eps2 * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[j].q * invR;
                    dEdR *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    excl >>= 1;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                // Rotate the mask so its low bit lines up with the first
                // partner visited (tj == tgx).
                // NOTE(review): for tgx == 0 the left-shift count equals GRID;
                // if GRID is the bit width of unsigned int that shift is
                // undefined in C++ -- confirm the behaviour on the target.
                excl = (excl >> tgx) | (excl << (GRID - tgx));
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                sA[threadIdx.x].sig2 = a.x;
                sA[threadIdx.x].eps2 = a.y;
                apos.w *= cSim.epsfac;
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = psA[tgx].sig2 + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = psA[tgx].eps2 * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    excl >>= 1;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }
        // Step to this run's next work unit.
        pos -= cSim.nonbond_workBlock;
    }
}
__global__ extern void kCalculateCDLJForces_12_kernel();

// Host-side launcher for the CDLJ force calculation. Devices whose sm_version
// is at least SM_12 get kCalculateCDLJForces_12_kernel; older ones get
// kCalculateCDLJForces_kernel. Both use the non-bonded launch geometry stored
// in gpu->sim.
void kCalculateCDLJForces(gpuContext gpu)
{
    const dim3 grid(gpu->sim.nonbond_blocks);
    const dim3 block(gpu->sim.nonbond_threads_per_block);

    if (gpu->sm_version >= SM_12)
    {
        kCalculateCDLJForces_12_kernel<<<grid, block>>>();
    }
    else
    {
        kCalculateCDLJForces_kernel<<<grid, block>>>();
    }
    LAUNCHERROR("kCalculateCDLJForces");
}
\ No newline at end of file
platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread staging record kept in shared memory by the CDLJ (_12) force
// kernel.
struct Atom {
    float x;    // position x (copied from pPosq[...].x)
    float y;    // position y
    float z;    // position z
    float q;    // charge (copied from pPosq[...].w)
    float sig;  // first pAttr component of the staged atom (summed pairwise by the kernel)
    float eps;  // second pAttr component of the staged atom (multiplied pairwise by the kernel)
    float fx;   // force accumulated on the staged atom, x
    float fy;   // force accumulated on the staged atom, y
    float fz;   // force accumulated on the staged atom, z
};
// One staging record per thread; the kernel indexes it with threadIdx.x.
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// This block's slice of the global work-unit queue (cSim.pWorkUnit).
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Successor table: sNext[k] = (k + 1) & (GRID - 1), filled at kernel start.
__shared__ unsigned int sNext[GRID];

// File-local constant-memory copy of the simulation parameters. It is filled
// from gpu->sim by SetCalculateCDLJForces_12Sim().
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetCalculateCDLJForces_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Reads this file's constant-memory copy of the simulation parameters (cSim)
// back into the host-side structure gpu->sim. The copy status is handed to
// RTERROR together with a message naming this (Get) direction.
void GetCalculateCDLJForces_12Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message previously said "SetSim", which pointed at the wrong routine.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Coulomb + Lennard-Jones pair-force kernel driven by a queue of work units
// (variant launched by kCalculateCDLJForces_12).
//
// Each work unit is a packed 32-bit word:
//   bit 0       exclusion flag (exclusion masks must be consulted)
//   bits 2..16  y tile index
//   bits 17..   x tile index
// Tile indices are shifted left by GRIDBITS to obtain the first atom of the
// tile. Threads are grouped in runs of GRID (tgx = lane inside the run,
// tbx = first thread of the run); each run processes one work unit per
// iteration of the outer loop.
//
// Results are written to cSim.pForce4a at
//   atom + (partner tile index) * cSim.stride
// so every (tile, partner tile) pair has its own output slot.
//
// NOTE(review): shared-memory slots written by one thread are read by other
// threads of the same GRID-sized run without an intervening barrier; this
// presumably relies on those threads executing in lockstep -- confirm before
// reordering anything in this kernel.
__global__ void kCalculateCDLJForces_12_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Successor table used to walk the tile in a rotated order.
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;
    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        bool bExclusionFlag = (x & 0x1);
        x = (x >> 17) << GRIDBITS;
        float4 apos;    // Local atom x, y, z, q
        float3 af;      // Local atom fx, fy, fz
        float dx;
        float dy;
        float dz;
        float r2;
        float invR;
        float sig;
        float sig2;
        float sig6;
        float eps;
        float dEdR;
        unsigned int tgx = threadIdx.x & (GRID - 1);   // lane within the run of GRID threads
        unsigned int tbx = threadIdx.x - tgx;          // first thread of that run
        int tj = tgx;                                  // rotating partner index (off-diagonal tiles)
        Atom* psA = &sA[tbx];                          // this run's window into sA
        if (!bExclusionFlag)
        {
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // Read fixed atom data into registers and GRF
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                apos.w *= cSim.epsfac;
                // NOTE(review): j == tgx pairs the atom with itself (r2 == 0);
                // nothing in this branch masks that pair -- confirm diagonal
                // tiles always arrive with the exclusion flag set.
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;    // Lennard-Jones term
                    dEdR += apos.w * psA[j].q * invR;             // Coulomb term
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                // Stage the y-tile atom; the x-tile atom stays in registers.
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                apos.w *= cSim.epsfac;
                // tj starts at this thread's own lane and advances through
                // sNext, so each thread starts on a different staged atom.
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    // Equal and opposite contribution for the staged atom.
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                // Force accumulated on the y-tile atom staged by this thread.
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }
        else // bExclusion
        {
            // Read exclusion data
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // Read fixed atom data into registers and GRF
                // Bit j of excl (consumed low bit first) enables the pair with
                // staged atom j; a cleared bit zeroes that pair's force.
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                apos.w *= cSim.epsfac;
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[j].q * invR;
                    dEdR *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    excl >>= 1;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                // Rotate the mask so its low bit lines up with the first
                // partner visited (tj == tgx).
                // NOTE(review): for tgx == 0 the left-shift count equals GRID;
                // if GRID is the bit width of unsigned int that shift is
                // undefined in C++ -- confirm the behaviour on the target.
                excl = (excl >> tgx) | (excl << (GRID - tgx));
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                apos.w *= cSim.epsfac;
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    excl >>= 1;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }
        // Step to this run's next work unit.
        pos -= cSim.nonbond_workBlock;
    }
}
// Host-side launcher for kCalculateCDLJForces_12_kernel, using the non-bonded
// launch geometry stored in gpu->sim. LAUNCHERROR is invoked after the launch
// with this routine's name.
void kCalculateCDLJForces_12(gpuContext gpu)
{
    const dim3 grid(gpu->sim.nonbond_blocks);
    const dim3 block(gpu->sim.nonbond_threads_per_block);

    kCalculateCDLJForces_12_kernel<<<grid, block>>>();
    LAUNCHERROR("kCalculateCDLJForces_12");
}
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#include "cudaKernels.h"
// Per-thread staging record kept in shared memory by the combined
// CDLJ + OBC/GBSA force kernel.
struct Atom {
    float x;     // position x (copied from pPosq[...].x)
    float y;     // position y
    float z;     // position z
    float q;     // charge scaled by cSim.epsfac
    float sig;   // first pAttr component
    float eps;   // second pAttr component
    float br;    // value read from cSim.pBornRadii for this atom
    float fx;    // presumably accumulated force x -- confirm against kernel
    float fy;    // presumably accumulated force y -- confirm against kernel
    float fz;    // presumably accumulated force z -- confirm against kernel
    float fb;    // NOTE(review): looks like a Born-radius force accumulator -- confirm
    float q2;    // charge scaled by cSim.preFactor
    float junk;  // NOTE(review): purpose not evident from the name; possibly padding -- confirm
};
// One staging record per thread; the kernel indexes it with threadIdx.x.
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
// This block's slice of the global work-unit queue (cSim.pWorkUnit).
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
// Successor table: sNext[k] = (k + 1) & (GRID - 1), filled at kernel start.
__shared__ unsigned int sNext[GRID];

// File-local constant-memory copy of the simulation parameters. It is filled
// from gpu->sim by SetCalculateCDLJObcGbsaForces1Sim().
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Reads this file's constant-memory copy of the simulation parameters (cSim)
// back into the host-side structure gpu->sim. The copy status is handed to
// RTERROR together with a message naming this (Get) direction.
void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message previously said "SetSim", which pointed at the wrong routine.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
__global__
void
kCalculateCDLJObcGbsaForces1_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
float
br
=
cSim
.
pBornRadii
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
// Kernel variant selected below for devices reporting sm_version >= SM_12;
// only declared here.
__global__ extern void kCalculateCDLJObcGbsaForces1_12_kernel();

/**
 * Host entry point for the combined CDLJ + OBC/GBSA (loop 1) force pass.
 *
 * If gpu->bRecalculateBornRadii is set, the Born sum is computed and reduced
 * first. The force kernel is then launched with gpu->sim.nonbond_blocks blocks
 * of gpu->sim.nonbond_threads_per_block threads, choosing the kernel variant
 * from gpu->sm_version. LAUNCHERROR is invoked afterwards with this function's
 * name.
 *
 * @param gpu  context supplying the launch geometry and device capability.
 */
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
    // Refresh Born radii when the context asks for it
    if (gpu->bRecalculateBornRadii)
    {
        kCalculateObcGbsaBornSum(gpu);
        kReduceObcGbsaBornSum(gpu);
    }

    if (gpu->sm_version >= SM_12)
    {
        kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateCDLJObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }

    // Debug dump, compiled but never executed
    if (0)
    {
        static int step = 0;
        step++;
        kReduceBornSumAndForces(gpu);
        gpuDumpObcLoop1(gpu);
    }

    LAUNCHERROR("kCalculateCDLJObcGbsaForces1");
}
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread staging record kept in shared memory by the CDLJ + OBC/GBSA
// force kernel in this file. Field order and count define the shared-memory
// layout and must not change.
struct Atom
{
    float x, y, z;      // coordinates copied from cSim.pPosq[...].x/.y/.z
    float q;            // pPosq[...].w, stored unscaled by the kernel
    float sig, eps;     // copied from cSim.pAttr[...].x / .y
    float br;           // copied from cSim.pBornRadii[...]
    float fx, fy, fz;   // force accumulated on this tile atom (off-diagonal tiles)
    float fb;           // Born-force accumulator for this tile atom
};
// Shared-memory tile buffer: the kernel in this file writes slot
// sA[threadIdx.x] and reads the slots of its own GRID-sized thread group.
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// This block's slice of the work-unit queue, copied from cSim.pWorkUnit at
// the start of the kernel.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Successor table filled by the kernel as sNext[k] = (k + 1) & (GRID - 1);
// used to walk tile slots cyclically.
__shared__ unsigned int sNext[GRID];
// Constant-memory copy of the host-side simulation descriptor; written by
// SetCalculateCDLJObcGbsaForces1_12Sim() and read back by the matching Get.
static __constant__ cudaGmxSimulation cSim;
/**
 * Upload the host-side simulation descriptor (gpu->sim) into this
 * translation unit's constant-memory copy, cSim.
 *
 * @param gpu  context whose `sim` member is copied to the device symbol.
 *
 * The copy status is handed to RTERROR together with a diagnostic message.
 */
void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
/**
 * Read this translation unit's constant-memory descriptor (cSim) back into
 * the host-side gpu->sim.
 *
 * @param gpu  context whose `sim` member is overwritten with the device copy.
 *
 * The copy status is handed to RTERROR together with a diagnostic message.
 * NOTE(review): the message text says "SetSim" although this is the
 * read-back path; it is reproduced unchanged here.
 */
void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__
void
kCalculateCDLJObcGbsaForces1_12_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
float
br
=
cSim
.
pBornRadii
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
float
q2
=
cSim
.
preFactor
*
apos
.
w
;
apos
.
w
*=
cSim
.
epsfac
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
j
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
float
q2
=
apos
.
w
*
cSim
.
preFactor
;
apos
.
w
*=
cSim
.
epsfac
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
tj
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
float
q2
=
cSim
.
preFactor
*
apos
.
w
;
apos
.
w
*=
cSim
.
epsfac
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
j
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
int
j
=
y
+
tgx
;
float
q2
=
cSim
.
preFactor
*
apos
.
w
;
apos
.
w
*=
cSim
.
epsfac
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
tj
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
/**
 * Host launcher for kCalculateCDLJObcGbsaForces1_12_kernel.
 *
 * Launches gpu->sim.nonbond_blocks blocks of
 * gpu->sim.nonbond_threads_per_block threads and then invokes LAUNCHERROR
 * with this function's name.
 *
 * @param gpu  context supplying the launch geometry.
 */
void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu)
{
    kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateCDLJObcGbsaForces1_12");
}
platforms/cuda/src/kernels/kCalculateLocalForces.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Unsized shared-memory scratch array; kCalculateLocalForces_kernel uses slot
// sV[threadIdx.x] for its per-thread vectors (v0, v1, v2).
extern __shared__ Vectors sV[];
// Constant-memory copy of the host-side simulation descriptor; written by
// SetCalculateLocalForcesSim() and read back by GetCalculateLocalForcesSim().
static __constant__ cudaGmxSimulation cSim;
/*
 * Small vector-geometry helpers used by the local-force kernel.
 *
 * Every macro argument is parenthesised so that expressions such as `*p` or
 * `A->v0` can be passed safely, and CROSS_PRODUCT is a single brace block so
 * that it behaves as one statement. The statement macros remain brace blocks
 * (not do/while(0)) so that call sites written with or without a trailing
 * semicolon keep compiling.
 *
 * Note: the statement macros declare locals (norm1, norm2, angle, deltaIdeal,
 * dp) inside their block; do not pass arguments with those names.
 */

/* Dot product of two values exposing .x/.y/.z. */
#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)

/* dp = dot(v1, v2) / sqrt(|v1|^2 * |v2|^2), clamped to [-1, 1]. */
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
    (dp) = DOT3(v1, v2); \
    float norm1 = DOT3(v1, v1); \
    float norm2 = DOT3(v2, v2); \
    (dp) /= sqrt(norm1 * norm2); \
    (dp) = min((dp), 1.0f); \
    (dp) = max((dp), -1.0f); \
}

/* c = v1 x v2 (c must not alias v1 or v2). */
#define CROSS_PRODUCT(v1, v2, c) \
{ \
    (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
    (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
    (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
}

/* dEdR = param.y * (acos(cosine) - param.x converted from degrees to radians). */
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
    float angle = acos(cosine); \
    float deltaIdeal = angle - ((param).x * (3.14159265f / 180.0f)); \
    (dEdR) = (param).y * deltaIdeal; \
}

/* angle = acos of the clamped, normalised dot product of v1 and v2. */
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
    float dp; \
    GETNORMEDDOTPRODUCT(v1, v2, dp); \
    (angle) = acos(dp); \
}

/* As GETANGLEBETWEENTWOVECTORS, additionally returning the cosine. */
#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
{ \
    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
    (angle) = acos(cosine); \
}

/*
 * cp0 = vector1 x vector2, cp1 = vector2 x vector3, angle = angle between cp0
 * and cp1, negated when dot(signVector, cp1) < 0.
 */
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
    float dp = DOT3(signVector, cp1); \
    (angle) = (dp >= 0) ? (angle) : -(angle); \
}

/* As GETDIHEDRALANGLEBETWEENTHREEVECTORS, additionally returning the cosine. */
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
    float dp = DOT3(signVector, cp1); \
    (angle) = (dp >= 0) ? (angle) : -(angle); \
}
/**
 * Upload the host-side simulation descriptor (gpu->sim) into this
 * translation unit's constant-memory copy, cSim.
 *
 * @param gpu  context whose `sim` member is copied to the device symbol.
 *
 * The copy status is handed to RTERROR together with a diagnostic message.
 */
void SetCalculateLocalForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
/**
 * Read this translation unit's constant-memory descriptor (cSim) back into
 * the host-side gpu->sim.
 *
 * @param gpu  context whose `sim` member is overwritten with the device copy.
 *
 * The copy status is handed to RTERROR together with a diagnostic message.
 * NOTE(review): the message text says "SetSim" although this is the
 * read-back path; it is reproduced unchanged here.
 */
void GetCalculateLocalForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
__global__
void
kCalculateLocalForces_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
Vectors
*
A
=
&
sV
[
threadIdx
.
x
];
while
(
pos
<
cSim
.
bond_offset
)
{
if
(
pos
<
cSim
.
bonds
)
{
int4
atom
=
cSim
.
pBondID
[
pos
];
float4
atomA
=
cSim
.
pPosq
[
atom
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom
.
y
];
float2
bond
=
cSim
.
pBondParameter
[
pos
];
float
dx
=
atomB
.
x
-
atomA
.
x
;
float
dy
=
atomB
.
y
-
atomA
.
y
;
float
dz
=
atomB
.
z
-
atomA
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
float
deltaIdeal
=
r
-
bond
.
x
;
float
dEdR
=
bond
.
y
*
deltaIdeal
;
dEdR
=
(
r
>
0.0
f
)
?
(
dEdR
/
r
)
:
0.0
f
;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
forceA
.
x
+=
dx
;
forceA
.
y
+=
dy
;
forceA
.
z
+=
dz
;
forceB
.
x
-=
dx
;
forceB
.
y
-=
dy
;
forceB
.
z
-=
dz
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
bond_angle_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
bond_offset
;
if
(
pos1
<
cSim
.
bond_angles
)
{
int4
atom1
=
cSim
.
pBondAngleID1
[
pos1
];
float2
bond_angle
=
cSim
.
pBondAngleParameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom1
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom1
.
y
];
float4
a3
=
cSim
.
pPosq
[
atom1
.
z
];
A
->
v0
.
x
=
a2
.
x
-
a1
.
x
;
A
->
v0
.
y
=
a2
.
y
-
a1
.
y
;
A
->
v0
.
z
=
a2
.
z
-
a1
.
z
;
A
->
v1
.
x
=
a2
.
x
-
a3
.
x
;
A
->
v1
.
y
=
a2
.
y
-
a3
.
y
;
A
->
v1
.
z
=
a2
.
z
-
a3
.
z
;
float3
cp
;
CROSS_PRODUCT
(
A
->
v0
,
A
->
v1
,
cp
);
float
rp
=
DOT3
(
cp
,
cp
);
//cx * cx + cy * cy + cz * cz;
rp
=
max
(
sqrt
(
rp
),
1.0e-06
f
);
float
r21
=
DOT3
(
A
->
v0
,
A
->
v0
);
// dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
float
r23
=
DOT3
(
A
->
v1
,
A
->
v1
);
// dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
float
dot
=
DOT3
(
A
->
v0
,
A
->
v1
);
// dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
float
cosine
=
dot
/
sqrt
(
r21
*
r23
);
float
dEdR
;
GETPREFACTORSGIVENANGLECOSINE
(
cosine
,
bond_angle
,
dEdR
);
//printf("%11.4f %11.4f\n", cosine, dEdR);
float
termA
=
dEdR
/
(
r21
*
rp
);
float
termC
=
-
dEdR
/
(
r23
*
rp
);
float3
c21
;
float3
c23
;
CROSS_PRODUCT
(
A
->
v0
,
cp
,
c21
);
CROSS_PRODUCT
(
A
->
v1
,
cp
,
c23
);
c21
.
x
*=
termA
;
c21
.
y
*=
termA
;
c21
.
z
*=
termA
;
c23
.
x
*=
termC
;
c23
.
y
*=
termC
;
c23
.
z
*=
termC
;
int2
atom2
=
cSim
.
pBondAngleID2
[
pos1
];
unsigned
int
offset
=
atom1
.
x
+
atom1
.
w
*
cSim
.
stride
;
float4
force
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom1
.
w
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
c21
.
x
;
force
.
y
+=
c21
.
y
;
force
.
z
+=
c21
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
offset
=
atom1
.
y
+
atom2
.
x
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
x
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
-=
(
c21
.
x
+
c23
.
x
);
force
.
y
-=
(
c21
.
y
+
c23
.
y
);
force
.
z
-=
(
c21
.
z
+
c23
.
z
);
cSim
.
pForce4
[
offset
]
=
force
;
offset
=
atom1
.
z
+
atom2
.
y
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
y
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
c23
.
x
;
force
.
y
+=
c23
.
y
;
force
.
z
+=
c23
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
dihedral_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
bond_angle_offset
;
if
(
pos1
<
cSim
.
dihedrals
)
{
int4
atom1
=
cSim
.
pDihedralID1
[
pos1
];
float4
atomA
=
cSim
.
pPosq
[
atom1
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom1
.
y
];
float4
atomC
=
cSim
.
pPosq
[
atom1
.
z
];
float4
atomD
=
cSim
.
pPosq
[
atom1
.
w
];
A
->
v0
.
x
=
atomA
.
x
-
atomB
.
x
;
A
->
v0
.
y
=
atomA
.
y
-
atomB
.
y
;
A
->
v0
.
z
=
atomA
.
z
-
atomB
.
z
;
A
->
v1
.
x
=
atomC
.
x
-
atomB
.
x
;
A
->
v1
.
y
=
atomC
.
y
-
atomB
.
y
;
A
->
v1
.
z
=
atomC
.
z
-
atomB
.
z
;
A
->
v2
.
x
=
atomC
.
x
-
atomD
.
x
;
A
->
v2
.
y
=
atomC
.
y
-
atomD
.
y
;
A
->
v2
.
z
=
atomC
.
z
-
atomD
.
z
;
float3
cp0
,
cp1
;
float
dihedralAngle
;
GETDIHEDRALANGLEBETWEENTHREEVECTORS
(
A
->
v0
,
A
->
v1
,
A
->
v2
,
A
->
v0
,
cp0
,
cp1
,
dihedralAngle
);
float4
dihedral
=
cSim
.
pDihedralParameter
[
pos1
];
float
deltaAngle
=
dihedral
.
z
*
dihedralAngle
-
(
dihedral
.
y
*
3.14159265
f
/
180.0
f
);
float
sinDeltaAngle
=
sin
(
deltaAngle
);
float
dEdAngle
=
-
dihedral
.
x
*
dihedral
.
z
*
sinDeltaAngle
;
float
normCross1
=
DOT3
(
cp0
,
cp0
);
float
normBC
=
sqrt
(
DOT3
(
A
->
v1
,
A
->
v1
));
float4
ff
;
ff
.
x
=
(
-
dEdAngle
*
normBC
)
/
normCross1
;
float
normCross2
=
DOT3
(
cp1
,
cp1
);
ff
.
w
=
(
dEdAngle
*
normBC
)
/
normCross2
;
float
dp
=
1.0
f
/
DOT3
(
A
->
v1
,
A
->
v1
);
ff
.
y
=
DOT3
(
A
->
v0
,
A
->
v1
)
*
dp
;
ff
.
z
=
DOT3
(
A
->
v2
,
A
->
v1
)
*
dp
;
int4
atom2
=
cSim
.
pDihedralID2
[
pos1
];
float3
internalF0
;
float3
internalF3
;
float3
s
;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned
int
offset
=
atom1
.
x
+
atom2
.
x
*
cSim
.
stride
;
float4
force
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom2
.
x
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF0
.
x
=
ff
.
x
*
cp0
.
x
;
force
.
x
+=
internalF0
.
x
;
internalF0
.
y
=
ff
.
x
*
cp0
.
y
;
force
.
y
+=
internalF0
.
y
;
internalF0
.
z
=
ff
.
x
*
cp0
.
z
;
force
.
z
+=
internalF0
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
w
+
atom2
.
w
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
w
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF3
.
x
=
ff
.
w
*
cp1
.
x
;
force
.
x
+=
internalF3
.
x
;
internalF3
.
y
=
ff
.
w
*
cp1
.
y
;
force
.
y
+=
internalF3
.
y
;
internalF3
.
z
=
ff
.
w
*
cp1
.
z
;
force
.
z
+=
internalF3
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s
.
x
=
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
;
s
.
y
=
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
;
s
.
z
=
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
;
offset
=
atom1
.
y
+
atom2
.
y
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
y
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF0
.
x
+
s
.
x
;
force
.
y
+=
-
internalF0
.
y
+
s
.
y
;
force
.
z
+=
-
internalF0
.
z
+
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
z
+
atom2
.
z
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
z
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF3
.
x
-
s
.
x
;
force
.
y
+=
-
internalF3
.
y
-
s
.
y
;
force
.
z
+=
-
internalF3
.
z
-
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
rb_dihedral_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
dihedral_offset
;
if
(
pos1
<
cSim
.
rb_dihedrals
)
{
int4
atom1
=
cSim
.
pRbDihedralID1
[
pos1
];
float4
atomA
=
cSim
.
pPosq
[
atom1
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom1
.
y
];
float4
atomC
=
cSim
.
pPosq
[
atom1
.
z
];
float4
atomD
=
cSim
.
pPosq
[
atom1
.
w
];
A
->
v0
.
x
=
atomA
.
x
-
atomB
.
x
;
A
->
v0
.
y
=
atomA
.
y
-
atomB
.
y
;
A
->
v0
.
z
=
atomA
.
z
-
atomB
.
z
;
A
->
v1
.
x
=
atomC
.
x
-
atomB
.
x
;
A
->
v1
.
y
=
atomC
.
y
-
atomB
.
y
;
A
->
v1
.
z
=
atomC
.
z
-
atomB
.
z
;
A
->
v2
.
x
=
atomC
.
x
-
atomD
.
x
;
A
->
v2
.
y
=
atomC
.
y
-
atomD
.
y
;
A
->
v2
.
z
=
atomC
.
z
-
atomD
.
z
;
float3
cp0
,
cp1
;
float
dihedralAngle
,
cosPhi
;
// printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
// printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
// printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS
(
A
->
v0
,
A
->
v1
,
A
->
v2
,
A
->
v0
,
cp0
,
cp1
,
dihedralAngle
,
cosPhi
);
if
(
dihedralAngle
<
0.0
f
)
{
dihedralAngle
+=
3.14159265
f
;
}
else
{
dihedralAngle
-=
3.14159265
f
;
}
cosPhi
=
-
cosPhi
;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
float4
dihedral1
=
cSim
.
pRbDihedralParameter1
[
pos1
];
float2
dihedral2
=
cSim
.
pRbDihedralParameter2
[
pos1
];
float
cosFactor
=
cosPhi
;
float
dEdAngle
=
-
dihedral1
.
y
;
// printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
dEdAngle
-=
2.0
f
*
dihedral1
.
z
*
cosFactor
;
// printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
3.0
f
*
dihedral1
.
w
*
cosFactor
;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
4.0
f
*
dihedral2
.
x
*
cosFactor
;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
5.0
f
*
dihedral2
.
y
*
cosFactor
;
// printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
dEdAngle
*=
sin
(
dihedralAngle
);
// printf("%4d - f: %9.4f\n", pos1, dEdAngle);
float
normCross1
=
DOT3
(
cp0
,
cp0
);
float
normBC
=
sqrt
(
DOT3
(
A
->
v1
,
A
->
v1
));
float4
ff
;
ff
.
x
=
(
-
dEdAngle
*
normBC
)
/
normCross1
;
float
normCross2
=
DOT3
(
cp1
,
cp1
);
ff
.
w
=
(
dEdAngle
*
normBC
)
/
normCross2
;
float
dp
=
1.0
f
/
DOT3
(
A
->
v1
,
A
->
v1
);
ff
.
y
=
DOT3
(
A
->
v0
,
A
->
v1
)
*
dp
;
ff
.
z
=
DOT3
(
A
->
v2
,
A
->
v1
)
*
dp
;
int4
atom2
=
cSim
.
pRbDihedralID2
[
pos1
];
float3
internalF0
;
float3
internalF3
;
float3
s
;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned
int
offset
=
atom1
.
x
+
atom2
.
x
*
cSim
.
stride
;
float4
force
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom2
.
x
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF0
.
x
=
ff
.
x
*
cp0
.
x
;
force
.
x
+=
internalF0
.
x
;
internalF0
.
y
=
ff
.
x
*
cp0
.
y
;
force
.
y
+=
internalF0
.
y
;
internalF0
.
z
=
ff
.
x
*
cp0
.
z
;
force
.
z
+=
internalF0
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
w
+
atom2
.
w
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
w
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF3
.
x
=
ff
.
w
*
cp1
.
x
;
force
.
x
+=
internalF3
.
x
;
internalF3
.
y
=
ff
.
w
*
cp1
.
y
;
force
.
y
+=
internalF3
.
y
;
internalF3
.
z
=
ff
.
w
*
cp1
.
z
;
force
.
z
+=
internalF3
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s
.
x
=
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
;
s
.
y
=
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
;
s
.
z
=
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
;
offset
=
atom1
.
y
+
atom2
.
y
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
y
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF0
.
x
+
s
.
x
;
force
.
y
+=
-
internalF0
.
y
+
s
.
y
;
force
.
z
+=
-
internalF0
.
z
+
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
z
+
atom2
.
z
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
z
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF3
.
x
-
s
.
x
;
force
.
y
+=
-
internalF3
.
y
-
s
.
y
;
force
.
z
+=
-
internalF3
.
z
-
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
// Host-side launcher for kCalculateLocalForces_kernel.
//
// The grid is gpu->sim.blocks blocks of gpu->sim.localForces_threads_per_block
// threads.  Each thread is given sizeof(Vectors) bytes of dynamically sized
// shared memory through the third launch-configuration argument.  The launch
// is then checked with LAUNCHERROR.
void kCalculateLocalForces(gpuContext gpu)
{
    // Dynamic shared memory request: one Vectors record per thread in a block.
    const size_t sharedBytes = gpu->sim.localForces_threads_per_block * sizeof(Vectors);

    kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, sharedBytes>>>();
    LAUNCHERROR("kCalculateLocalForces");
}
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread staging record used by the Born-sum kernel in shared memory.
// Field order (and therefore layout) is unchanged: seven consecutive floats.
struct Atom {
    float x, y, z;  // position, copied from the x/y/z lanes of cSim.pPosq
    float r;        // copied from cSim.pObcData[].x
    float sr;       // copied from cSim.pObcData[].y
    float sum;      // Born-sum contribution accumulated for this atom by other threads
    float junk;     // never referenced in this file -- presumably padding; confirm before removing
};
// Block-shared staging area for atom data; the kernel below writes
// sA[threadIdx.x], so there is one record per thread of the block.
// NOTE(review): the array is sized by GT2XX_NONBOND_THREADS_PER_BLOCK while the
// launch uses gpu->sim.nonbond_threads_per_block -- confirm the latter never
// exceeds the former.
__shared__
Atom
sA
[
GT2XX_NONBOND_THREADS_PER_BLOCK
];
// Block-shared copy of this block's slice of cSim.pWorkUnit (packed tile
// coordinates), filled once at kernel start.
__shared__
unsigned
int
sWorkUnit
[
GT2XX_NONBOND_WORKUNITS_PER_SM
];
// Successor table over 0..GRID-1; kCalculateObcGbsaBornSum_kernel fills it with
// (k - 1) & (GRID - 1) and uses it to step through the partner tile.
__shared__
unsigned
int
sNext
[
GRID
];
// File-local constant-memory copy of the simulation parameters; written by
// SetCalculateObcGbsaBornSumSim and read back by GetCalculateObcGbsaBornSumSim.
static
__constant__
cudaGmxSimulation
cSim
;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// __constant__ cSim.  The returned status is handed to RTERROR together with
// the failure message.
void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's __constant__ cSim back into the host-side gpu->sim.
// The returned status is handed to RTERROR together with the failure message.
//
// Fix: the message used to read "SetSim copy from cSim failed" (pasted from the
// Set routine), which pointed at the wrong function when the read-back failed.
void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// For every atom, adds up the per-buffer partial Born sums stored in
// cSim.pBornSum (buffers are cSim.stride floats apart) and converts the total
// into a Born radius (cSim.pBornRadii) and an OBC chain-rule factor
// (cSim.pObcChain).  Atoms are visited in a grid-stride loop.
__global__ void kReduceObcGbsaBornSum_kernel()
{
    for (unsigned int atomIdx = blockIdx.x * blockDim.x + threadIdx.x;
         atomIdx < cSim.atoms;
         atomIdx += gridDim.x * blockDim.x)
    {
        float2 atom = cSim.pObcData[atomIdx];

        // Accumulate this atom's entry from each output buffer, in buffer order.
        float sum = 0.0f;
        float* pPartial = cSim.pBornSum + atomIdx;
        for (int buffer = 0; buffer < cSim.nonbondOutputBuffers; buffer++)
        {
            sum += *pPartial;
            pPartial += cSim.stride;
        }

        // Born radius and OBC term from the scaled sum and its powers.
        sum *= 0.5f * atom.x;
        float sum2 = sum * sum;
        float sum3 = sum * sum2;
        float tanhSum = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
        float nonOffsetRadii = atom.x + cSim.dielectricOffset;

        float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
        obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;

        cSim.pBornRadii[atomIdx] = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii);
        cSim.pObcChain[atomIdx] = obcChain;
    }
}
// Host-side launcher for kReduceObcGbsaBornSum_kernel (gpu->sim.blocks blocks
// of 384 threads).  After the launch it clears gpu->bRecalculateBornRadii and
// finally checks the launch with LAUNCHERROR.  The array-dump section is
// compiled but permanently disabled.
void kReduceObcGbsaBornSum(gpuContext gpu)
{
    kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
    gpu->bRecalculateBornRadii = false;

    // Debug aid: flip to true to dump the Born arrays after every call.
    const bool dumpArrays = false;
    if (dumpArrays)
    {
        static int step = 0;
        const int numPrint = -1;
        ++step;
        WriteArrayToFile1(gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint);
        WriteArrayToFile1(gpu, "ObcGbsaBornSum", step, gpu->psBornSum, numPrint);
        WriteArrayToFile2(gpu, "ObcGbsaObcData", step, gpu->psObcData, numPrint);
        WriteArrayToFile4(gpu, "ObcGbsaBornPos", step, gpu->psPosq4, numPrint);
        gpuDumpObcInfo(gpu);
    }

    LAUNCHERROR("kReduceObcGbsaBornSum");
}
// Computes the pairwise OBC Born-sum contributions for this block's share of
// the work-unit queue and stores per-atom partial sums in cSim.pBornSum.
//
// Work distribution:
//   * The block copies work units [pos, end) of cSim.pWorkUnit into sWorkUnit.
//   * Each group of GRID consecutive threads takes one work unit at a time,
//     starting from the end of the slice and stepping back by
//     cSim.nonbond_workBlock.
//   * A work unit packs two tile indices: x in bits 17 and up, y in bits 2..16.
//     Both are scaled by << GRIDBITS to give the first atom of each tile.
//
// Per atom i (held in registers: position in apos, ar.x / ar.y from
// cSim.pObcData) and partner j (held in shared memory), a term is added when
// ar.x < r + sr_j; an extra 2*(1/ar.x - l_ij) is added when ar.x < sr_j - r.
//
// Output: the sum for atom (x + tgx) goes to row (y >> GRIDBITS) of pBornSum,
// and for off-diagonal tiles the sum for atom (y + tgx) goes to row
// (x >> GRIDBITS); rows are cSim.stride floats apart.
//
// Fix: the diagonal-tile branch compared ar.x against (psA[j].r - r) for the
// extra term, whereas both off-diagonal branches compare against the partner's
// *scaled* radius (psA[tj].sr - r and ar.y - r).  The diagonal branch now uses
// psA[j].sr as well, so pairs inside one tile are treated like all others.
//
// NOTE(review): there is no barrier between the sA[threadIdx.x] writes and the
// psA[...] reads/updates below; this presumably relies on each GRID-thread
// group executing in lockstep -- confirm GRID matches the warp size.
__global__ void kCalculateObcGbsaBornSum_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
    int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Successor table: lane k is followed by lane (k - 1) mod GRID.
        sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        float dx;
        float dy;
        float dz;
        float r2;
        float r;
        unsigned int tgx = threadIdx.x & (GRID - 1);    // lane within the GRID-thread group
        unsigned int tbx = threadIdx.x - tgx;           // first thread of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // this group's slice of shared memory

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            unsigned int i = x + tgx;
            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
            sA[threadIdx.x].x = apos.x;
            sA[threadIdx.x].y = apos.y;
            sA[threadIdx.x].z = apos.z;
            sA[threadIdx.x].r = ar.x;
            sA[threadIdx.x].sr = ar.y;
            apos.w = 0.0f;                  // .w is reused as the running Born sum

            for (unsigned int j = 0; j < GRID; j++)
            {
                dx = psA[j].x - apos.x;
                dy = psA[j].y - apos.y;
                dz = psA[j].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r = sqrt(r2);
                float rInverse = 1.0f / r;
                float rScaledRadiusJ = r + psA[j].sr;
                // j == tgx is the atom itself (r == 0) and is skipped.
                if ((j != tgx) && (ar.x < rScaledRadiusJ))
                {
                    float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
                    float u_ij = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    apos.w += l_ij -
                              u_ij +
                              0.25f * r * (u_ij2 - l_ij2) +
                              (0.50f * rInverse * ratio) +
                              (0.25f * psA[j].sr * psA[j].sr * rInverse) *
                              (l_ij2 - u_ij2);
                    // Compare against the partner's scaled radius, matching
                    // the off-diagonal branches below.
                    if (ar.x < (psA[j].sr - r))
                    {
                        apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                }
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
        }
        else        // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            int j = y + tgx;
            unsigned int i = x + tgx;
            float4 temp = cSim.pPosq[j];
            float2 temp1 = cSim.pObcData[j];
            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
            sA[threadIdx.x].x = temp.x;
            sA[threadIdx.x].y = temp.y;
            sA[threadIdx.x].z = temp.z;
            sA[threadIdx.x].r = temp1.x;
            sA[threadIdx.x].sr = temp1.y;
            sA[threadIdx.x].sum = apos.w = 0.0f;

            // GRID steps; tj starts at this thread's own lane and follows sNext,
            // so within one step every lane of the group holds a different tj.
            for (unsigned int j = 0; j < GRID; j++)
            {
                dx = psA[tj].x - apos.x;
                dy = psA[tj].y - apos.y;
                dz = psA[tj].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r = sqrt(r2);
                float rInverse = 1.0f / r;

                // Contribution of partner tj to this thread's atom i.
                float rScaledRadiusJ = r + psA[tj].sr;
                if (ar.x < rScaledRadiusJ)
                {
                    float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
                    float u_ij = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term = l_ij -
                                 u_ij +
                                 0.25f * r * (u_ij2 - l_ij2) +
                                 (0.50f * rInverse * ratio) +
                                 (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
                                 (l_ij2 - u_ij2);
                    if (ar.x < (psA[tj].sr - r))
                    {
                        term += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                    apos.w += term;
                }

                // Contribution of this thread's atom i to partner tj.
                float rScaledRadiusI = r + ar.y;
                if (psA[tj].r < rScaledRadiusI)
                {
                    float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
                    float u_ij = 1.0f / rScaledRadiusI;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term = l_ij -
                                 u_ij +
                                 0.25f * r * (u_ij2 - l_ij2) +
                                 (0.50f * rInverse * ratio) +
                                 (0.25f * ar.y * ar.y * rInverse) *
                                 (l_ij2 - u_ij2);
                    if (psA[tj].r < (ar.y - r))
                    {
                        term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
                    }
                    psA[tj].sum += term;
                }
                tj = sNext[tj];
            }

            // Write results
            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
        }
        pos -= cSim.nonbond_workBlock;
    }
}
// Host-side launcher for kCalculateObcGbsaBornSum_kernel, using
// gpu->sim.nonbond_blocks blocks of gpu->sim.nonbond_threads_per_block threads,
// followed by a LAUNCHERROR check.
//
// Fix: the LAUNCHERROR label was "kCalculateBornSum", which names no function
// in this file; it now carries this launcher's name like the other launchers.
void kCalculateObcGbsaBornSum(gpuContext gpu)
{
    kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateObcGbsaBornSum");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Per-thread staging record used by the OBC force kernel in shared memory.
// Field order (and therefore layout) is unchanged: nine consecutive floats.
struct Atom {
    float x, y, z;      // position, copied from the x/y/z lanes of cSim.pPosq
    float q;            // copied from the w lane of cSim.pPosq
    float br;           // copied from cSim.pBornRadii
    float fx, fy, fz;   // force accumulated on this atom by the other threads of the group
    float fb;           // Born-force accumulator, written out to cSim.pBornForce
};
// Block-shared staging area for atom data; kCalculateObcGbsaForces1_kernel
// writes sA[threadIdx.x], so there is one record per thread of the block.
// NOTE(review): sized by G8X_NONBOND_THREADS_PER_BLOCK while the launch uses
// gpu->sim.nonbond_threads_per_block -- confirm the latter never exceeds it.
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
// Block-shared copy of this block's slice of cSim.pWorkUnit (packed tile
// coordinates), filled once at kernel start.
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
// Successor table over 0..GRID-1; the force kernel fills it with
// (k + 1) & (GRID - 1) and uses it to step through the partner tile.
__shared__
unsigned
int
sNext
[
GRID
];
// File-local constant-memory copy of the simulation parameters; written by
// SetCalculateObcGbsaForces1Sim and read back by GetCalculateObcGbsaForces1Sim.
static
__constant__
cudaGmxSimulation
cSim
;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// __constant__ cSim.  The returned status is handed to RTERROR together with
// the failure message.
void SetCalculateObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's __constant__ cSim back into the host-side gpu->sim.
// The returned status is handed to RTERROR together with the failure message.
//
// Fix: the message used to read "SetSim copy from cSim failed" (pasted from the
// Set routine), which pointed at the wrong function when the read-back failed.
void GetCalculateObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// For every atom, adds up the per-buffer Born-force partials stored in
// cSim.pBornForce (buffers are cSim.stride floats apart), adds a surface-area
// term built from cSim.surfaceAreaFactor, cSim.probeRadius and the atom's Born
// radius, scales the result by bornRadius^2 * obcChain and writes it back to
// the atom's entry in the first buffer.  Atoms are visited in a grid-stride loop.
__global__ void kReduceObcGbsaBornForces_kernel()
{
    for (unsigned int atomIdx = blockIdx.x * blockDim.x + threadIdx.x;
         atomIdx < cSim.atoms;
         atomIdx += gridDim.x * blockDim.x)
    {
        float bornRadius = cSim.pBornRadii[atomIdx];
        float obcChain = cSim.pObcChain[atomIdx];
        float2 obcData = cSim.pObcData[atomIdx];

        // Sum the output buffers four at a time, then two, then the last one.
        // The grouping is kept exactly so the floating-point result is unchanged.
        float totalForce = 0.0f;
        float* pPartial = cSim.pBornForce + atomIdx;
        int remaining = cSim.nonbondOutputBuffers;
        for (; remaining >= 4; remaining -= 4)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            float f3 = *pPartial;
            pPartial += cSim.stride;
            float f4 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2 + f3 + f4;
        }
        if (remaining >= 2)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            totalForce += *pPartial;
        }

        // Surface-area contribution.
        float r = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
        float ratio6 = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
        float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;
        totalForce += saTerm / bornRadius;

        totalForce *= bornRadius * bornRadius * obcChain;
        cSim.pBornForce[atomIdx] = totalForce;
    }
}
// Variant of the Born-force reduction without the surface-area term: for every
// atom, adds up the per-buffer partials in cSim.pBornForce (buffers are
// cSim.stride floats apart), scales the total by bornRadius^2 * obcChain and
// writes it back to the atom's entry in the first buffer.  Atoms are visited
// in a grid-stride loop.
__global__ void kReduceObcGbsaBornForces1_kernel()
{
    for (unsigned int atomIdx = blockIdx.x * blockDim.x + threadIdx.x;
         atomIdx < cSim.atoms;
         atomIdx += gridDim.x * blockDim.x)
    {
        float bornRadius = cSim.pBornRadii[atomIdx];
        float obcChain = cSim.pObcChain[atomIdx];

        // Sum the output buffers four at a time, then two, then the last one.
        // The grouping is kept exactly so the floating-point result is unchanged.
        float totalForce = 0.0f;
        float* pPartial = cSim.pBornForce + atomIdx;
        int remaining = cSim.nonbondOutputBuffers;
        for (; remaining >= 4; remaining -= 4)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            float f3 = *pPartial;
            pPartial += cSim.stride;
            float f4 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2 + f3 + f4;
        }
        if (remaining >= 2)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            totalForce += *pPartial;
        }

        totalForce *= bornRadius * bornRadius * obcChain;
        cSim.pBornForce[atomIdx] = totalForce;
    }
}
// For every atom, reads the current value of cSim.pBornForce, adds a
// surface-area term built from cSim.surfaceAreaFactor, cSim.probeRadius and the
// atom's Born radius, scales the result by bornRadius^2 * obcChain and stores
// it back in place.  Atoms are visited in a grid-stride loop.
// NOTE(review): the only launch of this kernel visible in this file is
// commented out in kReduceObcGbsaBornForces.
__global__ void kAceGbsa_kernel()
{
    for (unsigned int atomIdx = blockIdx.x * blockDim.x + threadIdx.x;
         atomIdx < cSim.atoms;
         atomIdx += gridDim.x * blockDim.x)
    {
        float bornRadius = cSim.pBornRadii[atomIdx];
        float obcChain = cSim.pObcChain[atomIdx];
        float2 obcData = cSim.pObcData[atomIdx];
        float totalForce = cSim.pBornForce[atomIdx];

        // Surface-area contribution.
        float r = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
        float ratio6 = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
        float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;
        totalForce += saTerm / bornRadius;

        totalForce *= bornRadius * bornRadius * obcChain;
        cSim.pBornForce[atomIdx] = totalForce;
    }
}
// Host-side launcher for kReduceObcGbsaBornForces_kernel, using gpu->sim.blocks
// blocks of gpu->sim.bf_reduce_threads_per_block threads.
//
// Fix: this was the only launcher in the file without a LAUNCHERROR check after
// the kernel launch, so a failed launch here went unreported; the check is now
// performed as in the other launchers.
void kReduceObcGbsaBornForces(gpuContext gpu)
{
    kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
    LAUNCHERROR("kReduceObcGbsaBornForces");
}
// First OBC/GBSA force loop over this block's share of the work-unit queue.
//
// Each group of GRID consecutive threads processes one work unit (a pair of
// atom tiles) at a time.  For every atom pair it evaluates the pair term Gpol
// and from it (a) a force on both atoms and (b) a Born-force contribution
// weighted by the partner's Born radius.  Results for atom (tileX + lane) go to
// row (tileY >> GRIDBITS) of cSim.pForce4a / cSim.pBornForce, and for
// off-diagonal tiles the results for atom (tileY + lane) go to row
// (tileX >> GRIDBITS); rows are cSim.stride elements apart.
//
// NOTE(review): there is no barrier between the sA[threadIdx.x] writes and the
// group-local reads/updates; this presumably relies on each GRID-thread group
// executing in lockstep -- confirm GRID matches the warp size.
__global__ void kCalculateObcGbsaForces1_kernel()
{
    // Copy this block's slice [pos, end) of the work-unit queue into shared memory.
    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Successor table: lane k is followed by lane (k + 1) mod GRID.
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // From here on pos indexes sWorkUnit: each thread group starts at a
    // different unit counted back from the end of the slice.
    end -= pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Unpack the two tile origins from the work unit.
        const unsigned int workUnit = sWorkUnit[pos];
        const unsigned int tileY = ((workUnit >> 2) & 0x7fff) << GRIDBITS;
        const unsigned int tileX = (workUnit >> 17) << GRIDBITS;

        const unsigned int lane = threadIdx.x & (GRID - 1);     // index within the thread group
        const unsigned int groupBase = threadIdx.x - lane;      // first thread of the group
        Atom* group = &sA[groupBase];                           // this group's slice of sA
        Atom* mine = &sA[threadIdx.x];                          // this thread's own record

        float4 atomI;       // Local atom x, y, z, q
        float4 forceI;      // Local atom fx, fy, fz, fb

        if (tileX == tileY)
        {
            // Diagonal tile: every thread loops over all GRID atoms of the tile
            // (including itself) and only accumulates its own atom's result.
            const unsigned int i = tileX + lane;
            atomI = cSim.pPosq[i];
            float bornRadiusI = cSim.pBornRadii[i];
            mine->x = atomI.x;
            mine->y = atomI.y;
            mine->z = atomI.z;
            mine->q = atomI.w;
            mine->br = bornRadiusI;
            forceI.x = 0.0f;
            forceI.y = 0.0f;
            forceI.z = 0.0f;
            forceI.w = 0.0f;
            atomI.w *= cSim.preFactor;

            for (unsigned int k = 0; k < GRID; k++)
            {
                float dx = group[k].x - atomI.x;
                float dy = group[k].y - atomI.y;
                float dz = group[k].z - atomI.z;
                float r2 = dx * dx + dy * dy + dz * dz;

                float alpha2_ij = bornRadiusI * group[k].br;
                float D_ij = r2 / (4.0f * alpha2_ij);
                float expTerm = exp(-D_ij);
                float denominator2 = r2 + alpha2_ij * expTerm;
                float denominator = sqrt(denominator2);
                float Gpol = (atomI.w * group[k].q) / (denominator * denominator2);
                float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);

                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                forceI.x -= dx;
                forceI.y -= dy;
                forceI.z -= dz;
                forceI.w += dGpol_dalpha2_ij * group[k].br;
            }

            // Write results
            int offset = tileX + lane + (tileX >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = forceI;
            cSim.pBornForce[offset] = forceI.w;
        }
        else
        {
            // Off-diagonal tile: atom i stays in registers, the partner tile
            // sits in shared memory and receives the opposite contribution.
            const int jAtom = tileY + lane;
            const unsigned int i = tileX + lane;
            float4 atomJ = cSim.pPosq[jAtom];
            float bornRadiusJ = cSim.pBornRadii[jAtom];
            atomI = cSim.pPosq[i];
            float bornRadiusI = cSim.pBornRadii[i];
            mine->x = atomJ.x;
            mine->y = atomJ.y;
            mine->z = atomJ.z;
            mine->q = atomJ.w;
            mine->br = bornRadiusJ;
            mine->fx = forceI.x = 0.0f;
            mine->fy = forceI.y = 0.0f;
            mine->fz = forceI.z = 0.0f;
            mine->fb = forceI.w = 0.0f;
            atomI.w *= cSim.preFactor;

            // GRID steps; tj starts at this thread's own lane and follows sNext,
            // so within one step every lane of the group holds a different tj.
            int tj = lane;
            for (int step = 0; step < GRID; step++)
            {
                float dx = group[tj].x - atomI.x;
                float dy = group[tj].y - atomI.y;
                float dz = group[tj].z - atomI.z;
                float r2 = dx * dx + dy * dy + dz * dz;

                float alpha2_ij = bornRadiusI * group[tj].br;
                float D_ij = r2 / (4.0f * alpha2_ij);
                float expTerm = exp(-D_ij);
                float denominator2 = r2 + alpha2_ij * expTerm;
                float denominator = sqrt(denominator2);
                float Gpol = (atomI.w * group[tj].q) / (denominator * denominator2);
                float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);

                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                forceI.x -= dx;
                forceI.y -= dy;
                forceI.z -= dz;
                group[tj].fx += dx;
                group[tj].fy += dy;
                group[tj].fz += dz;
                forceI.w += dGpol_dalpha2_ij * group[tj].br;
                group[tj].fb += dGpol_dalpha2_ij * bornRadiusI;
                tj = sNext[tj];
            }

            // Write results for atom i ...
            int offset = tileX + lane + (tileY >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = forceI;
            cSim.pBornForce[offset] = forceI.w;

            // ... and for the partner-tile atom staged by this thread.
            forceI.x = mine->fx;
            forceI.y = mine->fy;
            forceI.z = mine->fz;
            forceI.w = mine->fb;
            offset = tileY + lane + (tileX >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = forceI;
            cSim.pBornForce[offset] = forceI.w;
        }

        pos -= cSim.nonbond_workBlock;
    }
}
// Variant of the force kernel used when gpu->sm_version is not below SM_12;
// only declared here.
__global__ extern void kCalculateObcGbsaForces1_12_kernel();

// Host-side launcher for the first OBC/GBSA force loop.  Chooses
// kCalculateObcGbsaForces1_kernel when gpu->sm_version < SM_12 and
// kCalculateObcGbsaForces1_12_kernel otherwise, launching either with
// gpu->sim.nonbond_blocks blocks of gpu->sim.nonbond_threads_per_block threads,
// then checks the launch with LAUNCHERROR.
//
// Fix: the LAUNCHERROR label was misspelled "kCalculateObcGbsaForce1"; it now
// matches this function's name so a reported failure can be found by search.
void kCalculateObcGbsaForces1(gpuContext gpu)
{
    if (gpu->sm_version < SM_12)
    {
        kCalculateObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    LAUNCHERROR("kCalculateObcGbsaForces1");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Per-thread staging record used by kCalculateObcGbsaForces1_12_kernel.
// The kernel fills x/y/z/q from a pPosq entry and br from pBornRadii, and
// accumulates the pair contributions for that atom in fx/fy/fz/fb.
struct Atom
{
    float x, y, z;      // position (pPosq .x/.y/.z)
    float q;            // pPosq .w component
    float br;           // Born radius (pBornRadii)
    float fx, fy, fz;   // accumulated force components
    float fb;           // accumulated Born-force term
};
// Shared-memory staging records; the kernel indexes this array by threadIdx.x.
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// Block-local copy of this block's slice of cSim.pWorkUnit, loaded at kernel start.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Successor table: the kernel sets entry k to (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];

// Constant-memory copy of the simulation parameters for this translation unit;
// written by SetCalculateObcGbsaForces1_12Sim and read back by
// GetCalculateObcGbsaForces1_12Sim.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory symbol cSim. The copy status is handed to RTERROR together
// with a fixed message (RTERROR is defined elsewhere; presumably it reports
// and aborts on a non-success status - confirm).
void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory symbol cSim back into the host-side
// structure gpu->sim. The copy status is handed to RTERROR (defined elsewhere;
// presumably it reports and aborts on a non-success status - confirm).
//
// The failure message now names the Get path; it previously said "SetSim",
// which pointed at the wrong function when diagnosing a failed read-back.
void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// First OBC/GBSA force pass over tiles of GRID x GRID atom pairs.
//
// Work distribution:
//   * Each block takes a contiguous slice of cSim.pWorkUnit holding
//     nbWorkUnitsPerBlock entries, plus one extra entry for the first
//     nbWorkUnitsPerBlockRemainder blocks, and caches it in sWorkUnit.
//   * Each group of threads sharing the same (threadIdx.x >> GRIDBITS) starts
//     at its own entry counted back from the end of the slice and steps back
//     by cSim.nonbond_workBlock entries per iteration.
//   * A work unit packs the tile coordinates: bits 17 and up give x, bits 2..16
//     give y; both are shifted left by GRIDBITS to obtain the first atom index.
//
// Results: the per-atom accumulated float4 (fx, fy, fz, fb) is stored to
// cSim.pForce4a and its w component also to cSim.pBornForce, at
// atomIndex + (otherTileOrigin >> GRIDBITS) * cSim.stride.
//
// NOTE(review): threads read shared-memory records written by other threads of
// the same group without a barrier inside the tile loop; this presumably relies
// on GRID == 1 << GRIDBITS matching the hardware warp width - confirm.
//
// Change from the previous revision: the output offset for the transposed tile
// was computed twice with identical operands; the redundant second assignment
// has been removed. No stored value changes.
__global__ void kCalculateObcGbsaForces1_12_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Ring successor table: k -> (k + 1) modulo GRID (GRID assumed power of two).
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        float4 apos;    // Local atom x, y, z, q
        float4 af;      // Local atom fx, fy, fz, fb
        unsigned int tgx = threadIdx.x & (GRID - 1);    // index within the thread group
        unsigned int tbx = threadIdx.x - tgx;           // first thread index of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // this group's GRID records

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            unsigned int i = x + tgx;
            apos = cSim.pPosq[i];
            float br = cSim.pBornRadii[i];
            sA[threadIdx.x].x = apos.x;
            sA[threadIdx.x].y = apos.y;
            sA[threadIdx.x].z = apos.z;
            sA[threadIdx.x].q = apos.w;
            sA[threadIdx.x].br = br;
            af.x = 0.0f;
            af.y = 0.0f;
            af.z = 0.0f;
            af.w = 0.0f;
            apos.w *= cSim.preFactor;

            // Every thread walks all GRID records of the tile, including its
            // own (j == tgx), and only accumulates into its own registers.
            for (unsigned int j = 0; j < GRID; j++)
            {
                float dx = psA[j].x - apos.x;
                float dy = psA[j].y - apos.y;
                float dz = psA[j].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float alpha2_ij = br * psA[j].br;
                float D_ij = r2 / (4.0f * alpha2_ij);
                float expTerm = exp(-D_ij);
                float denominator2 = r2 + alpha2_ij * expTerm;
                float denominator = sqrt(denominator2);
                float Gpol = (apos.w * psA[j].q) / (denominator * denominator2);
                float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                af.x -= dx;
                af.y -= dy;
                af.z -= dz;
                af.w += dGpol_dalpha2_ij * psA[j].br;
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = af;
            cSim.pBornForce[offset] = af.w;
        }
        else // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            int j = y + tgx;
            unsigned int i = x + tgx;
            float4 temp = cSim.pPosq[j];
            float temp1 = cSim.pBornRadii[j];
            apos = cSim.pPosq[i];
            float br = cSim.pBornRadii[i];
            sA[threadIdx.x].x = temp.x;
            sA[threadIdx.x].y = temp.y;
            sA[threadIdx.x].z = temp.z;
            sA[threadIdx.x].q = temp.w;
            sA[threadIdx.x].br = temp1;
            sA[threadIdx.x].fx = af.x = 0.0f;
            sA[threadIdx.x].fy = af.y = 0.0f;
            sA[threadIdx.x].fz = af.z = 0.0f;
            sA[threadIdx.x].fb = af.w = 0.0f;
            apos.w *= cSim.preFactor;

            // tj starts at this thread's own slot and advances through the
            // sNext ring, so at each step the threads of a group address
            // distinct records while accumulating into them.
            for (j = 0; j < GRID; j++)
            {
                float dx = psA[tj].x - apos.x;
                float dy = psA[tj].y - apos.y;
                float dz = psA[tj].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float alpha2_ij = br * psA[tj].br;
                float D_ij = r2 / (4.0f * alpha2_ij);
                float expTerm = exp(-D_ij);
                float denominator2 = r2 + alpha2_ij * expTerm;
                float denominator = sqrt(denominator2);
                float Gpol = (apos.w * psA[tj].q) / (denominator * denominator2);
                float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                af.x -= dx;
                af.y -= dy;
                af.z -= dz;
                psA[tj].fx += dx;
                psA[tj].fy += dy;
                psA[tj].fz += dz;
                af.w += dGpol_dalpha2_ij * psA[tj].br;
                psA[tj].fb += dGpol_dalpha2_ij * br;
                tj = sNext[tj];
            }

            // Write results
            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = af;
            cSim.pBornForce[offset] = af.w;

            // Second store: the accumulations gathered in shared memory for
            // the atom of tile y that this thread loaded.
            af.x = sA[threadIdx.x].fx;
            af.y = sA[threadIdx.x].fy;
            af.z = sA[threadIdx.x].fz;
            af.w = sA[threadIdx.x].fb;
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = af;
            cSim.pBornForce[offset] = af.w;
        }
        pos -= cSim.nonbond_workBlock;
    }
}
// Host-side launcher for kCalculateObcGbsaForces1_12_kernel, using the
// nonbond block count and threads-per-block stored in gpu->sim.
//
// The LAUNCHERROR label now matches this function's name; it previously read
// "kCalculateObcGbsaForce1_12" (missing 's'), unlike the labels used by the
// other launchers in this code base.
void kCalculateObcGbsaForces1_12(gpuContext gpu)
{
//    printf("kCalculateObcGbsaForces1_12\n");
    kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateObcGbsaForces1_12");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudaKernels.h"
// Per-thread staging record used by kCalculateObcGbsaForces2_kernel.
// Besides the atom data and force accumulators it also carries the thread's
// work-queue cursor and current tile origins, which that kernel keeps here
// instead of in local variables.
struct Atom
{
    float x, y, z;      // position (pPosq .x/.y/.z)
    float r;            // pObcData .x
    float sr;           // pObcData .y
    float sr2;          // sr * sr
    float fx, fy, fz;   // accumulated force components
    float fb;           // Born force read from pBornForce
//  float sum;
//  float oneOverR;
    int pos;            // index of the current work unit in sWorkUnit
    int wx;             // first atom index of tile x
    int wy;             // first atom index of tile y
};
// Shared-memory staging records; the kernel indexes this array by threadIdx.x.
__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
// Block-local copy of this block's slice of cSim.pWorkUnit, loaded at kernel start.
// NOTE(review): sized with the NONBOND constant although the kernel loads up to
// bf2WorkUnitsPerBlock + 1 entries - confirm the bound is sufficient.
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
// Successor table: the kernel sets entry k to (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];

// Constant-memory copy of the simulation parameters for this translation unit;
// written by SetCalculateObcGbsaForces2Sim and read back by
// GetCalculateObcGbsaForces2Sim.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory symbol cSim. The copy status is handed to RTERROR together
// with a fixed message (RTERROR is defined elsewhere; presumably it reports
// and aborts on a non-success status - confirm).
void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory symbol cSim back into the host-side
// structure gpu->sim. The copy status is handed to RTERROR (defined elsewhere;
// presumably it reports and aborts on a non-success status - confirm).
//
// The failure message now names the Get path; it previously said "SetSim",
// which pointed at the wrong function when diagnosing a failed read-back.
void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Second OBC/GBSA force pass over tiles of GRID x GRID atom pairs.
//
// Work distribution:
//   * Each block takes a contiguous slice of cSim.pWorkUnit holding
//     bf2WorkUnitsPerBlock entries, plus one extra entry for the first
//     bf2WorkUnitsPerBlockRemainder blocks, and caches it in sWorkUnit.
//   * Each group of threads sharing the same (threadIdx.x >> GRIDBITS) starts
//     at its own entry counted back from the end of the slice and steps back
//     by cSim.bornForce2_workBlock entries per iteration.
//   * A work unit packs the tile coordinates: bits 17 and up give x, bits 2..16
//     give y; both are shifted left by GRIDBITS to obtain the first atom index.
//
// The queue cursor (pos) and the tile origins (wx, wy) are kept in the
// thread's shared-memory record rather than in local variables (presumably to
// reduce register use - confirm).
//
// Results: force components are stored as float4 (w = 0) to cSim.pForce4b at
// atomIndex + (otherTileOrigin >> GRIDBITS) * cSim.stride.
//
// NOTE(review): threads read and update shared-memory records owned by other
// threads of the same group without a barrier inside the tile loop; this
// presumably relies on GRID == 1 << GRIDBITS matching the hardware warp
// width - confirm.
__global__ void kCalculateObcGbsaForces2_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
    int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Ring successor table: k -> (k + 1) modulo GRID (GRID assumed power of two).
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (sA[threadIdx.x].pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        unsigned int tgx = threadIdx.x & (GRID - 1);    // index within the thread group
        unsigned int i = x + tgx;
        float4 apos = cSim.pPosq[i];
        float2 a = cSim.pObcData[i];                    // a.x is stored as Atom::r, a.y as Atom::sr
        float fb = cSim.pBornForce[i];
        unsigned int tbx = threadIdx.x - tgx;           // first thread index of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // this group's GRID records
        sA[threadIdx.x].wx = x;
        sA[threadIdx.x].wy = y;

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            float3 af;
            sA[threadIdx.x].fx = af.x = 0.0f;
            sA[threadIdx.x].fy = af.y = 0.0f;
            sA[threadIdx.x].fz = af.z = 0.0f;
//          float sum = 0.0f;
            sA[threadIdx.x].x = apos.x;
            sA[threadIdx.x].y = apos.y;
            sA[threadIdx.x].z = apos.z;
//          float oneOverR = 1.0f / a.x;
            sA[threadIdx.x].r = a.x;
            sA[threadIdx.x].sr = a.y;
            sA[threadIdx.x].sr2 = a.y * a.y;
            sA[threadIdx.x].fb = fb;

            // Walk the sNext ring starting after this thread's own slot and
            // stopping when it comes back, so the self pair (j == tgx) is
            // never evaluated.
            for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
            {
                float dx = psA[j].x - apos.x;
                float dy = psA[j].y - apos.y;
                float dz = psA[j].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float r = sqrt(r2);

                // Atom I Born forces and sum
                float rScaledRadiusJ = r + psA[j].sr;
                float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
                float u_ij = 1.0f / rScaledRadiusJ;
                float rInverse = 1.0f / r;
                float l_ij2 = l_ij * l_ij;
                float u_ij2 = u_ij * u_ij;
                float r2Inverse = rInverse * rInverse;
                float t1 = log(u_ij / l_ij);
                float t2 = (l_ij2 - u_ij2);
                float t3 = t2 * rInverse;
                t1 *= rInverse;

                // Born Forces term
                float term = 0.125f * (1.000f + psA[j].sr2 * r2Inverse) * t3 + 0.250f * t1 * r2Inverse;
                float dE = fb * term;

                // Born sum term (disabled)
//              term = l_ij - u_ij +
//                     -0.25f * r * t2 +
//                     0.50f * t1 +
//                     (0.25f * psA[j].sr2) * t3;
//              if (a.x < (psA[j].sr - r))
//              {
//                  term += 2.0f * (oneOverR - l_ij);
//              }

                // No contribution when a.x is at least r + sr of the other atom.
                if (a.x >= rScaledRadiusJ)
                {
                    dE = /*term =*/ 0.0f;
                }

                // Apply equal and opposite contributions: own registers get
                // -d, the other atom's shared record gets +d.
                float d = dx * dE;
                af.x -= d;
                psA[j].fx += d;
                d = dy * dE;
                af.y -= d;
                psA[j].fy += d;
                d = dz * dE;
                af.z -= d;
                psA[j].fz += d;
//              sum += term;
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            float4 of;
            // Register total plus what other threads added to this thread's record.
            of.x = af.x + sA[threadIdx.x].fx;
            of.y = af.y + sA[threadIdx.x].fy;
            of.z = af.z + sA[threadIdx.x].fz;
            of.w = 0.0f;
            cSim.pForce4b[offset] = of;
//          cSim.pBornSum[offset] = sum;
        }
        else
        {
            // Read fixed atom data into registers and GRF
            int j = y + tgx;
            float4 temp = cSim.pPosq[j];
            float2 temp1 = cSim.pObcData[j];
            sA[threadIdx.x].fb = cSim.pBornForce[j];
            float3 af;
            sA[threadIdx.x].fx = af.x = 0.0f;
            sA[threadIdx.x].fy = af.y = 0.0f;
            sA[threadIdx.x].fz = af.z = 0.0f;
//          sA[threadIdx.x].sum = 0.0f;
//          float sum = 0.0f;
            float sr2 = a.y * a.y;
            sA[threadIdx.x].x = temp.x;
            sA[threadIdx.x].y = temp.y;
            sA[threadIdx.x].z = temp.z;
            sA[threadIdx.x].r = temp1.x;
            sA[threadIdx.x].sr = temp1.y;
            sA[threadIdx.x].sr2 = temp1.y * temp1.y;
//          sA[threadIdx.x].oneOverR = 1.0f / temp1.x;

            // tj starts at this thread's own slot and advances through the
            // sNext ring, so at each step the threads of a group address
            // distinct records while accumulating into them.
            for (j = 0; j < GRID; j++)
            {
                float dx = psA[tj].x - apos.x;
                float dy = psA[tj].y - apos.y;
                float dz = psA[tj].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float r = sqrt(r2);

                // Atom I Born Forces and sum
                float r2Inverse = 1.0f / r2;
                float rScaledRadiusJ = r + psA[tj].sr;
                float rInverse = 1.0f / r;
                float l_ij = 1.0f / max(a.x, fabs(r - psA[tj].sr));
                float u_ij = 1.0f / rScaledRadiusJ;
                float l_ij2 = l_ij * l_ij;
                float u_ij2 = u_ij * u_ij;
                float t1 = log(u_ij / l_ij);
                float t2 = (l_ij2 - u_ij2);
                float t3 = t2 * rInverse;
                t1 *= rInverse;

                // Born Forces term
                float term = 0.125f * (1.000f + psA[tj].sr2 * r2Inverse) * t3 + 0.250f * t1 * r2Inverse;
                float dE = fb * term;

                // Born sum term (disabled)
//              term = l_ij - u_ij +
//                     -0.25f * r * t2 +
//                     0.50f * t1 +
//                     (0.25f * psA[tj].sr2) * t3;
//              if (a.x < (psA[tj].sr - r))
//              {
//                  term += 2.0f * ((1.0f / a.x) - l_ij);
//              }
                if (a.x >= rScaledRadiusJ)
                {
                    dE = /*term =*/ 0.0f;
                }
                float d = dx * dE;
                af.x -= d;
                psA[tj].fx += d;
                d = dy * dE;
                af.y -= d;
                psA[tj].fy += d;
                d = dz * dE;
                af.z -= d;
                psA[tj].fz += d;
//              sum += term;

                // Atom J Born Forces and sum
                // Same expressions with the roles swapped: the staged atom's r
                // and fb, and this thread's a.y / sr2.
                float rScaledRadiusI = r + a.y;
                l_ij = 1.0f / max(psA[tj].r, fabs(r - a.y));
                u_ij = 1.0f / rScaledRadiusI;
                l_ij2 = l_ij * l_ij;
                u_ij2 = u_ij * u_ij;
                t1 = log(u_ij / l_ij);
                t2 = (l_ij2 - u_ij2);
                t3 = t2 * rInverse;
                t1 *= rInverse;

                // Born Forces term
                term = 0.125f * (1.000f + sr2 * r2Inverse) * t3 + 0.250f * t1 * r2Inverse;
                dE = psA[tj].fb * term;

                // Born sum term (disabled)
//              term = l_ij - u_ij +
//                     -0.25f * r * t2 +
//                     0.50f * t1 +
//                     (0.25f * sr2) * t3;
//
//              if (psA[tj].r < (a.y - r))
//              {
//                  term += 2.0f * (psA[tj].oneOverR - l_ij);
//              }
                if (psA[tj].r >= rScaledRadiusI)
                {
                    dE = /*term =*/ 0.0f;
                }
                dx *= dE;
                dy *= dE;
                dz *= dE;
                psA[tj].fx += dx;
                psA[tj].fy += dy;
                psA[tj].fz += dz;
                af.x -= dx;
                af.y -= dy;
                af.z -= dz;
//              psA[tj].sum += term;
                tj = sNext[tj];
            }

            // Write results
            // First store: this thread's register totals for its atom of tile x.
            int offset = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
            float4 of;
            of.x = af.x;
            of.y = af.y;
            of.z = af.z;
            of.w = 0.0f;
            cSim.pForce4b[offset] = of;
//          cSim.pBornSum[offset] = sum;

            // Second store: the totals gathered in shared memory for the atom
            // of tile y that this thread loaded (of.w is still 0).
            offset = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
            of.x = sA[threadIdx.x].fx;
            of.y = sA[threadIdx.x].fy;
            of.z = sA[threadIdx.x].fz;
            cSim.pForce4b[offset] = of;
//          cSim.pBornSum[offset] = sA[threadIdx.x].sum;
        }
        sA[threadIdx.x].pos -= cSim.bornForce2_workBlock;
    }
}
// Kernel variant defined in another source file; launched below when the
// device reports sm_version >= SM_12.
__global__ extern void kCalculateObcGbsaForces2_12_kernel();

// Host-side launcher for the second OBC/GBSA force pass. Picks the kernel
// variant from gpu->sm_version and launches it with the bornForce2 block count
// and threads-per-block stored in gpu->sim, then checks the launch through
// LAUNCHERROR.
void kCalculateObcGbsaForces2(gpuContext gpu)
{
    //printf("kCalculateObcGbsaForces2\n");
    const dim3 grid(gpu->sim.bornForce2_blocks);
    const dim3 block(gpu->sim.bornForce2_threads_per_block);

    if (gpu->sm_version >= SM_12)
    {
        kCalculateObcGbsaForces2_12_kernel<<<grid, block>>>();
    }
    else
    {
        kCalculateObcGbsaForces2_kernel<<<grid, block>>>();
    }

    // Disabled debugging aid: reduce the buffers and dump the loop-1 state.
    if (0)
    {
        static int step = 0;
        //int numPrint = -1;
        step++;
        //WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
        //gpuDumpCoordinates( gpu );
        kReduceBornSumAndForces(gpu);
        gpuDumpObcLoop1(gpu);
    }

    LAUNCHERROR("kCalculateObcGbsaForces2");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Per-thread staging record used by kCalculateObcGbsaForces2_12_kernel.
// The kernel fills the atom data from pPosq / pObcData / pBornForce and
// accumulates pair contributions for that atom in fx/fy/fz.
struct Atom
{
    float x, y, z;      // position (pPosq .x/.y/.z)
    float r;            // pObcData .x
    float sr;           // pObcData .y
    float sr2;          // sr * sr
    float fx, fy, fz;   // accumulated force components
    float fb;           // Born force read from pBornForce
//  float sum;
};
// Shared-memory staging records; the kernel indexes this array by threadIdx.x.
__shared__ Atom sA[GT2XX_BORNFORCE2_THREADS_PER_BLOCK];
// Block-local copy of this block's slice of cSim.pWorkUnit, loaded at kernel start.
// NOTE(review): sized with the NONBOND constant although the kernel loads up to
// bf2WorkUnitsPerBlock + 1 entries - confirm the bound is sufficient.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Successor table: the kernel sets entry k to (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];

// Constant-memory copy of the simulation parameters for this translation unit;
// written by SetCalculateObcGbsaForces2_12Sim and read back by
// GetCalculateObcGbsaForces2_12Sim.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory symbol cSim. The copy status is handed to RTERROR together
// with a fixed message (RTERROR is defined elsewhere; presumably it reports
// and aborts on a non-success status - confirm).
void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory symbol cSim back into the host-side
// structure gpu->sim. The copy status is handed to RTERROR (defined elsewhere;
// presumably it reports and aborts on a non-success status - confirm).
//
// The failure message now names the Get path; it previously said "SetSim",
// which pointed at the wrong function when diagnosing a failed read-back.
void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Second OBC/GBSA force pass over tiles of GRID x GRID atom pairs; this is the
// variant the host launcher selects when sm_version is not below SM_12.
//
// Work distribution:
//   * Each block takes a contiguous slice of cSim.pWorkUnit holding
//     bf2WorkUnitsPerBlock entries, plus one extra entry for the first
//     bf2WorkUnitsPerBlockRemainder blocks, and caches it in sWorkUnit.
//   * Each group of threads sharing the same (threadIdx.x >> GRIDBITS) starts
//     at its own entry counted back from the end of the slice and steps back
//     by cSim.bornForce2_workBlock entries per iteration.
//   * A work unit packs the tile coordinates: bits 17 and up give x, bits 2..16
//     give y; both are shifted left by GRIDBITS to obtain the first atom index.
//
// Results: force components are stored as float4 (w = 0) to cSim.pForce4b at
// atomIndex + (otherTileOrigin >> GRIDBITS) * cSim.stride.
//
// NOTE(review): threads read and update shared-memory records owned by other
// threads of the same group without a barrier inside the tile loop; this
// presumably relies on GRID == 1 << GRIDBITS matching the hardware warp
// width - confirm.
__global__ void kCalculateObcGbsaForces2_12_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
    int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Ring successor table: k -> (k + 1) modulo GRID (GRID assumed power of two).
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        unsigned int tgx = threadIdx.x & (GRID - 1);    // index within the thread group
        unsigned int i = x + tgx;
        float4 apos = cSim.pPosq[i];
        float2 a = cSim.pObcData[i];                    // a.x is stored as Atom::r, a.y as Atom::sr
        float fb = cSim.pBornForce[i];
        unsigned int tbx = threadIdx.x - tgx;           // first thread index of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // this group's GRID records

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            float3 af;
            sA[threadIdx.x].fx = af.x = 0.0f;
            sA[threadIdx.x].fy = af.y = 0.0f;
            sA[threadIdx.x].fz = af.z = 0.0f;
//          float sum = 0.0f;
            sA[threadIdx.x].x = apos.x;
            sA[threadIdx.x].y = apos.y;
            sA[threadIdx.x].z = apos.z;
//          float oneOverR = 1.0f / a.x;
            sA[threadIdx.x].r = a.x;
            sA[threadIdx.x].sr = a.y;
            sA[threadIdx.x].sr2 = a.y * a.y;
            sA[threadIdx.x].fb = fb;

            // Walk the sNext ring starting after this thread's own slot and
            // stopping when it comes back, so the self pair (j == tgx) is
            // never evaluated.
            for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
            {
                float dx = psA[j].x - apos.x;
                float dy = psA[j].y - apos.y;
                float dz = psA[j].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float r = sqrt(r2);

                // Atom I Born forces and sum
                float rScaledRadiusJ = r + psA[j].sr;
                float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
                float u_ij = 1.0f / rScaledRadiusJ;
                float rInverse = 1.0f / r;
                float l_ij2 = l_ij * l_ij;
                float u_ij2 = u_ij * u_ij;
                float r2Inverse = rInverse * rInverse;
                float t1 = log(u_ij / l_ij);
                float t2 = (l_ij2 - u_ij2);
                float t3 = t2 * rInverse;
                t1 *= rInverse;

                // Born Forces term
                float term = 0.125f * (1.000f + psA[j].sr2 * r2Inverse) * t3 + 0.250f * t1 * r2Inverse;
                float dE = fb * term;

                // Born sum term (disabled)
//              term = l_ij - u_ij +
//                     -0.25f * r * t2 +
//                     0.50f * t1 +
//                     (0.25f * psA[j].sr2) * t3;
//              if (a.x < (psA[j].sr - r))
//              {
//                  term += 2.0f * (oneOverR - l_ij);
//              }

                // No contribution when a.x is at least r + sr of the other atom.
                if (a.x >= rScaledRadiusJ)
                {
                    dE = /*term =*/ 0.0f;
                }

                // Apply equal and opposite contributions: own registers get
                // -d, the other atom's shared record gets +d.
                float d = dx * dE;
                af.x -= d;
                psA[j].fx += d;
                d = dy * dE;
                af.y -= d;
                psA[j].fy += d;
                d = dz * dE;
                af.z -= d;
                psA[j].fz += d;
//              sum += term;
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            float4 of;
            // Register total plus what other threads added to this thread's record.
            of.x = af.x + sA[threadIdx.x].fx;
            of.y = af.y + sA[threadIdx.x].fy;
            of.z = af.z + sA[threadIdx.x].fz;
            of.w = 0.0f;
            cSim.pForce4b[offset] = of;
//          cSim.pBornSum[offset] = sum;
        }
        else
        {
            // Read fixed atom data into registers and GRF
            int j = y + tgx;
            float4 temp = cSim.pPosq[j];
            float2 temp1 = cSim.pObcData[j];
            sA[threadIdx.x].fb = cSim.pBornForce[j];
            float3 af;
            sA[threadIdx.x].fx = af.x = 0.0f;
            sA[threadIdx.x].fy = af.y = 0.0f;
            sA[threadIdx.x].fz = af.z = 0.0f;
//          sA[threadIdx.x].sum = 0.0f;
//          float sum = 0.0f;
            float sr2 = a.y * a.y;
//          float oneOverR = 1.0f / a.x;
            sA[threadIdx.x].x = temp.x;
            sA[threadIdx.x].y = temp.y;
            sA[threadIdx.x].z = temp.z;
            sA[threadIdx.x].r = temp1.x;
            sA[threadIdx.x].sr = temp1.y;
            sA[threadIdx.x].sr2 = temp1.y * temp1.y;

            // tj starts at this thread's own slot and advances through the
            // sNext ring, so at each step the threads of a group address
            // distinct records while accumulating into them.
            for (j = 0; j < GRID; j++)
            {
                float dx = psA[tj].x - apos.x;
                float dy = psA[tj].y - apos.y;
                float dz = psA[tj].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float r = sqrt(r2);

                // Interleaved Atom I and J Born Forces and sum components
                // (variables suffixed J use the staged atom's sr with this
                // thread's a.x; those suffixed I use this thread's a.y with
                // the staged atom's r).
                float r2Inverse = 1.0f / r2;
                float rScaledRadiusJ = r + psA[tj].sr;
                float rScaledRadiusI = r + a.y;
                float rInverse = 1.0f / r;
                float l_ijJ = 1.0f / max(a.x, fabs(r - psA[tj].sr));
                float l_ijI = 1.0f / max(psA[tj].r, fabs(r - a.y));
                float u_ijJ = 1.0f / rScaledRadiusJ;
                float u_ijI = 1.0f / rScaledRadiusI;
                float l_ij2J = l_ijJ * l_ijJ;
                float l_ij2I = l_ijI * l_ijI;
                float u_ij2J = u_ijJ * u_ijJ;
                float u_ij2I = u_ijI * u_ijI;
                float t1J = log(u_ijJ / l_ijJ);
                float t1I = log(u_ijI / l_ijI);
                float t2J = (l_ij2J - u_ij2J);
                float t2I = (l_ij2I - u_ij2I);
                float t3J = t2J * rInverse;
                float t3I = t2I * rInverse;
                t1J *= rInverse;
                t1I *= rInverse;

                // Born Forces term
                float term = 0.125f * (1.000f + psA[tj].sr2 * r2Inverse) * t3J + 0.250f * t1J * r2Inverse;
                float dE = fb * term;

                // Atom I Born sum term (disabled)
//              term = l_ijJ - u_ijJ +
//                     -0.25f * r * t2J +
//                     0.50f * t1J +
//                     (0.25f * psA[tj].sr2) * t3J;
//              if (a.x < (psA[tj].sr - r))
//              {
//                  term += 2.0f * (oneOverR - l_ijJ);
//              }
                if (a.x >= rScaledRadiusJ)
                {
                    dE = /*term =*/ 0.0f;
                }
                float d = dx * dE;
                af.x -= d;
                psA[tj].fx += d;
                d = dy * dE;
                af.y -= d;
                psA[tj].fy += d;
                d = dz * dE;
                af.z -= d;
                psA[tj].fz += d;
//              sum += term;

                // Atom J Born sum term
                term = 0.125f * (1.000f + sr2 * r2Inverse) * t3I + 0.250f * t1I * r2Inverse;
                dE = psA[tj].fb * term;
//              term = l_ijI - u_ijI +
//                     -0.25f * r * t2I +
//                     0.50f * t1I +
//                     (0.25f * sr2) * t3I;
//              if (psA[tj].r < (a.y - r))
//              {
//                  term += 2.0f * ((1.0f / psA[tj].r) - l_ijI);
//              }
                if (psA[tj].r >= rScaledRadiusI)
                {
                    dE = /*term =*/ 0.0f;
                }
                dx *= dE;
                dy *= dE;
                dz *= dE;
                psA[tj].fx += dx;
                psA[tj].fy += dy;
                psA[tj].fz += dz;
                af.x -= dx;
                af.y -= dy;
                af.z -= dz;
//              psA[tj].sum += term;
                tj = sNext[tj];
            }

            // Write results
            // First store: this thread's register totals for its atom of tile x.
            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
            float4 of;
            of.x = af.x;
            of.y = af.y;
            of.z = af.z;
            of.w = 0.0f;
            cSim.pForce4b[offset] = of;
//          cSim.pBornSum[offset] = sum;

            // Second store: the totals gathered in shared memory for the atom
            // of tile y that this thread loaded (of.w is still 0).
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            of.x = sA[threadIdx.x].fx;
            of.y = sA[threadIdx.x].fy;
            of.z = sA[threadIdx.x].fz;
            cSim.pForce4b[offset] = of;
//          cSim.pBornSum[offset] = sA[threadIdx.x].sum;
        }
        pos -= cSim.bornForce2_workBlock;
    }
}
// Host-side launcher for kCalculateObcGbsaForces2_12_kernel, using the
// bornForce2 block count and threads-per-block stored in gpu->sim; the launch
// is then checked through LAUNCHERROR.
void kCalculateObcGbsaForces2_12(gpuContext gpu)
{
//    printf("kCalculateObcGbsaForces2_12\n");
    const dim3 grid(gpu->sim.bornForce2_blocks);
    const dim3 block(gpu->sim.bornForce2_threads_per_block);
    kCalculateObcGbsaForces2_12_kernel<<<grid, block>>>();
    LAUNCHERROR("kCalculateObcGbsaForces2_12");
}
platforms/cuda/src/kernels/kForces.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Absolute value as a macro. The argument appears twice in the expansion, so
// it is evaluated twice: do not pass expressions with side effects. Any
// argument that is not > 0.0f (including 0.0f itself) takes the negated branch.
#define FABS(a) ((a) > 0.0f ? (a) : -(a))

// Constant-memory copy of the simulation parameters for this translation unit;
// written by SetForcesSim and read back by GetForcesSim.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory symbol cSim. The copy status is handed to RTERROR together
// with a fixed message (RTERROR is defined elsewhere; presumably it reports
// and aborts on a non-success status - confirm).
void SetForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory symbol cSim back into the host-side
// structure gpu->sim. The copy status is handed to RTERROR (defined elsewhere;
// presumably it reports and aborts on a non-success status - confirm).
//
// The failure message now names the Get path; it previously said "SetSim",
// which pointed at the wrong function when diagnosing a failed read-back.
void GetForcesSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Zeroes the force output buffers. cSim.pForce4 is treated as a flat float
// array of cSim.stride4 * cSim.outputBuffers elements; each thread starts at
// its global index and advances by the total number of launched threads.
__global__ void kClearForces_kernel()
{
    float* const pFlat = (float*) cSim.pForce4;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < cSim.stride4 * cSim.outputBuffers;
         idx += gridDim.x * blockDim.x)
    {
        pFlat[idx] = 0.0f;
    }
}
// Host-side launcher for kClearForces_kernel: gpu->sim.blocks blocks of a
// fixed 384 threads each, followed by the LAUNCHERROR check.
void kClearForces(gpuContext gpu)
{
//    printf("kClearForces\n");
    const dim3 grid(gpu->sim.blocks);
    const dim3 block(384);
    kClearForces_kernel<<<grid, block>>>();
    LAUNCHERROR("kClearForces");
}
// Zeroes the Born-force output buffers. cSim.pBornForce is treated as a flat
// float array of cSim.stride * cSim.nonbondOutputBuffers elements; each thread
// starts at its global index and advances by the total number of launched
// threads.
__global__ void kClearBornForces_kernel()
{
    float* const pFlat = (float*) cSim.pBornForce;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < cSim.stride * cSim.nonbondOutputBuffers;
         idx += gridDim.x * blockDim.x)
    {
        pFlat[idx] = 0.0f;
    }
}
// Host-side launcher for kClearBornForces_kernel: gpu->sim.blocks blocks of a
// fixed 384 threads each, followed by the LAUNCHERROR check.
void kClearBornForces(gpuContext gpu)
{
//    printf("kClearBornForces\n");
    const dim3 grid(gpu->sim.blocks);
    const dim3 block(384);
    kClearBornForces_kernel<<<grid, block>>>();
    LAUNCHERROR("kClearBornForces");
}
// Two reductions driven by one running index.
//
// Phase 1 (idx < cSim.stride4): sums the cSim.outputBuffers copies of each
// float in cSim.pForce4 (copies are cSim.stride4 floats apart) and stores the
// total back into the first copy.
//
// Phase 2 continues with the same idx, now offset by cSim.stride4: for every
// atom it sums the cSim.nonbondOutputBuffers copies of the Born sum (copies
// are cSim.stride floats apart), stores the total, and derives the Born radius
// and OBC chain term from it.
__global__ void kReduceBornSumAndForces_kernel()
{
    unsigned int idx = (blockIdx.x * blockDim.x + threadIdx.x);

    // Phase 1: reduce forces
    for (; idx < cSim.stride4; idx += gridDim.x * blockDim.x)
    {
        float* pBuf = (float*) cSim.pForce4 + idx;
        float total = 0.0f;
        int remaining = cSim.outputBuffers;

        // Four buffers per step, then a pair, then a possible single leftover.
        for (; remaining >= 4; remaining -= 4)
        {
            float b0 = pBuf[0];
            float b1 = pBuf[cSim.stride4];
            float b2 = pBuf[2 * cSim.stride4];
            float b3 = pBuf[3 * cSim.stride4];
            pBuf += 4 * cSim.stride4;
            total += b0 + b1 + b2 + b3;
        }
        if (remaining >= 2)
        {
            float b0 = pBuf[0];
            float b1 = pBuf[cSim.stride4];
            pBuf += 2 * cSim.stride4;
            total += b0 + b1;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            total += *pBuf;
        }

        ((float*) cSim.pForce4)[idx] = total;
    }

    // Phase 2: reduce Born sum; idx carries on from where phase 1 left off,
    // so (idx - cSim.stride4) is the atom index handled by this thread.
    for (; idx - cSim.stride4 < cSim.atoms; idx += gridDim.x * blockDim.x)
    {
        float bornSum = 0.0f;
        float* pBuf = cSim.pBornSum + idx - cSim.stride4;
        float2 obc = cSim.pObcData[idx - cSim.stride4];

        // Gather the per-buffer partial sums
        int remaining = cSim.nonbondOutputBuffers;
        for (; remaining >= 4; remaining -= 4)
        {
            float b0 = pBuf[0];
            float b1 = pBuf[cSim.stride];
            float b2 = pBuf[2 * cSim.stride];
            float b3 = pBuf[3 * cSim.stride];
            pBuf += 4 * cSim.stride;
            bornSum += b0 + b1 + b2 + b3;
        }
        if (remaining >= 2)
        {
            float b0 = pBuf[0];
            float b1 = pBuf[cSim.stride];
            pBuf += 2 * cSim.stride;
            bornSum += b0 + b1;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            bornSum += *pBuf;
        }

        // Store the raw total, then compute Born radius and OBC chain term.
        cSim.pBornSum[idx - cSim.stride4] = bornSum;
        bornSum *= 0.5f * obc.x;
        float bornSum2 = bornSum * bornSum;
        float bornSum3 = bornSum * bornSum2;
        float tanhSum = tanh(cSim.alphaOBC * bornSum - cSim.betaOBC * bornSum2 + cSim.gammaOBC * bornSum3);
        float nonOffsetRadii = obc.x + cSim.dielectricOffset;
        float radius = 1.0f / (1.0f / obc.x - tanhSum / nonOffsetRadii);
        float chain = obc.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * bornSum + 3.0f * cSim.gammaOBC * bornSum2);
        chain = (1.0f - tanhSum * tanhSum) * chain / nonOffsetRadii;
        cSim.pBornRadii[idx - cSim.stride4] = radius;
        cSim.pObcChain[idx - cSim.stride4] = chain;
    }
}
// Host-side launcher for kReduceBornSumAndForces_kernel: gpu->sim.blocks
// blocks of gpu->sim.bsf_reduce_threads_per_block threads, followed by the
// LAUNCHERROR check. The preprocessor-disabled section is a debugging dump of
// the reduced forces.
void kReduceBornSumAndForces(gpuContext gpu)
{
    //printf("kReduceBornSumAndForces\n");
    const dim3 grid(gpu->sim.blocks);
    const dim3 block(gpu->sim.bsf_reduce_threads_per_block);
    kReduceBornSumAndForces_kernel<<<grid, block>>>();
    LAUNCHERROR("kReduceBornSumAndForces");

#if 0
    //gpuDumpObcLoop1( gpu );
    /*
    gpu->psForce4->Download();
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %12.6f %12.6f %12.6f\n", i,
            gpu->psForce4->_pSysStream[0][i].x,
            gpu->psForce4->_pSysStream[0][i].y,
            gpu->psForce4->_pSysStream[0][i].z
        );
    } */
#endif
}
// Collapses the per-buffer force contributions into the first output buffer.
//
// cSim.pForce4 is addressed here as a flat float array in which consecutive
// output buffers lie cSim.stride4 floats apart.  For every float index below
// cSim.stride4 the values found at that index in each of the
// cSim.outputBuffers buffers are added up and the total is written back to the
// first buffer.  Each thread starts at its global thread index and advances by
// the total number of launched threads.
__global__ void kReduceForces_kernel()
{
    const unsigned int threadCount = gridDim.x * blockDim.x;

    for (unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < cSim.stride4; pos += threadCount)
    {
        float* const pTotal = (float*)cSim.pForce4 + pos;   // slot in the first buffer
        float* pBuffer      = pTotal;                       // walks from buffer to buffer
        float sum           = 0.0f;
        int remaining       = cSim.outputBuffers;

        // Consume the buffers four at a time...
        for (; remaining >= 4; remaining -= 4)
        {
            float a = *pBuffer;
            pBuffer += cSim.stride4;
            float b = *pBuffer;
            pBuffer += cSim.stride4;
            float c = *pBuffer;
            pBuffer += cSim.stride4;
            float d = *pBuffer;
            pBuffer += cSim.stride4;
            sum += a + b + c + d;
        }

        // ...then a remaining pair...
        if (remaining >= 2)
        {
            float a = *pBuffer;
            pBuffer += cSim.stride4;
            float b = *pBuffer;
            pBuffer += cSim.stride4;
            sum += a + b;
            remaining -= 2;
        }

        // ...and finally a single leftover buffer.
        if (remaining > 0)
        {
            sum += *pBuffer;
        }

        *pTotal = sum;
    }
}
// Host-side launcher for kReduceForces_kernel.
//
// Uses gpu->sim.blocks blocks of gpu->sim.bsf_reduce_threads_per_block threads
// and then invokes LAUNCHERROR for the launch.
void kReduceForces(gpuContext gpu)
{
//    printf("kReduceForces\n");
    const dim3 grid(gpu->sim.blocks);
    const dim3 threads(gpu->sim.bsf_reduce_threads_per_block);

    kReduceForces_kernel<<<grid, threads>>>();
    LAUNCHERROR("kReduceForces");
}
platforms/cuda/src/kernels/kRandom.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Copy of the simulation description held in constant memory for the kernels
// of this translation unit.
static __constant__ cudaGmxSimulation cSim;

// Uploads gpu->sim into this file's constant-memory copy (cSim).
// The result of the copy is handed to RTERROR together with a diagnostic text.
void SetRandomSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory copy of the simulation description
// (cSim) back into gpu->sim.
// The result of the copy is handed to RTERROR; the diagnostic text names the
// "Get" direction so a failure here is not mistaken for one in SetRandomSim.
void GetRandomSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Dynamically sized scratch area: every thread uses the two entries
// sRand[threadIdx.x] and sRand[threadIdx.x + blockDim.x].
extern __shared__ float3 sRand[];

// Advances the per-thread generator by one step and returns the next raw
// 32-bit value.
//
// One step consists of a linear congruential update of state.x, a xorshift
// update of state.y and a carry-based recurrence on (state.z, state.w); the
// returned value is the sum state.x + state.y + state.w after the update.
static __device__ unsigned int advanceGenerator(uint4& state, unsigned int& carry)
{
    state.x = state.x * 69069 + 1;
    state.y ^= state.y << 13;
    state.y ^= state.y >> 17;
    state.y ^= state.y << 5;
    unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
    unsigned int m = state.w + state.w + state.z + carry;
    state.z = state.w;
    state.w = m;
    carry = k >> 30;
    return state.x + state.y + state.w;
}

// Produces one normally distributed value with the Box-Muller transform,
// consuming two generator steps.
static __device__ float nextGaussian(uint4& state, unsigned int& carry)
{
    // The first uniform is clamped to at least 1 so that log() never sees zero.
    float radius = (float)max(advanceGenerator(state, carry), 0x00000001) / (float)0xffffffff;
    radius = sqrt(-2.0f * log(radius));
    float angle = (float)advanceGenerator(state, carry) / (float)0xffffffff;
    return radius * cos(2.0f * 3.14159265f * angle);
}

// Refills the random-number streams cSim.pRandom4a / cSim.pRandom2a.
//
// Each thread loads its own generator state from cSim.pRandomSeed, then for
// every output index it owns (global thread index, advancing by the total
// thread count, below cSim.totalRandomsTimesTwo) draws six Gaussian values,
// scales them and stores them as one float4 plus one float2.  Indices below
// cSim.totalRandoms are scaled with (cSim.Yv, cSim.V), the remaining ones with
// (cSim.Yx, cSim.X).  The generator state is written back at the end; the
// carry value always starts at zero and is not stored.
__global__ void kGenerateRandoms_kernel()
{
    const unsigned int seedIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int increment = blockDim.x * gridDim.x;

    // Read generator state
    uint4 state        = cSim.pRandomSeed[seedIndex];
    unsigned int carry = 0;

    for (unsigned int pos = seedIndex; pos < cSim.totalRandomsTimesTwo; pos += increment)
    {
        // Generate 6 randoms: three into each of this thread's two slots
        unsigned int slot = threadIdx.x;
        for (int i = 0; i < 2; i++)
        {
            sRand[slot].x = nextGaussian(state, carry);
            sRand[slot].y = nextGaussian(state, carry);
            sRand[slot].z = nextGaussian(state, carry);
            slot += blockDim.x;
        }

        // Pick the scale factors for this half of the output range
        float c1, c2;
        if (pos < cSim.totalRandoms)
        {
            c1 = cSim.Yv;
            c2 = cSim.V;
        }
        else
        {
            c1 = cSim.Yx;
            c2 = cSim.X;
        }

        // Output final randoms
        const float3 first  = sRand[threadIdx.x];
        const float3 second = sRand[threadIdx.x + blockDim.x];

        float4 random4;
        random4.x = c1 * first.x;
        random4.y = c1 * first.y;
        random4.z = c1 * first.z;
        random4.w = c2 * second.x;
        cSim.pRandom4a[pos] = random4;

        float2 random2;
        random2.x = c2 * second.y;
        random2.y = c2 * second.z;
        cSim.pRandom2a[pos] = random2;
    }

    // Write generator state
    cSim.pRandomSeed[seedIndex] = state;
}
// Host-side launcher for kGenerateRandoms_kernel.
//
// The kernel runs with gpu->sim.blocks blocks of
// gpu->sim.random_threads_per_block threads and is given two float3 entries of
// dynamic shared memory per thread, matching the two sRand slots each thread
// indexes.  The launch is followed by LAUNCHERROR, like the other kernel
// launchers, so that a failed launch is reported instead of silently leaving
// the random streams stale.
void kGenerateRandoms(gpuContext gpu)
{
    kGenerateRandoms_kernel<<<gpu->sim.blocks, gpu->sim.random_threads_per_block, gpu->sim.random_threads_per_block * 2 * sizeof(float3)>>>();
    LAUNCHERROR("kGenerateRandoms");
}
\ No newline at end of file
platforms/cuda/src/kernels/kUpdateShakeH.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#define DeltaShake
#include "gputypes.h"
// Per-thread scratch record used by the SHAKE kernels; one instance per
// thread lives in shared memory.
struct Atom
{
    float3 rij1;        // old-position vector from the central atom to partner 1
    float3 rij2;        // old-position vector from the central atom to partner 2
    float3 rij3;        // old-position vector from the central atom to partner 3
    float  M;           // filled from the .y component of the SHAKE parameters
                        // (scales the correction; presumably a reduced-mass factor - confirm)
    float  d2;          // filled from the .z component of the SHAKE parameters
                        // (presumably the squared constraint length - confirm)
    float  InvMassI;    // filled from the .x component of the SHAKE parameters
                        // (weights the correction applied to the central atom)
    float  rij1sq;      // squared length of rij1
    float  rij2sq;      // squared length of rij2
    float  rij3sq;      // squared length of rij3
};
// Copy of the simulation description held in constant memory for the kernels
// of this translation unit.
static __constant__ cudaGmxSimulation cSim;

// Uploads gpu->sim into this file's constant-memory copy (cSim).
// The result of the copy is handed to RTERROR together with a diagnostic text.
void SetUpdateShakeHSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory copy of the simulation description
// (cSim) back into gpu->sim.
// The result of the copy is handed to RTERROR; the diagnostic text names the
// "Get" direction so a failure here is not mistaken for one in
// SetUpdateShakeHSim.
void GetUpdateShakeHSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// First half of the integrator update, without centre-of-mass correction.
//
// For every atom (global thread index, advancing by the total thread count,
// below cSim.atoms) the kernel
//   * builds a new vVector from the random streams and stores it,
//   * updates the velocity from the old velocity, the force, vVector and the
//     stored xVector,
//   * saves the current position to cSim.pOldPosq, and
//   * writes the trial position to cSim.pPosqP.  With DeltaShake defined only
//     the displacement velocity * cSim.fix1 is stored; otherwise the
//     displacement is added to the current position.
// The .w component of cSim.pVelm4 is used as the inverse mass.
__global__ void kUpdatePart1_kernel()
{
    // Offset of this block's slice of the random streams
    const unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    __syncthreads();

    const unsigned int threadCount = blockDim.x * gridDim.x;
    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; pos < cSim.atoms; pos += threadCount)
    {
        float4 velocity   = cSim.pVelm4[pos];
        float4 xVector    = cSim.pxVector4[pos];
        float4 random4a   = cSim.pRandom4a[rpos + pos];
        float2 random2a   = cSim.pRandom2a[rpos + pos];
        float4 apos       = cSim.pPosq[pos];
        float4 force      = cSim.pForce4[pos];
        float sqrtInvMass = sqrt(velocity.w);

        float3 Vmh;
        Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
        Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
        Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;

        float4 vVector;
        vVector.x = sqrtInvMass * random4a.w;
        vVector.y = sqrtInvMass * random2a.x;
        vVector.z = sqrtInvMass * random2a.y;
        vVector.w = 0.0f;
        cSim.pvVector4[pos] = vVector;

        velocity.x = velocity.x * cSim.EM + velocity.w * force.x * cSim.TauOneMinusEM + vVector.x - cSim.EM * Vmh.x;
        velocity.y = velocity.y * cSim.EM + velocity.w * force.y * cSim.TauOneMinusEM + vVector.y - cSim.EM * Vmh.y;
        velocity.z = velocity.z * cSim.EM + velocity.w * force.z * cSim.TauOneMinusEM + vVector.z - cSim.EM * Vmh.z;

        // Keep the pre-update position for the constraint kernels
        cSim.pOldPosq[pos] = apos;
#ifndef DeltaShake
        apos.x += velocity.x * cSim.fix1;
        apos.y += velocity.y * cSim.fix1;
        apos.z += velocity.z * cSim.fix1;
#else
        apos.x = velocity.x * cSim.fix1;
        apos.y = velocity.y * cSim.fix1;
        apos.z = velocity.z * cSim.fix1;
#endif
        cSim.pPosqP[pos] = apos;
        cSim.pVelm4[pos] = velocity;
    }
}
// First half of the integrator update, with centre-of-mass correction.
//
// Every block first sums all gridDim.x entries of cSim.pLinearMomentum (the
// per-block values written by kUpdatePart2CM_kernel) into sCM[0] using a
// shared-memory tree reduction.  The per-atom work is then the same as in
// kUpdatePart1_kernel, except that sCM[0] is subtracted from each updated
// velocity.  The launch must provide one float3 of dynamic shared memory per
// thread.
__global__ void kUpdatePart1CM_kernel()
{
    extern __shared__ float3 sCM[];
    const unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    float3 CM               = {0.0f, 0.0f, 0.0f};
    float4 CM1              = {0.0f, 0.0f, 0.0f, 0.0f};

    // Read CM outputs from previous step
    unsigned int cpos = threadIdx.x;
#if 0
    float4 CM2 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float4 CM3 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float4 CM4 = { 0.0f, 0.0f, 0.0f, 0.0f };
    if (cpos < gridDim.x)
        CM1 = cSim.pLinearMomentum[cpos];
    cpos += gridDim.x;
    if (cpos < gridDim.x)
        CM2 = cSim.pLinearMomentum[cpos];
    cpos += gridDim.x;
    if (cpos < gridDim.x)
        CM3 = cSim.pLinearMomentum[cpos];
    cpos += gridDim.x;
    if (cpos < gridDim.x)
        CM4 = cSim.pLinearMomentum[cpos];
    sCM[threadIdx.x].x = CM1.x + CM2.x + CM3.x + CM4.x;
    sCM[threadIdx.x].y = CM1.y + CM2.y + CM3.y + CM4.y;
    sCM[threadIdx.x].z = CM1.z + CM2.z + CM3.z + CM4.z;
#else
    // Each thread accumulates the entries cpos, cpos + blockDim.x, ...
    for (; cpos < gridDim.x; cpos += blockDim.x)
    {
        CM1 = cSim.pLinearMomentum[cpos];
        CM.x += CM1.x;
        CM.y += CM1.y;
        CM.z += CM1.z;
    }
    sCM[threadIdx.x] = CM;
#endif
    __syncthreads();

    // Reduce CM: after the loop sCM[0] holds the sum over the whole block
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        __syncthreads();
    }

    const unsigned int threadCount = blockDim.x * gridDim.x;
    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; pos < cSim.atoms; pos += threadCount)
    {
        float4 velocity   = cSim.pVelm4[pos];
        float4 xVector    = cSim.pxVector4[pos];
        float4 random4a   = cSim.pRandom4a[rpos + pos];
        float2 random2a   = cSim.pRandom2a[rpos + pos];
        float4 apos       = cSim.pPosq[pos];
        float4 force      = cSim.pForce4[pos];
        float sqrtInvMass = sqrt(velocity.w);

        float3 Vmh;
        Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
        Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
        Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;

        float4 vVector;
        vVector.x = sqrtInvMass * random4a.w;
        vVector.y = sqrtInvMass * random2a.x;
        vVector.z = sqrtInvMass * random2a.y;
        vVector.w = 0.0f;
        cSim.pvVector4[pos] = vVector;

        // Same update as kUpdatePart1_kernel, minus the summed CM term
        velocity.x = velocity.x * cSim.EM + velocity.w * force.x * cSim.TauOneMinusEM + vVector.x - cSim.EM * Vmh.x - sCM[0].x;
        velocity.y = velocity.y * cSim.EM + velocity.w * force.y * cSim.TauOneMinusEM + vVector.y - cSim.EM * Vmh.y - sCM[0].y;
        velocity.z = velocity.z * cSim.EM + velocity.w * force.z * cSim.TauOneMinusEM + vVector.z - cSim.EM * Vmh.z - sCM[0].z;

        // Keep the pre-update position for the constraint kernels
        cSim.pOldPosq[pos] = apos;
#ifndef DeltaShake
        apos.x += velocity.x * cSim.fix1;
        apos.y += velocity.y * cSim.fix1;
        apos.z += velocity.z * cSim.fix1;
#else
        apos.x = velocity.x * cSim.fix1;
        apos.y = velocity.y * cSim.fix1;
        apos.z = velocity.z * cSim.fix1;
#endif
        cSim.pPosqP[pos] = apos;
        cSim.pVelm4[pos] = velocity;
    }
}
// Host-side driver for the first half of the integrator update.
//
// When gpu->bRemoveCM is set, the centre-of-mass variant of the kernel is
// launched (with one float3 of dynamic shared memory per thread) and the flag
// is cleared; otherwise the plain kernel is launched.  Both launches use
// gpu->sim.blocks blocks of gpu->sim.update_threads_per_block threads and are
// followed by LAUNCHERROR.
void kUpdatePart1(gpuContext gpu)
{
//    printf("kUpdatePart1\n");

    // Disabled debugging aid: dump positions and velocities on the first call.
#if 0
    static int iteration = 0;
    if (iteration == 0)
    {
        gpu->psPosq4->Download();
        gpu->psVelm4->Download();
        printf("# %d atoms\n", gpu->natoms);
        for (int i = 0; i < gpu->natoms; i++)
        {
            printf("%5d %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
                gpu->psPosq4->_pSysStream[0][i].x, gpu->psPosq4->_pSysStream[0][i].y,
                gpu->psPosq4->_pSysStream[0][i].z, gpu->psPosq4->_pSysStream[0][i].w,
                gpu->psVelm4->_pSysStream[0][i].x, gpu->psVelm4->_pSysStream[0][i].y,
                gpu->psVelm4->_pSysStream[0][i].z, gpu->psVelm4->_pSysStream[0][i].w
            );
        }
    }
    iteration++;
#endif

    // Disabled debugging aid: report the temperature every 1000 calls.
#if 0
    static const float KILO = 1e3; // Thousand
    static const float BOLTZMANN = 1.380658e-23f; // (J/K)
    static const float AVOGADRO = 6.0221367e23f; // ()
    static const float RGAS = BOLTZMANN * AVOGADRO; // (J/(mol K))
    static const float BOLTZ = (RGAS / KILO); // (kJ/(mol K))
    static int iteration = 0;
    // Check T
    if (iteration % 1000 == 0)
    {
        gpu->psVelm4->Download();
        float ke = 0.0f;
        for (int i = 0; i < gpu->natoms; i++)
        {
            float vx = gpu->psVelm4->_pSysStream[0][i].x;
            float vy = gpu->psVelm4->_pSysStream[0][i].y;
            float vz = gpu->psVelm4->_pSysStream[0][i].z;
            float m = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
            ke += m * (vx * vx + vy * vy + vz * vz);
        }
        float T = ke / (BOLTZ * gpu->sim.degreesOfFreedom);
        printf("Iteration %d, Temperature is %f\n", iteration, T);
    }
    iteration++;
#endif

    // Common case first: no centre-of-mass correction pending.
    if (!gpu->bRemoveCM)
    {
        kUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kUpdatePart1");
        return;
    }

    kUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
    LAUNCHERROR("kUpdatePart1CM");
    gpu->bRemoveCM = false;

    // Disabled debugging aid: compare host and device momentum sums.
#if 0
    gpu->psLinearMomentum->Download();
    gpu->psVelm4->Download();
    float3 mv = {0.0f, 0.0f, 0.0f};
    for (int i = 0; i < gpu->natoms; i++)
    {
        float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
        mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
        mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
        mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
    }
    mv.x *= gpu->sim.inverseTotalMass;
    mv.y *= gpu->sim.inverseTotalMass;
    mv.z *= gpu->sim.inverseTotalMass;
    float3 mv1 = {0.0f, 0.0f, 0.0f};
    for (int i = 0; i < gpu->sim.blocks; i++)
    {
        mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
        mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
        mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
    }
    printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
#endif
}
// Applies the SHAKE constraints to the trial positions in cSim.pPosqP.
//
// Each entry of cSim.pShakeID describes one constraint group: a central atom
// (.x) bonded to up to three partners (.y, .z, .w; an index of -1 marks an
// unused partner slot, .y is always used).  cSim.pShakeParameter supplies, in
// order, the weight applied to the central atom's correction, the factor M,
// the value d2 and the weight applied to the partners' corrections.
//
// Reference bond vectors are taken from cSim.pOldPosq.  The trial positions
// are then corrected iteratively, for at most 15 sweeps, until every active
// constraint satisfies |ld - 2*rrpr - rpsqij| < d2 * cSim.shakeTolerance.
// With DeltaShake defined the values in pPosqP are used as they are;
// otherwise the old positions are subtracted before the iteration and added
// back afterwards.
//
// NOTE(review): sA has G8X_THREADS_PER_BLOCK entries and is indexed by
// threadIdx.x, so the launch block size is assumed not to exceed that value -
// confirm against the launcher configuration.
__global__ void kApplyFirstShake_kernel()
{
    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
    Atom* psA = &sA[threadIdx.x];       // private scratch record of this thread
    const unsigned int threadCount = blockDim.x * gridDim.x;

    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; pos < cSim.ShakeConstraints; pos += threadCount)
    {
        int4 atomID   = cSim.pShakeID[pos];
        float4 params = cSim.pShakeParameter[pos];

        // The central atom and the first partner are always present
        float4 apos  = cSim.pOldPosq[atomID.x];
        float4 xpi   = cSim.pPosqP[atomID.x];
        float4 apos1 = cSim.pOldPosq[atomID.y];
        float4 xpj1  = cSim.pPosqP[atomID.y];

        psA->InvMassI  = params.x;
        psA->M         = params.y;
        psA->d2        = params.z;
        float invMassJ = params.w;

        // Partners 2 and 3 are optional and default to zero
        float4 apos2 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        float4 xpj2  = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        if (atomID.z != -1)
        {
            apos2 = cSim.pOldPosq[atomID.z];
            xpj2  = cSim.pPosqP[atomID.z];
        }
        float4 apos3 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        float4 xpj3  = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        if (atomID.w != -1)
        {
            apos3 = cSim.pOldPosq[atomID.w];
            xpj3  = cSim.pPosqP[atomID.w];
        }

        float3 xi  = make_float3(apos.x, apos.y, apos.z);
        float3 xj1 = make_float3(apos1.x, apos1.y, apos1.z);
        float3 xj2 = make_float3(apos2.x, apos2.y, apos2.z);
        float3 xj3 = make_float3(apos3.x, apos3.y, apos3.z);

#ifndef DeltaShake
        // Work on displacements relative to the old positions
        xpi.x  -= xi.x;
        xpi.y  -= xi.y;
        xpi.z  -= xi.z;
        xpj1.x -= xj1.x;
        xpj1.y -= xj1.y;
        xpj1.z -= xj1.z;
        xpj2.x -= xj2.x;
        xpj2.y -= xj2.y;
        xpj2.z -= xj2.z;
        xpj3.x -= xj3.x;
        xpj3.y -= xj3.y;
        xpj3.z -= xj3.z;
#endif

        // Reference bond vectors and their squared lengths
        psA->rij1.x = xi.x - xj1.x;
        psA->rij1.y = xi.y - xj1.y;
        psA->rij1.z = xi.z - xj1.z;
        psA->rij2.x = xi.x - xj2.x;
        psA->rij2.y = xi.y - xj2.y;
        psA->rij2.z = xi.z - xj2.z;
        psA->rij3.x = xi.x - xj3.x;
        psA->rij3.y = xi.y - xj3.y;
        psA->rij3.z = xi.z - xj3.z;
        psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
        psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
        psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
        float ld1   = psA->d2 - psA->rij1sq;
        float ld2   = psA->d2 - psA->rij2sq;
        float ld3   = psA->d2 - psA->rij3sq;

        bool converged = false;
        for (int iteration = 0; iteration < 15 && !converged; iteration++)
        {
            // Cleared again by any constraint that still needs a correction
            converged = true;

            // Constraint with partner 1
            float3 rpij;
            rpij.x = xpi.x - xpj1.x;
            rpij.y = xpi.y - xpj1.y;
            rpij.z = xpi.z - xpj1.z;
            float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
            float rrpr   = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
            float diff   = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
            if (diff >= 1.0f)
            {
                float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
                float3 dr;
                dr.x = psA->rij1.x * acor;
                dr.y = psA->rij1.y * acor;
                dr.z = psA->rij1.z * acor;
                xpi.x  += dr.x * psA->InvMassI;
                xpi.y  += dr.y * psA->InvMassI;
                xpi.z  += dr.z * psA->InvMassI;
                xpj1.x -= dr.x * invMassJ;
                xpj1.y -= dr.y * invMassJ;
                xpj1.z -= dr.z * invMassJ;
                converged = false;
            }

            // Constraint with partner 2, if present
            if (atomID.z != -1)
            {
                rpij.x = xpi.x - xpj2.x;
                rpij.y = xpi.y - xpj2.y;
                rpij.z = xpi.z - xpj2.z;
                rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
                rrpr   = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
                diff   = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
                if (diff >= 1.0f)
                {
                    float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
                    float3 dr;
                    dr.x = psA->rij2.x * acor;
                    dr.y = psA->rij2.y * acor;
                    dr.z = psA->rij2.z * acor;
                    xpi.x  += dr.x * psA->InvMassI;
                    xpi.y  += dr.y * psA->InvMassI;
                    xpi.z  += dr.z * psA->InvMassI;
                    xpj2.x -= dr.x * invMassJ;
                    xpj2.y -= dr.y * invMassJ;
                    xpj2.z -= dr.z * invMassJ;
                    converged = false;
                }
            }

            // Constraint with partner 3, if present
            if (atomID.w != -1)
            {
                rpij.x = xpi.x - xpj3.x;
                rpij.y = xpi.y - xpj3.y;
                rpij.z = xpi.z - xpj3.z;
                rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
                rrpr   = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
                diff   = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
                if (diff >= 1.0f)
                {
                    float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
                    float3 dr;
                    dr.x = psA->rij3.x * acor;
                    dr.y = psA->rij3.y * acor;
                    dr.z = psA->rij3.z * acor;
                    xpi.x  += dr.x * psA->InvMassI;
                    xpi.y  += dr.y * psA->InvMassI;
                    xpi.z  += dr.z * psA->InvMassI;
                    xpj3.x -= dr.x * invMassJ;
                    xpj3.y -= dr.y * invMassJ;
                    xpj3.z -= dr.z * invMassJ;
                    converged = false;
                }
            }
        }

#ifndef DeltaShake
        // Convert the displacements back to absolute positions
        xpi.x  += xi.x;
        xpi.y  += xi.y;
        xpi.z  += xi.z;
        xpj1.x += xj1.x;
        xpj1.y += xj1.y;
        xpj1.z += xj1.z;
        xpj2.x += xj2.x;
        xpj2.y += xj2.y;
        xpj2.z += xj2.z;
        xpj3.x += xj3.x;
        xpj3.y += xj3.y;
        xpj3.z += xj3.z;
#endif

        // Store the corrected values for every atom that takes part
        cSim.pPosqP[atomID.x] = xpi;
        cSim.pPosqP[atomID.y] = xpj1;
        if (atomID.z != -1)
            cSim.pPosqP[atomID.z] = xpj2;
        if (atomID.w != -1)
            cSim.pPosqP[atomID.w] = xpj3;
    }
}
// Host-side launcher for kApplyFirstShake_kernel.
//
// Nothing is launched unless gpu->sim.ShakeConstraints is greater than zero.
// Otherwise the kernel runs with gpu->sim.blocks blocks of
// gpu->sim.shake_threads_per_block threads, followed by LAUNCHERROR.
void kApplyFirstShake(gpuContext gpu)
{
//    printf("kApplyFirstShake\n");
    if (!(gpu->sim.ShakeConstraints > 0))
        return;

    kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
    LAUNCHERROR("kApplyFirstShake");
}
// Second half of the integrator update, without centre-of-mass bookkeeping.
//
// For every atom (global thread index, advancing by the total thread count,
// below cSim.atoms) the kernel
//   * recomputes the velocity from the constrained trial position in
//     cSim.pPosqP (with DeltaShake defined that value is used directly,
//     otherwise the current position is subtracted first),
//   * builds a new xVector from the second pair of random streams,
//   * adds xVector - Xmh to the trial position and stores the result in
//     cSim.pPosq, and
//   * stores the new velocity and xVector.
// Afterwards thread 0 of each block advances the block's entry in
// cSim.pRandomPosition by cSim.paddedNumberOfAtoms, wrapping by cSim.randoms.
// The .w component of cSim.pVelm4 is used as the inverse mass.
__global__ void kUpdatePart2_kernel()
{
    unsigned int pos  = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    // Every thread of the block has read rpos before thread 0 can overwrite it below
    __syncthreads();

    while (pos < cSim.atoms)
    {
        float4 velocity   = cSim.pVelm4[pos];
#ifndef DeltaShake
        float4 apos       = cSim.pPosq[pos];
#endif
        float4 xPrime     = cSim.pPosqP[pos];
        float4 vVector    = cSim.pvVector4[pos];
        float4 random4b   = cSim.pRandom4b[rpos + pos];
        float2 random2b   = cSim.pRandom2b[rpos + pos];
        float sqrtInvMass = sqrt(velocity.w);

#ifdef DeltaShake
        velocity.x = xPrime.x * cSim.oneOverFix1;
        velocity.y = xPrime.y * cSim.oneOverFix1;
        velocity.z = xPrime.z * cSim.oneOverFix1;
#else
        velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
        velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
        velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif

        float3 Xmh;
        Xmh.x = vVector.x * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.x;
        Xmh.y = vVector.y * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.y;
        Xmh.z = vVector.z * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.z;

        float4 xVector;
        xVector.x = sqrtInvMass * random4b.w;
        xVector.y = sqrtInvMass * random2b.x;
        xVector.z = sqrtInvMass * random2b.y;
        // The whole float4 is stored below, so give the unused lane a defined
        // value (as kUpdatePart1_kernel does for vVector.w).
        xVector.w = 0.0f;

        xPrime.x += xVector.x - Xmh.x;
        xPrime.y += xVector.y - Xmh.y;
        xPrime.z += xVector.z - Xmh.z;

        cSim.pPosq[pos]     = xPrime;
        cSim.pVelm4[pos]    = velocity;
        cSim.pxVector4[pos] = xVector;
        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }
}
// Second half of the integrator update, with centre-of-mass bookkeeping.
//
// Performs the same per-atom work as kUpdatePart2_kernel and additionally
// accumulates mass * velocity for the atoms handled by each thread.  The
// per-thread sums are scaled by cSim.inverseTotalMass, reduced across the
// block in shared memory, and thread 0 stores the block total in
// cSim.pLinearMomentum[blockIdx.x] (with .w set to zero).  The launch must
// provide one float3 of dynamic shared memory per thread.
__global__ void kUpdatePart2CM_kernel()
{
    extern __shared__ float3 sCM[];
    unsigned int pos  = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    float3 CM         = {0.0f, 0.0f, 0.0f};
    // Every thread of the block has read rpos before thread 0 can overwrite it below
    __syncthreads();

    while (pos < cSim.atoms)
    {
        float4 velocity   = cSim.pVelm4[pos];
#ifndef DeltaShake
        float4 apos       = cSim.pPosq[pos];
#endif
        float4 xPrime     = cSim.pPosqP[pos];
        float4 vVector    = cSim.pvVector4[pos];
        float4 random4b   = cSim.pRandom4b[rpos + pos];
        float2 random2b   = cSim.pRandom2b[rpos + pos];
        float mass        = 1.0f / velocity.w;
        float sqrtInvMass = sqrt(velocity.w);

#ifdef DeltaShake
        velocity.x = xPrime.x * cSim.oneOverFix1;
        velocity.y = xPrime.y * cSim.oneOverFix1;
        velocity.z = xPrime.z * cSim.oneOverFix1;
#else
        velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
        velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
        velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif

        // Accumulate this atom's momentum
        CM.x += mass * velocity.x;
        CM.y += mass * velocity.y;
        CM.z += mass * velocity.z;

        float3 Xmh;
        Xmh.x = vVector.x * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.x;
        Xmh.y = vVector.y * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.y;
        Xmh.z = vVector.z * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.z;

        float4 xVector;
        xVector.x = sqrtInvMass * random4b.w;
        xVector.y = sqrtInvMass * random2b.x;
        xVector.z = sqrtInvMass * random2b.y;
        // The whole float4 is stored below, so give the unused lane a defined
        // value (as the Part1 kernels do for vVector.w).
        xVector.w = 0.0f;

        xPrime.x += xVector.x - Xmh.x;
        xPrime.y += xVector.y - Xmh.y;
        xPrime.z += xVector.z - Xmh.z;

        cSim.pPosq[pos]     = xPrime;
        cSim.pVelm4[pos]    = velocity;
        cSim.pxVector4[pos] = xVector;
        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }

    // Scale CM
    CM.x *= cSim.inverseTotalMass;
    CM.y *= cSim.inverseTotalMass;
    CM.z *= cSim.inverseTotalMass;
    sCM[threadIdx.x] = CM;
    __syncthreads();

    // Reduce CM for CTA: after the loop sCM[0] holds the block total
    unsigned int offset = 1;
    unsigned int mask   = 1;
    while (offset < blockDim.x)
    {
        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        mask = 2 * mask + 1;
        offset *= 2;
        __syncthreads();
    }

    // Publish the block total (named distinctly so it does not shadow CM)
    if (threadIdx.x == 0)
    {
        float4 blockCM;
        blockCM.x = sCM[0].x;
        blockCM.y = sCM[0].y;
        blockCM.z = sCM[0].z;
        blockCM.w = 0.0f;
        cSim.pLinearMomentum[blockIdx.x] = blockCM;
    }
}
extern void kGenerateRandoms(gpuContext gpu);

// Host-side driver for the second half of the integrator update.
//
// When gpu->bCalculateCM is set, the centre-of-mass variant of the kernel is
// launched (with one float3 of dynamic shared memory per thread), the flag is
// cleared and gpu->bRemoveCM is set so that the next kUpdatePart1 call applies
// the correction.  Otherwise the plain kernel is launched.  Both launches use
// gpu->sim.blocks blocks of gpu->sim.update_threads_per_block threads and are
// followed by LAUNCHERROR.
//
// A function-local static counter tracks the number of calls; once it reaches
// gpu->sim.randomIterations the random streams are regenerated and the counter
// is reset.
void kUpdatePart2(gpuContext gpu)
{
//    printf("kUpdatePart2\n");
    if (gpu->bCalculateCM)
    {
        kUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
        LAUNCHERROR("kUpdatePart2CM");
        gpu->bCalculateCM = false;
        gpu->bRemoveCM = true;

        // Disabled debugging aid: compare host and device momentum sums.
#if 0
        gpu->psLinearMomentum->Download();
        gpu->psVelm4->Download();
        float3 mv = {0.0f, 0.0f, 0.0f};
        for (int i = 0; i < gpu->natoms; i++)
        {
            float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
            mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
            mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
            mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
        }
        mv.x *= gpu->sim.inverseTotalMass;
        mv.y *= gpu->sim.inverseTotalMass;
        mv.z *= gpu->sim.inverseTotalMass;
        float3 mv1 = {0.0f, 0.0f, 0.0f};
        for (int i = 0; i < gpu->sim.blocks; i++)
        {
            mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
            mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
            mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
        }
        printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
#endif
    }
    else
    {
        kUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kUpdatePart2");
    }

    // Update randoms if necessary.
    // ">=" rather than "==": should the counter ever pass the threshold (for
    // example because randomIterations was lowered while the counter was
    // running), an equality test would never fire again and the random
    // streams would be reused indefinitely.
    static int iteration = 0;
    iteration++;
    if (iteration >= gpu->sim.randomIterations)
    {
        kGenerateRandoms(gpu);
        iteration = 0;
    }
}
/**
 * Second SHAKE pass over the updated positions in cSim.pPosq.
 *
 * Each entry of cSim.pShakeID describes one cluster: a central atom (.x) and
 * up to three partner atoms (.y, .z, .w).  A value of -1 in .z or .w marks an
 * unused partner slot.  The matching cSim.pShakeParameter entry supplies
 *   .x  -> Atom::InvMassI (scales corrections applied to the central atom),
 *   .y  -> Atom::M        (factor applied to every correction),
 *   .z  -> Atom::d2       (target squared separation),
 *   .w  -> invMassJ       (scales corrections applied to each partner).
 * The reference geometry comes from cSim.pOldPosq.
 *
 * When DeltaShake is not defined the loaded positions are first converted to
 * displacements from the reference geometry; otherwise they are used exactly
 * as loaded.  In both cases the reference position is added back before the
 * result is stored to cSim.pPosq.
 *
 * At most 15 sweeps are made per cluster.  Sweeping stops early once every
 * active constraint satisfies |error| / (d2 * cSim.shakeTolerance) < 1.
 */
__global__ void kApplySecondShake_kernel()
{
    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
    Atom* work = &sA[threadIdx.x];              // per-thread scratch slot
    const unsigned int stride = blockDim.x * gridDim.x;

    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; pos < cSim.ShakeConstraints; pos += stride)
    {
        const int4   ids   = cSim.pShakeID[pos];
        const float4 prm   = cSim.pShakeParameter[pos];
        const bool   hasJ2 = (ids.z != -1);
        const bool   hasJ3 = (ids.w != -1);

        // Central atom and first partner are always present.
        const float4 oldI  = cSim.pOldPosq[ids.x];
        float4       posI  = cSim.pPosq[ids.x];
        const float4 oldJ1 = cSim.pOldPosq[ids.y];
        float4       posJ1 = cSim.pPosq[ids.y];

        // Optional partners default to zero when their slot is unused.
        float4 oldJ2 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 posJ2 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 oldJ3 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 posJ3 = {0.0f, 0.0f, 0.0f, 0.0f};

        work->InvMassI = prm.x;
        work->M        = prm.y;
        work->d2       = prm.z;
        const float invMassJ = prm.w;

        if (hasJ2)
        {
            oldJ2 = cSim.pOldPosq[ids.z];
            posJ2 = cSim.pPosq[ids.z];
        }
        if (hasJ3)
        {
            oldJ3 = cSim.pOldPosq[ids.w];
            posJ3 = cSim.pPosq[ids.w];
        }

        // Reference (pre-update) coordinates without the fourth component.
        const float3 refI  = {oldI.x,  oldI.y,  oldI.z};
        const float3 refJ1 = {oldJ1.x, oldJ1.y, oldJ1.z};
        const float3 refJ2 = {oldJ2.x, oldJ2.y, oldJ2.z};
        const float3 refJ3 = {oldJ3.x, oldJ3.y, oldJ3.z};

#ifndef DeltaShake
        // Work with displacements relative to the reference geometry.
        posI.x  -= refI.x;
        posI.y  -= refI.y;
        posI.z  -= refI.z;
        posJ1.x -= refJ1.x;
        posJ1.y -= refJ1.y;
        posJ1.z -= refJ1.z;
        posJ2.x -= refJ2.x;
        posJ2.y -= refJ2.y;
        posJ2.z -= refJ2.z;
        posJ3.x -= refJ3.x;
        posJ3.y -= refJ3.y;
        posJ3.z -= refJ3.z;
#endif

        // Reference separation vectors from the central atom to each partner.
        work->rij1.x = refI.x - refJ1.x;
        work->rij1.y = refI.y - refJ1.y;
        work->rij1.z = refI.z - refJ1.z;
        work->rij2.x = refI.x - refJ2.x;
        work->rij2.y = refI.y - refJ2.y;
        work->rij2.z = refI.z - refJ2.z;
        work->rij3.x = refI.x - refJ3.x;
        work->rij3.y = refI.y - refJ3.y;
        work->rij3.z = refI.z - refJ3.z;

        work->rij1sq = work->rij1.x * work->rij1.x + work->rij1.y * work->rij1.y + work->rij1.z * work->rij1.z;
        work->rij2sq = work->rij2.x * work->rij2.x + work->rij2.y * work->rij2.y + work->rij2.z * work->rij2.z;
        work->rij3sq = work->rij3.x * work->rij3.x + work->rij3.y * work->rij3.y + work->rij3.z * work->rij3.z;

        // Difference between the target and the reference squared separation.
        const float gap1 = work->d2 - work->rij1sq;
        const float gap2 = work->d2 - work->rij2sq;
        const float gap3 = work->d2 - work->rij3sq;

        bool converged = false;
        for (int sweep = 0; sweep < 15 && !converged; sweep++)
        {
            converged = true;

            // ---- partner 1 ----
            float3 rp;
            rp.x = posI.x - posJ1.x;
            rp.y = posI.y - posJ1.y;
            rp.z = posI.z - posJ1.z;
            float rpSq   = rp.x * rp.x + rp.y * rp.y + rp.z * rp.z;
            float rDotRp = work->rij1.x * rp.x + work->rij1.y * rp.y + work->rij1.z * rp.z;
            float err    = gap1 - 2.0f * rDotRp - rpSq;
            float ratio  = fabs(err) / (work->d2 * cSim.shakeTolerance);
            if (ratio >= 1.0f)
            {
                const float  acor = err * work->M / (rDotRp + work->rij1sq);
                const float3 dr   = {work->rij1.x * acor, work->rij1.y * acor, work->rij1.z * acor};
                posI.x  += dr.x * work->InvMassI;
                posI.y  += dr.y * work->InvMassI;
                posI.z  += dr.z * work->InvMassI;
                posJ1.x -= dr.x * invMassJ;
                posJ1.y -= dr.y * invMassJ;
                posJ1.z -= dr.z * invMassJ;
                converged = false;
            }

            // ---- partner 2 (optional) ----
            if (hasJ2)
            {
                rp.x = posI.x - posJ2.x;
                rp.y = posI.y - posJ2.y;
                rp.z = posI.z - posJ2.z;
                rpSq   = rp.x * rp.x + rp.y * rp.y + rp.z * rp.z;
                rDotRp = work->rij2.x * rp.x + work->rij2.y * rp.y + work->rij2.z * rp.z;
                err    = gap2 - 2.0f * rDotRp - rpSq;
                ratio  = fabs(err) / (work->d2 * cSim.shakeTolerance);
                if (ratio >= 1.0f)
                {
                    const float  acor = err * work->M / (rDotRp + work->rij2sq);
                    const float3 dr   = {work->rij2.x * acor, work->rij2.y * acor, work->rij2.z * acor};
                    posI.x  += dr.x * work->InvMassI;
                    posI.y  += dr.y * work->InvMassI;
                    posI.z  += dr.z * work->InvMassI;
                    posJ2.x -= dr.x * invMassJ;
                    posJ2.y -= dr.y * invMassJ;
                    posJ2.z -= dr.z * invMassJ;
                    converged = false;
                }
            }

            // ---- partner 3 (optional) ----
            if (hasJ3)
            {
                rp.x = posI.x - posJ3.x;
                rp.y = posI.y - posJ3.y;
                rp.z = posI.z - posJ3.z;
                rpSq   = rp.x * rp.x + rp.y * rp.y + rp.z * rp.z;
                rDotRp = work->rij3.x * rp.x + work->rij3.y * rp.y + work->rij3.z * rp.z;
                err    = gap3 - 2.0f * rDotRp - rpSq;
                ratio  = fabs(err) / (work->d2 * cSim.shakeTolerance);
                if (ratio >= 1.0f)
                {
                    const float  acor = err * work->M / (rDotRp + work->rij3sq);
                    const float3 dr   = {work->rij3.x * acor, work->rij3.y * acor, work->rij3.z * acor};
                    posI.x  += dr.x * work->InvMassI;
                    posI.y  += dr.y * work->InvMassI;
                    posI.z  += dr.z * work->InvMassI;
                    posJ3.x -= dr.x * invMassJ;
                    posJ3.y -= dr.y * invMassJ;
                    posJ3.z -= dr.z * invMassJ;
                    converged = false;
                }
            }
        }

        // Add the reference position back before storing.
        posI.x  += refI.x;
        posI.y  += refI.y;
        posI.z  += refI.z;
        posJ1.x += refJ1.x;
        posJ1.y += refJ1.y;
        posJ1.z += refJ1.z;
        posJ2.x += refJ2.x;
        posJ2.y += refJ2.y;
        posJ2.z += refJ2.z;
        posJ3.x += refJ3.x;
        posJ3.y += refJ3.y;
        posJ3.z += refJ3.z;

        cSim.pPosq[ids.x] = posI;
        cSim.pPosq[ids.y] = posJ1;
        if (hasJ2)
            cSim.pPosq[ids.z] = posJ2;
        if (hasJ3)
            cSim.pPosq[ids.w] = posJ3;
    }
}
/**
 * For every atom index listed in cSim.pNonShakeID, adds the x/y/z of the
 * atom's cSim.pOldPosq entry to its cSim.pPosq entry and stores the result
 * back to cSim.pPosq.  The fourth component of pPosq is carried through
 * unchanged.
 */
__global__ void kApplyNoShake_kernel()
{
    const unsigned int stride = blockDim.x * gridDim.x;

    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; pos < cSim.NonShakeConstraints; pos += stride)
    {
        const int    atom     = cSim.pNonShakeID[pos];
        const float4 previous = cSim.pOldPosq[atom];
        float4       current  = cSim.pPosq[atom];

        current.x += previous.x;
        current.y += previous.y;
        current.z += previous.z;

        cSim.pPosq[atom] = current;
    }
}
/**
 * Empty stub: takes the GPU context and does nothing.  The only visible
 * reference to it is a commented-out call in kApplySecondShake().
 */
void kCPUShake2(gpuContext gpu)
{
    // Intentionally empty.
}
/**
 * Host-side launcher for the second SHAKE pass.
 *
 * Launches kApplySecondShake_kernel when gpu->sim.ShakeConstraints is
 * positive.  When DeltaShake is defined it also launches kApplyNoShake_kernel
 * for the atoms counted by gpu->sim.NonShakeConstraints.  Each launch is
 * followed by the LAUNCHERROR check.
 *
 * @param gpu  GPU context whose `sim` member supplies the launch geometry.
 */
void kApplySecondShake(gpuContext gpu)
{
    if (gpu->sim.ShakeConstraints > 0)
    {
        kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
        LAUNCHERROR("kApplySecondShake");
    }

#ifdef DeltaShake
    // Atoms that are not part of any SHAKE cluster are handled separately.
    if (gpu->sim.NonShakeConstraints > 0)
    {
        kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
        LAUNCHERROR("kApplyNoShake");
    }
#endif
}
platforms/cuda/src/kernels/kVerletUpdate.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// When DeltaShake is defined, the Part1 kernels in this file store the
// per-step displacement (velocity * deltaT) in pPosqP instead of an absolute
// position, and the Part2 kernels add the current pPosq position back in.
#define DeltaShake

// Constant-memory copy of the simulation description read by the kernels in
// this file.  SetVerletUpdateSim() fills it with cudaMemcpyToSymbol and
// GetVerletUpdateSim() reads it back with cudaMemcpyFromSymbol.
static __constant__ cudaGmxSimulation cSim;
/**
 * Uploads gpu->sim into this file's constant-memory symbol cSim.
 * The result of the copy is handed to RTERROR for checking.
 *
 * @param gpu  GPU context whose `sim` member is the copy source.
 */
void SetVerletUpdateSim(gpuContext gpu)
{
    const cudaError_t result = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(result, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
/**
 * Downloads this file's constant-memory symbol cSim back into gpu->sim.
 * The result of the copy is handed to RTERROR for checking.
 *
 * @param gpu  GPU context whose `sim` member receives the copy.
 */
void GetVerletUpdateSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message previously said "SetSim", a copy-paste slip from
    // SetVerletUpdateSim() that misreported which direction failed.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
/**
 * First half of the Verlet step (no centre-of-mass correction).
 *
 * For every atom:
 *   - saves the current pPosq entry to pOldPosq;
 *   - advances the velocity by deltaT * velocity.w * force, where the .w
 *     component of pVelm4 is the factor the code names "dtOverMass" after
 *     multiplying by deltaT;
 *   - writes to pPosqP either the advanced position (DeltaShake undefined)
 *     or only the displacement velocity * deltaT (DeltaShake defined);
 *   - stores the new velocity back to pVelm4.
 */
__global__ void kVerletUpdatePart1_kernel()
{
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
    __syncthreads();

    for (; atom < cSim.atoms; atom += stride)
    {
        float4       posq       = cSim.pPosq[atom];
        float4       velm       = cSim.pVelm4[atom];
        const float4 frc        = cSim.pForce4[atom];
        const float  dtOverMass = cSim.deltaT * velm.w;

        cSim.pOldPosq[atom] = posq;

        velm.x += dtOverMass * frc.x;
        velm.y += dtOverMass * frc.y;
        velm.z += dtOverMass * frc.z;

#ifndef DeltaShake
        posq.x += velm.x * cSim.deltaT;
        posq.y += velm.y * cSim.deltaT;
        posq.z += velm.z * cSim.deltaT;
#else
        posq.x = velm.x * cSim.deltaT;
        posq.y = velm.y * cSim.deltaT;
        posq.z = velm.z * cSim.deltaT;
#endif

        cSim.pPosqP[atom] = posq;
        cSim.pVelm4[atom] = velm;
    }
}
/**
 * First half of the Verlet step with a per-atom velocity correction.
 *
 * Stage 1: every block sums the first gridDim.x entries of
 *          cSim.pLinearMomentum (x/y/z only) into dynamic shared memory and
 *          reduces them so that sCM[0] holds the total.
 * Stage 2: the same update as kVerletUpdatePart1_kernel, except that sCM[0]
 *          is subtracted from each velocity increment.
 *
 * Requires blockDim.x * sizeof(float3) bytes of dynamic shared memory.
 */
__global__ void kVerletUpdatePart1CM_kernel()
{
    extern __shared__ float3 sCM[];
    const unsigned int tid = threadIdx.x;

    // Stage 1a: each thread accumulates a strided subset of the per-block values.
    float3 partial = {0.0f, 0.0f, 0.0f};
    for (unsigned int block = tid; block < gridDim.x; block += blockDim.x)
    {
        const float4 entry = cSim.pLinearMomentum[block];
        partial.x += entry.x;
        partial.y += entry.y;
        partial.z += entry.z;
    }
    sCM[tid].x = partial.x;
    sCM[tid].y = partial.y;
    sCM[tid].z = partial.z;
    __syncthreads();

    // Stage 1b: pairwise reduction in shared memory; the total ends up in sCM[0].
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        if (((tid & mask) == 0) && (tid + offset < blockDim.x))
        {
            sCM[tid].x += sCM[tid + offset].x;
            sCM[tid].y += sCM[tid + offset].y;
            sCM[tid].z += sCM[tid + offset].z;
        }
        __syncthreads();
    }

    // Stage 2: per-atom update.
    const unsigned int stride = blockDim.x * gridDim.x;
    for (unsigned int atom = tid + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
    {
        float4       posq       = cSim.pPosq[atom];
        float4       velm       = cSim.pVelm4[atom];
        const float4 frc        = cSim.pForce4[atom];
        const float  dtOverMass = cSim.deltaT * velm.w;

        cSim.pOldPosq[atom] = posq;

        velm.x += dtOverMass * frc.x - sCM[0].x;
        velm.y += dtOverMass * frc.y - sCM[0].y;
        velm.z += dtOverMass * frc.z - sCM[0].z;

#ifndef DeltaShake
        posq.x += velm.x * cSim.deltaT;
        posq.y += velm.y * cSim.deltaT;
        posq.z += velm.z * cSim.deltaT;
#else
        posq.x = velm.x * cSim.deltaT;
        posq.y = velm.y * cSim.deltaT;
        posq.z = velm.z * cSim.deltaT;
#endif

        cSim.pPosqP[atom] = posq;
        cSim.pVelm4[atom] = velm;
    }
}
/**
 * Host-side launcher for the first half of the Verlet step.
 *
 * If gpu->bRemoveCM is set, launches kVerletUpdatePart1CM_kernel with
 * update_threads_per_block * sizeof(float3) bytes of dynamic shared memory
 * and then clears the flag.  Otherwise launches kVerletUpdatePart1_kernel.
 * Each launch is followed by the LAUNCHERROR check.
 *
 * @param gpu  GPU context supplying launch geometry and the bRemoveCM flag.
 */
void kVerletUpdatePart1(gpuContext gpu)
{
    if (!gpu->bRemoveCM)
    {
        kVerletUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kVerletUpdatePart1");
        return;
    }

    kVerletUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
    LAUNCHERROR("kVerletUpdatePart1CM");
    gpu->bRemoveCM = false;
}
/**
 * Second half of the Verlet step (no momentum accumulation).
 *
 * For every atom, derives the velocity from the pPosqP entry and stores the
 * final position:
 *   - DeltaShake undefined: velocity = oneOverDeltaT * (pPosqP - pPosq) and
 *     pPosqP is written to pPosq unchanged;
 *   - DeltaShake defined:   velocity = oneOverDeltaT * pPosqP, then pPosq is
 *     added to pPosqP before it is written to pPosq.
 * The .w component of the velocity is left as loaded.
 */
__global__ void kVerletUpdatePart2_kernel()
{
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
    __syncthreads();

    for (; atom < cSim.atoms; atom += stride)
    {
        float4       velm  = cSim.pVelm4[atom];
        const float4 posq  = cSim.pPosq[atom];
        float4       moved = cSim.pPosqP[atom];

#ifndef DeltaShake
        velm.x = cSim.oneOverDeltaT * (moved.x - posq.x);
        velm.y = cSim.oneOverDeltaT * (moved.y - posq.y);
        velm.z = cSim.oneOverDeltaT * (moved.z - posq.z);
#else
        velm.x = cSim.oneOverDeltaT * (moved.x);
        velm.y = cSim.oneOverDeltaT * (moved.y);
        velm.z = cSim.oneOverDeltaT * (moved.z);
        moved.x += posq.x;
        moved.y += posq.y;
        moved.z += posq.z;
#endif

        cSim.pPosq[atom]  = moved;
        cSim.pVelm4[atom] = velm;
    }
}
/**
 * Second half of the Verlet step that also accumulates mass-weighted
 * velocity.
 *
 * Performs the same per-atom work as kVerletUpdatePart2_kernel and, in
 * addition, sums (1 / velocity.w) * velocity over the atoms each thread
 * visits.  Each thread's sum is scaled by cSim.inverseTotalMass, reduced
 * across the block in dynamic shared memory, and thread 0 writes the block
 * total (with .w = 0) to cSim.pLinearMomentum[blockIdx.x].
 *
 * Requires blockDim.x * sizeof(float3) bytes of dynamic shared memory.
 */
__global__ void kVerletUpdatePart2CM_kernel()
{
    extern __shared__ float3 sCM[];
    const unsigned int tid    = threadIdx.x;
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int atom = tid + blockIdx.x * blockDim.x;
    float3 momentum = {0.0f, 0.0f, 0.0f};
    __syncthreads();

    for (; atom < cSim.atoms; atom += stride)
    {
        float4       velm  = cSim.pVelm4[atom];
        const float4 posq  = cSim.pPosq[atom];
        float4       moved = cSim.pPosqP[atom];
        const float  mass  = 1.0f / velm.w;

#ifndef DeltaShake
        velm.x = cSim.oneOverDeltaT * (moved.x - posq.x);
        velm.y = cSim.oneOverDeltaT * (moved.y - posq.y);
        velm.z = cSim.oneOverDeltaT * (moved.z - posq.z);
#else
        velm.x = cSim.oneOverDeltaT * (moved.x);
        velm.y = cSim.oneOverDeltaT * (moved.y);
        velm.z = cSim.oneOverDeltaT * (moved.z);
        moved.x += posq.x;
        moved.y += posq.y;
        moved.z += posq.z;
#endif

        momentum.x += mass * velm.x;
        momentum.y += mass * velm.y;
        momentum.z += mass * velm.z;

        cSim.pPosq[atom]  = moved;
        cSim.pVelm4[atom] = velm;
    }

    // Scale this thread's contribution before the block-wide reduction.
    momentum.x *= cSim.inverseTotalMass;
    momentum.y *= cSim.inverseTotalMass;
    momentum.z *= cSim.inverseTotalMass;
    sCM[tid] = momentum;
    __syncthreads();

    // Pairwise reduction in shared memory; the block total ends up in sCM[0].
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        if (((tid & mask) == 0) && (tid + offset < blockDim.x))
        {
            sCM[tid].x += sCM[tid + offset].x;
            sCM[tid].y += sCM[tid + offset].y;
            sCM[tid].z += sCM[tid + offset].z;
        }
        __syncthreads();
    }

    // One thread per block publishes the block total.
    if (tid == 0)
    {
        float4 blockTotal;
        blockTotal.x = sCM[0].x;
        blockTotal.y = sCM[0].y;
        blockTotal.z = sCM[0].z;
        blockTotal.w = 0.0f;
        cSim.pLinearMomentum[blockIdx.x] = blockTotal;
    }
}
/**
 * Host-side launcher for the second half of the Verlet step.
 *
 * If gpu->bCalculateCM is set, launches kVerletUpdatePart2CM_kernel with
 * update_threads_per_block * sizeof(float3) bytes of dynamic shared memory,
 * then clears bCalculateCM and sets bRemoveCM.  Otherwise launches
 * kVerletUpdatePart2_kernel.  Each launch is followed by the LAUNCHERROR
 * check.
 *
 * @param gpu  GPU context supplying launch geometry and the CM flags.
 */
void kVerletUpdatePart2(gpuContext gpu)
{
    if (!gpu->bCalculateCM)
    {
        kVerletUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kVerletUpdatePart2");
        return;
    }

    kVerletUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
    LAUNCHERROR("kVerletUpdatePart2CM");
    gpu->bCalculateCM = false;
    gpu->bRemoveCM    = true;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment