Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
38f6c8f8
Commit
38f6c8f8
authored
Jan 27, 2009
by
Peter Eastman
Browse files
Checked in Cuda code
parent
95d79181
Changes
20
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
9174 additions
and
0 deletions
+9174
-0
platforms/cuda/src/kernels/cudaKernels.h
platforms/cuda/src/kernels/cudaKernels.h
+98
-0
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+381
-0
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+2707
-0
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+277
-0
platforms/cuda/src/kernels/kBrownianUpdate.cu
platforms/cuda/src/kernels/kBrownianUpdate.cu
+152
-0
platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
+105
-0
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+388
-0
platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
+375
-0
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+454
-0
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
...forms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
+422
-0
platforms/cuda/src/kernels/kCalculateLocalForces.cu
platforms/cuda/src/kernels/kCalculateLocalForces.cu
+495
-0
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+301
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
+399
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
+225
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+362
-0
platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
+336
-0
platforms/cuda/src/kernels/kForces.cu
platforms/cuda/src/kernels/kForces.cu
+261
-0
platforms/cuda/src/kernels/kRandom.cu
platforms/cuda/src/kernels/kRandom.cu
+185
-0
platforms/cuda/src/kernels/kUpdateShakeH.cu
platforms/cuda/src/kernels/kUpdateShakeH.cu
+959
-0
platforms/cuda/src/kernels/kVerletUpdate.cu
platforms/cuda/src/kernels/kVerletUpdate.cu
+292
-0
No files found.
platforms/cuda/src/kernels/cudaKernels.h
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
// Host-callable entry points of the CUDA kernels. Every function takes the
// gpuContext handle (type brought in by the gputypes.h include above) and
// returns nothing. Function declarations have external linkage by default, so
// the explicit `extern` keyword is omitted.

// Initialization
void kClearForces(gpuContext gpu);
void kCalculateObcGbsaBornSum(gpuContext gpu);
void kReduceObcGbsaBornSum(gpuContext gpu);
void kGenerateRandoms(gpuContext gpu);

// Main loop
void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
void kCalculateCDLJForces(gpuContext gpu);
void kCalculateCDLJForces_12(gpuContext gpu);
void kCalculateObcGbsaForces1(gpuContext gpu);
void kCalculateObcGbsaForces1_12(gpuContext gpu);
void kReduceObcGbsaBornForces(gpuContext gpu);
void kCalculateObcGbsaForces2(gpuContext gpu);
void kCalculateObcGbsaForces2_12(gpuContext gpu);
void kCalculateLocalForces(gpuContext gpu);
void kCalculateAndersenThermostat(gpuContext gpu);
void kReduceBornSumAndForces(gpuContext gpu);
void kUpdatePart1(gpuContext gpu);
void kApplyFirstShake(gpuContext gpu);
void kUpdatePart2(gpuContext gpu);
void kApplySecondShake(gpuContext gpu);
void kVerletUpdatePart1(gpuContext gpu);
void kVerletUpdatePart2(gpuContext gpu);
void kBrownianUpdatePart1(gpuContext gpu);
void kBrownianUpdatePart2(gpuContext gpu);

// Extras
void kReduceForces(gpuContext gpu);
void kClearBornForces(gpuContext gpu);

// Initializers: one Set/Get pair per kernel source file.
// NOTE(review): presumably these copy the simulation constants to/from each
// kernel's device-side copy -- confirm against the .cu implementations.
void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
void SetCalculateCDLJForcesSim(gpuContext gpu);
void GetCalculateCDLJForcesSim(gpuContext gpu);
void SetCalculateCDLJForces_12Sim(gpuContext gpu);
void GetCalculateCDLJForces_12Sim(gpuContext gpu);
void SetCalculateLocalForcesSim(gpuContext gpu);
void GetCalculateLocalForcesSim(gpuContext gpu);
void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
void GetCalculateObcGbsaForces1Sim(gpuContext gpu);
void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
void SetCalculateAndersenThermostatSim(gpuContext gpu);
void GetCalculateAndersenThermostatSim(gpuContext gpu);
void SetForcesSim(gpuContext gpu);
void GetForcesSim(gpuContext gpu);
void SetUpdateShakeHSim(gpuContext gpu);
void GetUpdateShakeHSim(gpuContext gpu);
void SetVerletUpdateSim(gpuContext gpu);
void GetVerletUpdateSim(gpuContext gpu);
void SetBrownianUpdateSim(gpuContext gpu);
void GetBrownianUpdateSim(gpuContext gpu);
void SetRandomSim(gpuContext gpu);
void GetRandomSim(gpuContext gpu);
platforms/cuda/src/kernels/cudatypes.h
0 → 100755
View file @
38f6c8f8
#ifndef CUDATYPES_H
#define CUDATYPES_H
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdarg.h>
#include <limits>
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <vector_functions.h>
using
namespace
std
;
// Abort the program when a CUDA runtime call has failed.
//   status - cudaError_t result of the call being checked
//   s      - C string printed in front of the CUDA error description
// On failure prints "<s> <cuda error string>" to stdout and calls exit(-1).
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement: it can sit in an unbraced if/else without capturing the else.
// Always invoke it with a trailing semicolon. `status` is evaluated twice.
#define RTERROR(status, s) \
    do { \
        if ((status) != cudaSuccess) { \
            printf("%s %s\n", (s), cudaGetErrorString(status)); \
            exit(-1); \
        } \
    } while (0)
// Check the result of the most recent kernel launch and abort on failure.
//   s - C string naming the kernel, used in the error message
// Queries cudaGetLastError(); if it is not cudaSuccess, prints
// "Error: <cuda error string> launching kernel <s>" to stdout and calls
// exit(-1). Wrapped in do { } while (0) so the macro is a single statement and
// is safe in an unbraced if/else; invoke it with a trailing semicolon.
// The macro declares a local named `status`, so `s` must not refer to a
// caller variable of that name.
#define LAUNCHERROR(s) \
    do { \
        cudaError_t status = cudaGetLastError(); \
        if (status != cudaSuccess) { \
            printf("Error: %s launching kernel %s\n", cudaGetErrorString(status), (s)); \
            exit(-1); \
        } \
    } while (0)
// Pure virtual class to define an interface for objects resident both on GPU and CPU.
// Implementations provide the four operations below; CUDAStream<T> is the
// implementation visible in this header.
struct SoADeviceObject
{
    virtual void Allocate() = 0;      // acquire the object's storage
    virtual void Deallocate() = 0;    // release the object's storage
    virtual void Upload() = 0;        // copy host data to the device
    virtual void Download() = 0;      // copy device data to the host

    // Virtual destructor so that deleting an implementation through a
    // SoADeviceObject* runs the derived destructor (without it that delete is
    // undefined behaviour). Declared last to leave the existing virtual
    // functions in their original order.
    virtual ~SoADeviceObject() {}
};
// A buffer of T that exists both in host memory and in CUDA device memory,
// split into `_subStreams` equally sized sub-streams of `_stride` elements.
// Construction allocates both copies; destruction releases them.
// NOTE(review): the struct owns raw pointers and declares no copy constructor
// or assignment operator, so copying an instance would free the same buffers
// twice -- pass it by pointer or reference only.
template <typename T>
struct CUDAStream : public SoADeviceObject
{
    unsigned int _length;       // elements requested per sub-stream
    unsigned int _subStreams;   // number of sub-streams
    unsigned int _stride;       // elements allocated per sub-stream (_length rounded up to a multiple of 16)
    T**          _pSysStream;   // host pointers, one per sub-stream, into _pSysData
    T**          _pDevStream;   // device pointers, one per sub-stream, into _pDevData
    T*           _pSysData;     // host allocation holding all sub-streams back to back
    T*           _pDevData;     // device allocation holding all sub-streams back to back

    // One overload per signed/unsigned combination so that any mix of int and
    // unsigned int arguments resolves without ambiguity.
    CUDAStream(int length, int subStreams = 1);
    CUDAStream(unsigned int length, unsigned int subStreams = 1);
    CUDAStream(unsigned int length, int subStreams = 1);
    CUDAStream(int length, unsigned int subStreams = 1);
    virtual ~CUDAStream();

    // SoADeviceObject interface
    void Allocate();
    void Deallocate();
    void Upload();
    void Download();

    // Re-partition the host data into `newstreams` sub-streams.
    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
};
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably compares the contents of s1 and s2 within
// `tolerance`, looking at up to `maxindex` elements (0 by default) -- confirm
// the exact meaning of the return value and of maxindex == 0 at the definition.
float CompareStreams(CUDAStream<float>& s1,
                     CUDAStream<float>& s2,
                     float tolerance,
                     unsigned int maxindex = 0);
// Construct a stream of `subStreams` sub-streams with `length` elements each.
// The per-sub-stream stride is `length` rounded up to the next multiple of 16;
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(int length, unsigned int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    this->Allocate();
}
// Construct a stream of `subStreams` sub-streams with `length` elements each.
// The per-sub-stream stride is `length` rounded up to the next multiple of 16;
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    this->Allocate();
}
// Construct a stream of `subStreams` sub-streams with `length` elements each.
// The per-sub-stream stride is `length` rounded up to the next multiple of 16;
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    this->Allocate();
}
// Construct a stream of `subStreams` sub-streams with `length` elements each.
// The per-sub-stream stride is `length` rounded up to the next multiple of 16;
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(int length, int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0)
{
    this->Allocate();
}
// Release the host and device storage owned by the stream.
template <typename T>
CUDAStream<T>::~CUDAStream()
{
    this->Deallocate();
}
// Allocate the host and device buffers (_subStreams * _stride elements each)
// and the two per-sub-stream pointer tables, then point entry i of each table
// at offset i * _stride inside its buffer. A cudaMalloc failure terminates the
// program through RTERROR. The host buffer is not initialised.
template <typename T>
void CUDAStream<T>::Allocate()
{
    // Host side: pointer tables and the contiguous data block.
    _pSysStream = new T*[_subStreams];
    _pDevStream = new T*[_subStreams];
    _pSysData   = new T[_subStreams * _stride];

    // Device side: one contiguous block of the same element count.
    cudaError_t status = cudaMalloc((void**)&_pDevData, _stride * _subStreams * sizeof(T));
    RTERROR(status, "cudaMalloc CUDAStream::Allocate failed");

    // Walk both blocks in steps of one sub-stream to fill the pointer tables.
    T* sysCursor = _pSysData;
    T* devCursor = _pDevData;
    for (unsigned int s = 0; s < _subStreams; ++s)
    {
        _pSysStream[s] = sysCursor;
        _pDevStream[s] = devCursor;
        sysCursor += _stride;
        devCursor += _stride;
    }
}
// Free the pointer tables, the host buffer and the device buffer, and reset
// every owning pointer to NULL. Because all four pointers are cleared, a second
// call (for example an explicit Deallocate() followed by the destructor) is a
// harmless no-op: delete[] and cudaFree both accept a null pointer. A cudaFree
// failure terminates the program through RTERROR.
template <typename T>
void CUDAStream<T>::Deallocate()
{
    delete[] _pSysStream;
    _pSysStream = NULL;
    delete[] _pDevStream;
    _pDevStream = NULL;
    delete[] _pSysData;
    _pSysData = NULL;

    cudaError_t status = cudaFree(_pDevData);
    // Clear the device pointer as well; it used to be left dangling, so a
    // repeated Deallocate() passed a stale pointer to cudaFree and aborted.
    _pDevData = NULL;
    RTERROR(status, "cudaFree CUDAStream::Deallocate failed");
}
// Copy the whole host buffer (all sub-streams, padding included) to the device
// buffer. A cudaMemcpy failure terminates the program through RTERROR.
template <typename T>
void CUDAStream<T>::Upload()
{
    const size_t bytes = _stride * _subStreams * sizeof(T);
    const cudaError_t status = cudaMemcpy(_pDevData, _pSysData, bytes, cudaMemcpyHostToDevice);
    RTERROR(status, "cudaMemcpy CUDAStream::Upload failed");
}
// Copy the whole device buffer (all sub-streams, padding included) back into
// the host buffer. A cudaMemcpy failure terminates the program through RTERROR.
template <typename T>
void CUDAStream<T>::Download()
{
    const size_t bytes = _stride * _subStreams * sizeof(T);
    const cudaError_t status = cudaMemcpy(_pSysData, _pDevData, bytes, cudaMemcpyDeviceToHost);
    RTERROR(status, "cudaMemcpy CUDAStream::Download failed");
}
// Re-partition the host-side data from _subStreams sub-streams into
// `newstreams` sub-streams, keeping the total allocation unchanged.
//   newstreams - number of sub-streams after the call; must be non-zero (it is
//                used as a divisor)
//   interleave - accepted but not used by this implementation
// Elements are visited index by index across all old sub-streams (for each i,
// for each j: _pSysStream[j][i]) and dealt round-robin to the new sub-streams.
// Afterwards _stride, _length and _subStreams describe the new layout and both
// pointer tables are remapped. Only host memory is rearranged: no Upload() is
// issued here, so the device copy keeps its old contents until the caller
// uploads.
// NOTE(review): the pointer tables were sized for the original _subStreams in
// Allocate(); calling this with newstreams greater than that would write past
// them -- confirm callers only ever reduce the stream count.
template <typename T>
void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
{
    // Scratch copy large enough for the full allocation.
    T* pTemp = new T[_subStreams * _stride];
    unsigned int stream = 0;    // new sub-stream receiving the next element
    unsigned int pos = 0;       // write position inside each new sub-stream
    unsigned int newstride = _stride * _subStreams / newstreams;
    unsigned int newlength = _length * _subStreams / newstreams;

    // Copy data into new format
    for (unsigned int i = 0; i < _length; i++)
    {
        for (unsigned int j = 0; j < _subStreams; j++)
        {
            pTemp[stream * newstride + pos] = _pSysStream[j][i];
            stream++;
            // Wrapped around all new sub-streams: advance the write position.
            if (stream == newstreams)
            {
                stream = 0;
                pos++;
            }
        }
    }

    // Remap stream pointers
    for (unsigned int i = 0; i < newstreams; i++)
    {
        _pSysStream[i] = _pSysData + i * newstride;
        _pDevStream[i] = _pDevData + i * newstride;
    }

    // Copy data back into the original stream
    for (unsigned int i = 0; i < newlength; i++)
        for (unsigned int j = 0; j < newstreams; j++)
            _pSysStream[j][i] = pTemp[j * newstride + i];

    // Publish the new geometry only after the data has been moved.
    _stride = newstride;
    _length = newlength;
    _subStreams = newstreams;
    delete[] pTemp;
}
// Tile geometry: GRID == 1 << GRIDBITS.
static const int GRID     = 32;
static const int GRIDBITS = 5;

// Launch-size tuning constants, one G8X_ / GT2XX_ pair per kernel family.
// NOTE(review): the prefixes presumably name two GPU hardware generations;
// the code that selects between them is not in this header.
static const int G8X_NONBOND_THREADS_PER_BLOCK       = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK     = 320;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK    = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK  = 320;
static const int G8X_SHAKE_THREADS_PER_BLOCK         = 128;
static const int GT2XX_SHAKE_THREADS_PER_BLOCK       = 256;
static const int G8X_UPDATE_THREADS_PER_BLOCK        = 192;
static const int GT2XX_UPDATE_THREADS_PER_BLOCK      = 384;
static const int G8X_LOCALFORCES_THREADS_PER_BLOCK   = 192;
static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK = 384;
static const int G8X_THREADS_PER_BLOCK               = 256;
static const int GT2XX_THREADS_PER_BLOCK             = 256;
static const int G8X_RANDOM_THREADS_PER_BLOCK        = 256;
static const int GT2XX_RANDOM_THREADS_PER_BLOCK      = 384;
static const int G8X_NONBOND_WORKUNITS_PER_SM        = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM      = 256;
// Plain-data description of one simulation: launch geometry, physical and
// integrator constants, and raw pointers to the per-atom / per-interaction
// arrays. The pointer members are filled from CUDAStream::_pDevStream[0] by the
// gpuSet* functions in gpu.cpp (e.g. pBondID, pBondParameter).
// Field order is part of the layout -- do not reorder members.
struct cudaGmxSimulation {
    // Constants
    unsigned int atoms;                             // Number of atoms
    unsigned int paddedNumberOfAtoms;               // Padded number of atoms
    unsigned int blocks;                            // Number of blocks to launch across linear kernels
    unsigned int nonbond_blocks;                    // Number of blocks to launch across CDLJ and Born Force Part1
    unsigned int bornForce2_blocks;                 // Number of blocks to launch across Born Force 2
    unsigned int threads_per_block;                 // Threads per block to launch
    unsigned int nonbond_threads_per_block;         // Threads per block in nonbond kernel calls
    unsigned int bornForce2_threads_per_block;      // Threads per block in Born Force 2 kernel calls
    unsigned int max_update_threads_per_block;      // Maximum threads per block in update kernel calls
    unsigned int update_threads_per_block;          // Threads per block in update kernel calls
    unsigned int bf_reduce_threads_per_block;       // Threads per block in Born Force reduction calls
    unsigned int bsf_reduce_threads_per_block;      // Threads per block in Born Sum And Forces reduction calls
    unsigned int max_shake_threads_per_block;       // Maximum threads per block in shake kernel calls
    unsigned int shake_threads_per_block;           // Threads per block in shake kernel calls
    unsigned int nonshake_threads_per_block;        // Threads per block in nonshaking kernel call
    unsigned int max_localForces_threads_per_block; // Maximum threads per block in local forces kernel calls
    unsigned int localForces_threads_per_block;     // Threads per block in local forces kernel calls
    unsigned int random_threads_per_block;          // Threads per block in RNG kernel calls
    unsigned int workUnits;                         // Number of work units
    unsigned int* pWorkUnit;                        // Pointer to work units
    unsigned int nonbond_workBlock;                 // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
    unsigned int bornForce2_workBlock;              // Number of work units running second half of Born Forces calculation
    unsigned int workUnitsPerSM;                    // Number of workblocks per SM
    unsigned int nbWorkUnitsPerBlock;               // Number of work units assigned to each nonbond block
    unsigned int nbWorkUnitsPerBlockRemainder;      // Remainder of work units to assign across lower numbered nonbond blocks
    unsigned int bf2WorkUnitsPerBlock;              // Number of work units assigned to each bornForce2 block
    unsigned int bf2WorkUnitsPerBlockRemainder;     // Remainder of work units to assign across lower numbered bornForce2 blocks
    unsigned int stride;                            // Atomic attributes stride
    unsigned int stride2;                           // Atomic attributes stride x 2
    unsigned int stride3;                           // Atomic attributes stride x 3
    unsigned int stride4;                           // Atomic attributes stride x 4
    unsigned int exclusionStride;                   // Exclusion list stride = stride / GRID
    unsigned int nonbondOutputBuffers;              // Nonbond output buffers per nonbond call
    unsigned int totalNonbondOutputBuffers;         // Total nonbond output buffers
    unsigned int outputBuffers;                     // Number of output buffers
    float bigFloat;                                 // Floating point value used as a flag for Shaken atoms
    float epsfac;                                   // Epsilon factor for CDLJ calculations
    float probeRadius;                              // SASA probe radius
    float surfaceAreaFactor;                        // ACE approximation surface area factor
    float electricConstant;                         // ACE approximation electric constant
    float forceConversionFactor;                    // kJ to kcal force conversion factor
    float preFactor;                                // Born electrostatic pre-factor
    float dielectricOffset;                         // Born dielectric offset
    float alphaOBC;                                 // OBC alpha factor
    float betaOBC;                                  // OBC beta factor
    float gammaOBC;                                 // OBC gamma factor
    float deltaT;                                   // Molecular dynamics deltaT constant
    float oneOverDeltaT;                            // 1/deltaT
    float B;                                        // Molecular dynamics B constant
    float C;                                        // Molecular dynamics C constant
    float D;                                        // Molecular dynamics D constant
    float EPH;                                      // Molecular dynamics EPH constant
    float EMH;                                      // Molecular dynamics EMH constant
    float EM;                                       // Molecular dynamics EM constant
    float EP;                                       // Molecular dynamics EP constant
    float GDT;                                      // Molecular dynamics GDT constant
    float OneMinusEM;                               // Molecular dynamics OneMinusEM constant
    float TauOneMinusEM;                            // Molecular dynamics TauOneMinusEM constant
    float TauDOverEMMinusOne;                       // Molecular dynamics TauDOverEMMinusOne constant
    float T;                                        // Molecular dynamics T constant
    float kT;                                       // Boltzmann's constant times T
    float V;                                        // Molecular dynamics V constant
    float X;                                        // Molecular dynamics X constant
    float Yv;                                       // Molecular dynamics Yv constant
    float Yx;                                       // Molecular dynamics Yx constant
    float tau;                                      // Molecular dynamics tau constant
    float fix1;                                     // Molecular dynamics fix1 constant
    float oneOverFix1;                              // Molecular dynamics reciprocal of fix1 constant
    float DOverTauC;                                // Molecular dynamics DOverTauC constant
    float collisionProbability;                     // Collision probability for Andersen thermostat
    float2* pObcData;                               // Pointer to fixed Born data
    float2* pAttr;                                  // Pointer to additional atom attributes (sig, eps)
    unsigned int bonds;                             // Number of bonds
    int4* pBondID;                                  // Bond atom (x, y) and output buffer (z, w) IDs
    float2* pBondParameter;                         // Bond parameters (x = length, y = k)
    unsigned int bond_angles;                       // Number of bond angles
    int4* pBondAngleID1;                            // Bond angle atom and first output buffer IDs
    int2* pBondAngleID2;                            // Bond angle output buffer IDs
    float2* pBondAngleParameter;                    // Bond angle parameters
    unsigned int dihedrals;                         // Number of dihedrals
    int4* pDihedralID1;                             // Dihedral IDs
    int4* pDihedralID2;                             // Dihedral output buffer IDs
    float4* pDihedralParameter;                     // Dihedral parameters
    unsigned int rb_dihedrals;                      // Number of Ryckaert Bellemans dihedrals
    int4* pRbDihedralID1;                           // Ryckaert Bellemans Dihedral IDs
    int4* pRbDihedralID2;                           // Ryckaert Bellemans Dihedral output buffer IDs
    float4* pRbDihedralParameter1;                  // Ryckaert Bellemans Dihedral parameters (first part)
    float2* pRbDihedralParameter2;                  // Ryckaert Bellemans Dihedral parameters (second part)
    unsigned int LJ14s;                             // Number of Lennard Jones 1-4 interactions
    int4* pLJ14ID;                                  // Lennard Jones 1-4 atom and output buffer IDs
    float4* pLJ14Parameter;                         // Lennard Jones 1-4 parameters
    float inverseTotalMass;                         // Used in linear momentum removal
    unsigned int ShakeConstraints;                  // Total number of Shake constraints
    unsigned int NonShakeConstraints;               // Total number of NonShake atoms
    unsigned int maxShakeIterations;                // Maximum shake iterations
    unsigned int degreesOfFreedom;                  // Number of degrees of freedom in system
    float shakeTolerance;                           // Shake tolerance
    float InvMassJ;                                 // Shake inverse mass for hydrogens
    int* pNonShakeID;                               // Not Shaking atoms
    int4* pShakeID;                                 // Shake atoms and phase
    float4* pShakeParameter;                        // Shake parameters
    unsigned int* pExclusion;                       // Nonbond exclusion data
    unsigned int bond_offset;                       // Offset to end of bonds
    unsigned int bond_angle_offset;                 // Offset to end of bond angles
    unsigned int dihedral_offset;                   // Offset to end of dihedrals
    unsigned int rb_dihedral_offset;                // Offset to end of Ryckaert Bellemans dihedrals
    unsigned int LJ14_offset;                       // Offset to end of Lennard Jones 1-4 parameters

    // Mutable stuff
    float4* pPosq;                                  // Pointer to atom positions and charges
    float4* pPosqP;                                 // Pointer to mid-integration atom positions
    float4* pOldPosq;                               // Pointer to old atom positions
    float4* pVelm4;                                 // Pointer to atom velocity and inverse mass
    float4* pvVector4;                              // Pointer to atom v Vector
    float4* pxVector4;                              // Pointer to atom x Vector
    float4* pForce4;                                // Pointer to all force4 data
    float4* pForce4a;                               // Pointer to first set of force4 data
    float4* pForce4b;                               // Pointer to second set of force4 data
    float4* pOutForce4;                             // Pointer to output float4 force
    float* pBornForce;                              // Pointer to Born force data
    float* pBornSum;                                // Pointer to Born Radii calculation output buffers
    float* pBornRadii;                              // Pointer to Born Radii
    float* pObcChain;                               // Pointer to OBC chain data
    float4* pLinearMomentum;                        // Pointer to linear momentum

    // Random numbers
    float4* pRandom4a;                              // Pointer to first set of 4 random numbers
    float4* pRandom4b;                              // Pointer to second set of 4 random numbers
    float2* pRandom2a;                              // Pointer to first set of 2 random numbers
    float2* pRandom2b;                              // Pointer to second set of 2 random numbers
    uint4* pRandomSeed;                             // Pointer to random seeds
    int* pRandomPosition;                           // Pointer to random number positions
    unsigned int randoms;                           // Number of randoms
    unsigned int totalRandoms;                      // Number of randoms plus overflow.
    unsigned int totalRandomsTimesTwo;              // Used for generating randoms
    unsigned int randomIterations;                  // Number of iterations before regenerating randoms
    unsigned int randomFrames;                      // Number of frames of random numbers
};
// Bundle of three float3 values, stored in the order v0, v1, v2.
struct Vectors
{
    float3 v0, v1, v2;
};
#endif
platforms/cuda/src/kernels/gpu.cpp
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <ctime>
#include <cmath>
#include <map>
#ifdef WIN32
#include <windows.h>
#else
#include <stdint.h>
#endif
using
namespace
std
;
#include "gputypes.h"
#include "cudaKernels.h"
#include "OpenMMException.h"
using
OpenMM
::
OpenMMException
;
// Short aliases for the scalar types used by the FAH_* records below.
// The 64-bit pair comes from the compiler extension on Windows and from
// <stdint.h> everywhere else.
#ifdef WIN32
typedef unsigned __int64 u64;
typedef signed __int64   s64;
#else
typedef uint64_t u64;
typedef int64_t  s64;
#endif
typedef unsigned int   u32;
typedef float          f32;
typedef double         f64;
typedef char           ascii;
typedef char           utf8;
typedef unsigned char  u8;
typedef signed char    s8;
typedef unsigned short u16;
typedef signed short   s16;
// Per-atom record: a 4-byte type tag followed by charge and radius.
typedef struct
{
    u8  type[4];
    f32 charge;
    f32 radius;
} FAH_ATOM;
// Bond record: the indices of the two bonded atoms.
typedef struct
{
    u32 a;  /* rule: a < b */
    u32 b;
} FAH_BOND;
// Three single-precision coordinates, stored in the order x, y, z.
typedef struct
{
    f32 x, y, z;
} FAH_XYZ;
// Header record describing a run: identification, counters and, from
// version 2 on, three fixed-size user strings.
typedef struct
{
    u32  magic;
    u32  version;
    utf8 name[64];
    s64  timestamp;
    u64  iterations;
    u32  frames;
    u32  atom_count;
    u32  bond_count;
    /* v2 */
    utf8 user_name[64];
    utf8 user_team[16];
    utf8 user_done[16];
} FAH_INFO;
// Progress record: counters completed so far plus energy and temperature.
typedef struct
{
    u32 magic;
    u32 version;
    s64 timestamp;
    u64 iterations_done;
    u32 frames_done;
    f32 energy;
    f32 temperature;
} FAH_CURRENT;
// Aggregate of the header and progress records plus pointers to the atom,
// bond and coordinate arrays.
// NOTE(review): ownership of the three arrays is not visible here.
typedef struct
{
    FAH_INFO    info;
    FAH_CURRENT current;
    FAH_ATOM*   atoms;
    FAH_BOND*   bonds;
    FAH_XYZ*    xyz;
} PROTEIN;
// A central atom together with up to three peripheral atoms constrained to it.
// All constraints of one cluster must share the same distance and the same
// peripheral inverse mass; addAtom() enforces this.
struct ShakeCluster
{
    int centralID;              // index of the central atom (-1 when default-constructed)
    int peripheralID[3];        // indices of the constrained atoms; the first `size` are valid
    int size;                   // number of valid entries in peripheralID (0..3)
    float distance;             // constraint distance shared by all peripheral atoms
    float centralInvMass, peripheralInvMass;

    // Empty cluster. Every member is given a defined value so that a
    // default-constructed object (e.g. one created by std::map::operator[])
    // can be used with addAtom() without reading indeterminate data.
    ShakeCluster()
        : centralID(-1), size(0), distance(0.0f), centralInvMass(0.0f), peripheralInvMass(0.0f)
    {
        for (int i = 0; i < 3; i++)
            peripheralID[i] = -1;
    }

    // Empty cluster around atom `centralID` whose inverse mass is `invMass`.
    // Initializers are listed in member declaration order.
    ShakeCluster(int centralID, float invMass)
        : centralID(centralID), size(0), distance(0.0f), centralInvMass(invMass), peripheralInvMass(0.0f)
    {
        for (int i = 0; i < 3; i++)
            peripheralID[i] = -1;
    }

    // Add peripheral atom `id`, constrained at `dist`, with inverse mass `invMass`.
    // Throws OpenMMException if the cluster already holds three atoms, or if
    // `dist` / `invMass` differ from the values of the atoms already present
    // (compared exactly, not within a tolerance).
    void addAtom(int id, float dist, float invMass)
    {
        if (size == 3)
            throw OpenMMException("A single atom may only have three constraints");
        if (size > 0 && dist != distance)
            throw OpenMMException("All constraints for a central atom must have the same distance");
        if (size > 0 && invMass != peripheralInvMass)
            throw OpenMMException("All constraints for a central atom must have the same mass");
        peripheralID[size++] = id;
        distance = dist;
        peripheralInvMass = invMass;
    }
};
// File-local physical and model constants.
static const float dielectricOffset         = 0.009f;
static const float PI                       = 3.1415926535f;
static const float probeRadius              = 0.14f;
static const float forceConversionFactor    = 0.4184f;

// Earlier candidate values for surfaceAreaFactor are kept for reference.
//static const float surfaceAreaFactor = -6.0f * 0.06786f * forceConversionFactor * 1000.0f; // PI * 4.0f * 0.0049f * 1000.0f;
//static const float surfaceAreaFactor = -6.0f * PI * 4.0f * 0.0049f * 1000.0f;
static const float surfaceAreaFactor        = -6.0f * PI * 0.0216f * 1000.0f * 0.4184f;
//static const float surfaceAreaFactor = -1.7035573959e+001;
//static const float surfaceAreaFactor = -166.02691f;
//static const float surfaceAreaFactor = 1.0f;

static const float alphaOBC                 = 1.0f;
static const float betaOBC                  = 0.8f;
static const float gammaOBC                 = 4.85f;
static const float kcalMolTokJNM            = -0.4184f;
static const float electricConstant         = -166.02691f;
static const float defaultInnerDielectric   = 1.0f;
static const float defaultSolventDielectric = 78.3f;

static const float KILO                     = 1e3;                  // Thousand
static const float BOLTZMANN                = 1.380658e-23f;        // (J/K)
static const float AVOGADRO                 = 6.0221367e23f;        // ()
static const float RGAS                     = BOLTZMANN * AVOGADRO; // (J/(mol K))
static const float BOLTZ                    = (RGAS / KILO);        // (kJ/(mol K))

// Set to 1 to print each parameter record to stdout as it is stored.
#define DUMP_PARAMETERS 0
#define DeltaShake
// Read harmonic bond parameters from the text file `fname` and install them
// with gpuSetBondParameters().
// File layout as parsed here: the first line starts with the bond count (the
// rest of that line is ignored); it is followed by one whitespace-separated
// record per bond: <ignored int> <atom1> <atom2> <length> <k>.
// Returns the number of bonds read.
// On any failure (file cannot be opened, count missing or negative, record
// missing or malformed) prints a message to stdout and calls exit(-1), so that
// a truncated file can no longer produce silently zeroed parameters.
extern "C"
int gpuReadBondParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening harmonic bond parameter file " << fname << endl;
        exit(-1);
    }

    // Header: the count must parse and be non-negative before it sizes the vectors.
    int bonds = 0;
    infile >> bonds;
    if (infile.fail() || bonds < 0)
    {
        cout << "Error reading harmonic bond parameter file " << fname << endl;
        exit(-1);
    }

    // Discard whatever follows the count on the header line.
    char buff[512];
    infile.getline(buff, 512);

    vector<int> atom1(bonds);
    vector<int> atom2(bonds);
    vector<float> length(bonds);
    vector<float> k(bonds);
    for (int i = 0; i < bonds; i++)
    {
        int junk;   // leading field of each record, not used
        if (!(infile >> junk >> atom1[i] >> atom2[i] >> length[i] >> k[i]))
        {
            cout << "Error reading harmonic bond parameter file " << fname << endl;
            exit(-1);
        }
    }

    gpuSetBondParameters(gpu, atom1, atom2, length, k);
    return bonds;
}
// Store the harmonic bond list in the GPU context and upload it.
//   atom1, atom2 - indices of the two atoms of each bond
//   length, k    - the two per-bond parameters, stored as (x = length, y = k)
// The number of bonds is atom1.size(); the other vectors are indexed over the
// same range. For every bond the next free output-buffer slot of each atom is
// taken from gpu->pOutputBufferCounter (post-incremented, atom1 first) and
// stored in the z / w components of the bond ID.
// NOTE(review): two new CUDAStream objects are created on every call and the
// previous gpu->psBondID / gpu->psBondParameter values are overwritten without
// being freed -- confirm this is called once per context.
extern "C"
void gpuSetBondParameters(gpuContext gpu,
                          const vector<int>& atom1,
                          const vector<int>& atom2,
                          const vector<float>& length,
                          const vector<float>& k)
{
    int bonds = atom1.size();
    gpu->sim.bonds = bonds;

    // Atom / output-buffer IDs.
    CUDAStream<int4>* psBondID = new CUDAStream<int4>(bonds, 1);
    gpu->psBondID = psBondID;
    gpu->sim.pBondID = psBondID->_pDevStream[0];

    // Bond parameters.
    CUDAStream<float2>* psBondParameter = new CUDAStream<float2>(bonds, 1);
    gpu->psBondParameter = psBondParameter;
    gpu->sim.pBondParameter = psBondParameter->_pDevStream[0];

    int4* hostID = psBondID->_pSysStream[0];
    float2* hostParameter = psBondParameter->_pSysStream[0];
    for (int i = 0; i < bonds; i++)
    {
        int4& id = hostID[i];
        float2& parameter = hostParameter[i];

        id.x = atom1[i];
        id.y = atom2[i];
        parameter.x = length[i];
        parameter.y = k[i];

        // Reserve one output buffer per atom; the order matters when x == y.
        id.z = gpu->pOutputBufferCounter[id.x]++;
        id.w = gpu->pOutputBufferCounter[id.y]++;
#if (DUMP_PARAMETERS == 1)
        cout << i << " " << id.x << " " << id.y << " " << id.z << " " << id.w
             << " " << parameter.x << " " << parameter.y << endl;
#endif
    }

    psBondID->Upload();
    psBondParameter->Upload();
}
// Reads harmonic bond-angle terms from the text file `fname` and passes them to
// gpuSetBondAngleParameters().
//
// Layout consumed: a count on the first line (the rest of that line is
// discarded), followed by one record per term of the form
//     <ignored int> atom1 atom2 atom3 angle k
//
// Returns the number of terms read.  If the file cannot be opened an error is
// printed and the process exits with status -1.
extern "C"
int gpuReadBondAngleParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening harmonic bond angle parameter file " << fname << endl;
        exit(-1);
    }

    char restOfLine[512];
    int bond_angles;
    infile >> bond_angles;
    infile.getline(restOfLine, 512);

    vector<int> atom1(bond_angles);
    vector<int> atom2(bond_angles);
    vector<int> atom3(bond_angles);
    vector<float> angle(bond_angles);
    vector<float> k(bond_angles);

    for (int row = 0; row < bond_angles; ++row)
    {
        int ignored;    // leading column of every record is read and dropped
        infile >> ignored >> atom1[row] >> atom2[row] >> atom3[row] >> angle[row] >> k[row];
    }

    gpuSetBondAngleParameters(gpu, atom1, atom2, atom3, angle, k);
    return bond_angles;
}
// Installs the harmonic bond-angle terms on the GPU context.
//
//   atom1, atom2, atom3 : the three atom indices of each angle
//   angle, k            : per-term values stored in .x / .y of the parameter stream
//
// The term count comes from atom1.size().  Three streams are allocated: the
// atom indices plus the first slot number (int4), the remaining two slot
// numbers (int2), and the parameters (float2).  All three are uploaded.
extern "C"
void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3,
        const vector<float>& angle, const vector<float>& k)
{
    const int bond_angles = atom1.size();
    gpu->sim.bond_angles = bond_angles;

    CUDAStream<int4>* psBondAngleID1 = new CUDAStream<int4>(bond_angles, 1);
    gpu->psBondAngleID1 = psBondAngleID1;
    gpu->sim.pBondAngleID1 = psBondAngleID1->_pDevStream[0];

    CUDAStream<int2>* psBondAngleID2 = new CUDAStream<int2>(bond_angles, 1);
    gpu->psBondAngleID2 = psBondAngleID2;
    gpu->sim.pBondAngleID2 = psBondAngleID2->_pDevStream[0];

    CUDAStream<float2>* psBondAngleParameter = new CUDAStream<float2>(bond_angles, 1);
    gpu->psBondAngleParameter = psBondAngleParameter;
    gpu->sim.pBondAngleParameter = psBondAngleParameter->_pDevStream[0];

    for (int term = 0; term < bond_angles; ++term)
    {
        int4& id1 = psBondAngleID1->_pSysStream[0][term];
        int2& id2 = psBondAngleID2->_pSysStream[0][term];
        float2& param = psBondAngleParameter->_pSysStream[0][term];

        id1.x = atom1[term];
        id1.y = atom2[term];
        id1.z = atom3[term];
        param.x = angle[term];
        param.y = k[term];

        // One slot number per participating atom, taken in atom order
        // (counter is read, then incremented).
        id1.w = gpu->pOutputBufferCounter[id1.x]++;
        id2.x = gpu->pOutputBufferCounter[id1.y]++;
        id2.y = gpu->pOutputBufferCounter[id1.z]++;
#if (DUMP_PARAMETERS == 1)
        cout << term << " " << id1.x << " " << id1.y << " " << id1.z << " " << id1.w
             << " " << id2.x << " " << id2.y
             << " " << param.x << " " << param.y << endl;
#endif
    }

    psBondAngleID1->Upload();
    psBondAngleID2->Upload();
    psBondAngleParameter->Upload();
}
// Reads periodic dihedral terms from the text file `fname` and passes them to
// gpuSetDihedralParameters().
//
// Layout consumed: a count on the first line (the rest of that line is
// discarded), followed by one record per term of the form
//     <ignored int> atom1 atom2 atom3 atom4 k phase periodicity
//
// Returns the number of terms read.  If the file cannot be opened an error is
// printed and the process exits with status -1.
extern "C"
int gpuReadDihedralParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening dihedral parameter file " << fname << endl;
        exit(-1);
    }

    char restOfLine[512];
    int dihedrals;
    infile >> dihedrals;
    infile.getline(restOfLine, 512);

    vector<int> atom1(dihedrals);
    vector<int> atom2(dihedrals);
    vector<int> atom3(dihedrals);
    vector<int> atom4(dihedrals);
    vector<float> k(dihedrals);
    vector<float> phase(dihedrals);
    vector<int> periodicity(dihedrals);

    for (int row = 0; row < dihedrals; ++row)
    {
        int ignored;    // leading column of every record is read and dropped
        infile >> ignored
               >> atom1[row] >> atom2[row] >> atom3[row] >> atom4[row]
               >> k[row] >> phase[row] >> periodicity[row];
    }

    gpuSetDihedralParameters(gpu, atom1, atom2, atom3, atom4, k, phase, periodicity);
    return dihedrals;
}
// Installs the periodic dihedral terms on the GPU context.
//
//   atom1..atom4 : the four atom indices of each dihedral
//   k, phase     : stored in .x / .y of the parameter stream
//   periodicity  : converted to float and stored in .z of the parameter stream
//
// The term count comes from atom1.size().  The .w field of the parameter
// stream is not written by this function.  All three streams are uploaded.
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
{
    const int dihedrals = atom1.size();
    gpu->sim.dihedrals = dihedrals;

    CUDAStream<int4>* psDihedralID1 = new CUDAStream<int4>(dihedrals, 1);
    gpu->psDihedralID1 = psDihedralID1;
    gpu->sim.pDihedralID1 = psDihedralID1->_pDevStream[0];

    CUDAStream<int4>* psDihedralID2 = new CUDAStream<int4>(dihedrals, 1);
    gpu->psDihedralID2 = psDihedralID2;
    gpu->sim.pDihedralID2 = psDihedralID2->_pDevStream[0];

    CUDAStream<float4>* psDihedralParameter = new CUDAStream<float4>(dihedrals, 1);
    gpu->psDihedralParameter = psDihedralParameter;
    gpu->sim.pDihedralParameter = psDihedralParameter->_pDevStream[0];

    for (int term = 0; term < dihedrals; ++term)
    {
        int4& atoms = psDihedralID1->_pSysStream[0][term];
        int4& slots = psDihedralID2->_pSysStream[0][term];
        float4& param = psDihedralParameter->_pSysStream[0][term];

        atoms.x = atom1[term];
        atoms.y = atom2[term];
        atoms.z = atom3[term];
        atoms.w = atom4[term];
        param.x = k[term];
        param.y = phase[term];
        param.z = (float) periodicity[term];

        // One slot number per participating atom, taken in atom order
        // (counter is read, then incremented).
        slots.x = gpu->pOutputBufferCounter[atoms.x]++;
        slots.y = gpu->pOutputBufferCounter[atoms.y]++;
        slots.z = gpu->pOutputBufferCounter[atoms.z]++;
        slots.w = gpu->pOutputBufferCounter[atoms.w]++;
#if (DUMP_PARAMETERS == 1)
        cout << term << " " << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w
             << " " << slots.x << " " << slots.y << " " << slots.z << " " << slots.w
             << " " << param.x << " " << param.y << " " << param.z << endl;
#endif
    }

    psDihedralID1->Upload();
    psDihedralID2->Upload();
    psDihedralParameter->Upload();
}
// Reads Ryckaert-Bellemans dihedral terms from the text file `fname` and passes
// them to gpuSetRbDihedralParameters().
//
// Layout consumed: a count on the first line (the rest of that line is
// discarded), followed by one record per term of the form
//     <ignored int> atom1 atom2 atom3 atom4 c0 c1 c2 c3 c4 c5
//
// Returns the number of terms read.  If the file cannot be opened an error is
// printed and the process exits with status -1.
//
// This function deliberately does not allocate any CUDAStream objects or touch
// gpu->sim itself: gpuSetRbDihedralParameters() sets gpu->sim.rb_dihedrals and
// allocates all four streams.  Allocating them here as well would orphan the
// first set as soon as the setter overwrote the pointers.
extern "C"
int gpuReadRbDihedralParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (!infile.fail())
    {
        char buff[512];
        int rb_dihedrals;
        infile >> rb_dihedrals;
        infile.getline(buff, 512);
        vector<int> atom1(rb_dihedrals);
        vector<int> atom2(rb_dihedrals);
        vector<int> atom3(rb_dihedrals);
        vector<int> atom4(rb_dihedrals);
        vector<float> c0(rb_dihedrals);
        vector<float> c1(rb_dihedrals);
        vector<float> c2(rb_dihedrals);
        vector<float> c3(rb_dihedrals);
        vector<float> c4(rb_dihedrals);
        vector<float> c5(rb_dihedrals);

        for (int i = 0; i < rb_dihedrals; i++)
        {
            int junk;   // leading column of every record is read and dropped
            infile >> junk
                   >> atom1[i] >> atom2[i] >> atom3[i] >> atom4[i]
                   >> c0[i] >> c1[i] >> c2[i] >> c3[i] >> c4[i] >> c5[i];
        }
        gpuSetRbDihedralParameters(gpu, atom1, atom2, atom3, atom4, c0, c1, c2, c3, c4, c5);
        return rb_dihedrals;
    }
    else
    {
        cout << "Error opening Ryckaert-Bellemans dihedral parameter file " << fname << endl;
        exit(-1);
    }
    return 0;
}
// Installs the Ryckaert-Bellemans dihedral terms on the GPU context.
//
//   atom1..atom4 : the four atom indices of each dihedral
//   c0..c3       : stored in .x/.y/.z/.w of the first parameter stream (float4)
//   c4, c5       : stored in .x/.y of the second parameter stream (float2)
//
// The term count comes from atom1.size().  Four streams are allocated, filled
// on the host, and uploaded.
extern "C"
void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
{
    const int rb_dihedrals = atom1.size();
    gpu->sim.rb_dihedrals = rb_dihedrals;

    CUDAStream<int4>* psRbDihedralID1 = new CUDAStream<int4>(rb_dihedrals, 1);
    gpu->psRbDihedralID1 = psRbDihedralID1;
    gpu->sim.pRbDihedralID1 = psRbDihedralID1->_pDevStream[0];

    CUDAStream<int4>* psRbDihedralID2 = new CUDAStream<int4>(rb_dihedrals, 1);
    gpu->psRbDihedralID2 = psRbDihedralID2;
    gpu->sim.pRbDihedralID2 = psRbDihedralID2->_pDevStream[0];

    CUDAStream<float4>* psRbDihedralParameter1 = new CUDAStream<float4>(rb_dihedrals, 1);
    gpu->psRbDihedralParameter1 = psRbDihedralParameter1;
    gpu->sim.pRbDihedralParameter1 = psRbDihedralParameter1->_pDevStream[0];

    CUDAStream<float2>* psRbDihedralParameter2 = new CUDAStream<float2>(rb_dihedrals, 1);
    gpu->psRbDihedralParameter2 = psRbDihedralParameter2;
    gpu->sim.pRbDihedralParameter2 = psRbDihedralParameter2->_pDevStream[0];

    for (int term = 0; term < rb_dihedrals; ++term)
    {
        int4& atoms = psRbDihedralID1->_pSysStream[0][term];
        int4& slots = psRbDihedralID2->_pSysStream[0][term];
        float4& lowCoeffs = psRbDihedralParameter1->_pSysStream[0][term];
        float2& highCoeffs = psRbDihedralParameter2->_pSysStream[0][term];

        atoms.x = atom1[term];
        atoms.y = atom2[term];
        atoms.z = atom3[term];
        atoms.w = atom4[term];

        lowCoeffs.x = c0[term];
        lowCoeffs.y = c1[term];
        lowCoeffs.z = c2[term];
        lowCoeffs.w = c3[term];
        highCoeffs.x = c4[term];
        highCoeffs.y = c5[term];

        // One slot number per participating atom, taken in atom order
        // (counter is read, then incremented).
        slots.x = gpu->pOutputBufferCounter[atoms.x]++;
        slots.y = gpu->pOutputBufferCounter[atoms.y]++;
        slots.z = gpu->pOutputBufferCounter[atoms.z]++;
        slots.w = gpu->pOutputBufferCounter[atoms.w]++;
#if (DUMP_PARAMETERS == 1)
        cout << term << " " << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w
             << " " << slots.x << " " << slots.y << " " << slots.z << " " << slots.w
             << " " << lowCoeffs.x << " " << lowCoeffs.y << " " << lowCoeffs.z << " " << lowCoeffs.w
             << " " << highCoeffs.x << " " << highCoeffs.y << endl;
#endif
    }

    psRbDihedralID1->Upload();
    psRbDihedralID2->Upload();
    psRbDihedralParameter1->Upload();
    psRbDihedralParameter2->Upload();
}
// Reads Lennard-Jones 1-4 terms from the text file `fname` and passes them to
// gpuSetLJ14Parameters().
//
// Header line, as consumed here: the term count, then up to 60 characters that
// are skipped, then epsfac, then up to 7 characters that are skipped, then
// fudge; the rest of the line is discarded.  Each following record has the form
//     <ignored int> atom1 atom2 c6 c12 q1 q2
//
// Returns the number of terms read.  If the file cannot be opened an error is
// printed and the process exits with status -1.
extern "C"
int gpuReadLJ14Parameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening Lennard-Jones 1-4 parameter file " << fname << endl;
        exit(-1);
    }

    char scratch[1024];
    float epsfac = 0.0f;
    float fudge = 0.0f;
    int LJ14s;

    infile >> LJ14s;
    infile.get(scratch, 61);        // skip fixed-width text before epsfac
    infile >> epsfac;
    infile.get(scratch, 8);         // skip fixed-width text before fudge
    infile >> fudge;
    infile.getline(scratch, 512);   // drop the remainder of the header line

    vector<int> atom1(LJ14s);
    vector<int> atom2(LJ14s);
    vector<float> c6(LJ14s);
    vector<float> c12(LJ14s);
    vector<float> q1(LJ14s);
    vector<float> q2(LJ14s);

    for (int row = 0; row < LJ14s; ++row)
    {
        int ignored;    // leading column of every record is read and dropped
        infile >> ignored >> atom1[row] >> atom2[row] >> c6[row] >> c12[row] >> q1[row] >> q2[row];
    }

    gpuSetLJ14Parameters(gpu, epsfac, fudge, atom1, atom2, c6, c12, q1, q2);
    return LJ14s;
}
// Installs the Lennard-Jones 1-4 terms on the GPU context.
//
//   epsfac, fudge : multiplied together to form the scale applied to q1*q2
//   atom1, atom2  : the two atom indices of each 1-4 pair
//   c6, c12       : Lennard-Jones coefficients of the pair
//   q1, q2        : charges of the two atoms
//
// Per term, the parameter stream receives
//   .x = c6*c6/c12             (0 when c12 == 0)
//   .y = (c12/c6)^(1/6)        (1 when c12 == 0)
//   .z = epsfac*fudge*q1*q2
// The .w field is not written.  The term count comes from atom1.size().
//
// The optional DUMP_PARAMETERS trace is emitted once per term from inside the
// loop, where the index and the p0/p1/p2 temporaries are in scope.
extern "C"
void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2,
        const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
{
    int LJ14s = atom1.size();
    float scale = epsfac * fudge;

    gpu->sim.LJ14s = LJ14s;
    CUDAStream<int4>* psLJ14ID = new CUDAStream<int4>(LJ14s, 1);
    gpu->psLJ14ID = psLJ14ID;
    gpu->sim.pLJ14ID = psLJ14ID->_pDevStream[0];
    CUDAStream<float4>* psLJ14Parameter = new CUDAStream<float4>(LJ14s, 1);
    gpu->psLJ14Parameter = psLJ14Parameter;
    gpu->sim.pLJ14Parameter = psLJ14Parameter->_pDevStream[0];

    for (int i = 0; i < LJ14s; i++)
    {
        psLJ14ID->_pSysStream[0][i].x = atom1[i];
        psLJ14ID->_pSysStream[0][i].y = atom2[i];
        // One slot number per participating atom (counter is read, then incremented).
        psLJ14ID->_pSysStream[0][i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].x]++;
        psLJ14ID->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].y]++;

        float p0, p1, p2;
        if (c12[i] == 0.0f)
        {
            // Avoid dividing by zero when the pair has no repulsive term.
            p0 = 0.0f;
            p1 = 1.0f;
        }
        else
        {
            p0 = c6[i] * c6[i] / c12[i];
            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
        }
        p2 = scale * q1[i] * q2[i];
        psLJ14Parameter->_pSysStream[0][i].x = p0;
        psLJ14Parameter->_pSysStream[0][i].y = p1;
        psLJ14Parameter->_pSysStream[0][i].z = p2;
#if (DUMP_PARAMETERS == 1)
        cout <<
            i << " " <<
            psLJ14ID->_pSysStream[0][i].x << " " <<
            psLJ14ID->_pSysStream[0][i].y << " " <<
            psLJ14ID->_pSysStream[0][i].z << " " <<
            psLJ14ID->_pSysStream[0][i].w << " " <<
            psLJ14Parameter->_pSysStream[0][i].x << " " <<
            psLJ14Parameter->_pSysStream[0][i].y << " " <<
            psLJ14Parameter->_pSysStream[0][i].z << " " <<
            p0 << " " <<
            p1 << " " <<
            p2 << " " <<
            endl;
#endif
    }

    psLJ14ID->Upload();
    psLJ14Parameter->Upload();
}
// Looks up the atom type named `s` in gpu->gpAtomTable (the first
// gpu->gAtomTypes entries are searched in order) and returns that entry's `r`
// field.  Returns 0.0f when no entry has a matching name.
extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s)
{
    int slot = 0;
    while (slot < gpu->gAtomTypes)
    {
        const gpuAtomType& entry = gpu->gpAtomTable[slot];
        if (s == entry.name)
            return entry.r;
        ++slot;
    }
    return 0.0f;
}
// Looks up the atom type named `s` in gpu->gpAtomTable (the first
// gpu->gAtomTypes entries are searched in order) and returns that entry's
// `symbol` field.  Returns a blank (' ') when no entry has a matching name.
extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s)
{
    int slot = 0;
    while (slot < gpu->gAtomTypes)
    {
        const gpuAtomType& entry = gpu->gpAtomTable[slot];
        if (s == entry.name)
            return entry.symbol;
        ++slot;
    }
    return ' ';
}
// Rebuilds gpu->gpAtomTable from the text file `fname`.
//
// The file is read twice.  The first pass counts the lines whose first
// character is a blank (each one is treated as an atom-type record) and the
// number of other lines that appear before the first such record.  The second
// pass skips that many leading lines and then reads `name` and `r` for each
// record, discarding the rest of the line.
//
// The `symbol` of each entry is derived from `r` alone:
//     r < 1.3 -> 'H',  r < 1.6 -> 'O',  r < 1.7 -> 'N',  otherwise 'C'.
//
// Any previously allocated table is released first.  Returns the number of
// atom types.  If the file cannot be opened an error is printed and the
// process exits with status -1.
extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname)
{
    gpu->gAtomTypes = 0;
    if (gpu->gpAtomTable)
        delete[] gpu->gpAtomTable;

    // Pass 1: count records and leading non-record lines.
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening atom parameter file " << fname << endl;
        exit(-1);
    }

    char line[1024];
    int leadingLines = 0;
    bool beforeFirstRecord = true;
    while (infile.getline(line, 512))
    {
        if (line[0] == ' ')
        {
            beforeFirstRecord = false;
            gpu->gAtomTypes++;
        }
        else if (beforeFirstRecord)
        {
            leadingLines++;
        }
    }
    infile.close();

    gpu->gpAtomTable = new gpuAtomType[gpu->gAtomTypes];

    // Pass 2: reopen, skip the leading lines, then parse the records.
    ifstream infile1(fname);
    for (int skipped = 0; skipped < leadingLines; ++skipped)
        infile1.getline(line, 512);

    for (int t = 0; t < gpu->gAtomTypes; ++t)
    {
        gpuAtomType& entry = gpu->gpAtomTable[t];
        infile1 >> entry.name >> entry.r;
        infile1.getline(line, 512);

        // Determine symbol
        if (entry.r < 1.3f)
        {
            entry.symbol = 'H';
        }
        else if (entry.r < 1.6f)
        {
            entry.symbol = 'O';
        }
        else if (entry.r < 1.7f)
        {
            entry.symbol = 'N';
        }
        else
        {
            entry.symbol = 'C';
        }
#if (DUMP_PARAMETERS == 1)
        cout << t << " " << entry.name << " " << entry.symbol << " " << entry.r << endl;
#endif
    }

    return gpu->gAtomTypes;
}
// Reads per-atom nonbonded parameters from the text file `fname`, then calls
// gpuSetCoulombParameters() and gpuSetObcParameters() with them.
//
// Header line, as consumed here: the atom count, up to 8 skipped characters,
// epsfac, up to 7 skipped characters, fudge; the rest of the line is
// discarded.  `fudge` is parsed but not used by this function.  Each following
// record has the form
//     <ignored int> c6 c12 q <atom type name> scale numExclusions excl_1 ... excl_n
//
// The atom type name is resolved to a radius and symbol through
// gpuGetAtomicRadius() / gpuGetAtomicSymbol().  The `atom` vector passed on
// is left at its initial zero values; only its size is meaningful.
//
// Prints the number of exclusion pairs read, counting each entry whose partner
// index is <= the owning atom's index (so a pair listed from both sides is
// counted once).  Returns the number of atoms.  If the file cannot be opened
// an error is printed and the process exits with status -1.
extern "C"
int gpuReadCoulombParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (!infile.fail())
    {
        char buff[1024];
        unsigned int coulombs = 0;     // stays 0 if the header cannot be parsed
        float fudge = 0.0f;
        float epsfac = 1.0f;
        infile >> coulombs;
        infile.get(buff, 9);
        infile >> epsfac;
        infile.get(buff, 8);
        infile >> fudge;
        infile.getline(buff, 512);
        vector<int> atom(coulombs);
        vector<float> c6(coulombs);
        vector<float> c12(coulombs);
        vector<float> q(coulombs);
        vector<float> radius(coulombs);
        vector<float> scale(coulombs);
        vector<char> symbol(coulombs);
        vector<vector<int> > exclusions(coulombs);
        unsigned int total_exclusions = 0;
        for (unsigned int i = 0; i < coulombs; i++)
        {
            int junk;
            int numExclusions = 0;      // a failed read must not leave a garbage loop bound
            string atype;               // std::string: the type name has no fixed length limit
            infile >> junk >> c6[i] >> c12[i] >> q[i] >> atype >> scale[i] >> numExclusions;
            radius[i] = gpuGetAtomicRadius(gpu, atype);
            symbol[i] = gpuGetAtomicSymbol(gpu, atype);
            for (int j = 0; j < numExclusions; j++)
            {
                int exclusion;
                if (!(infile >> exclusion))
                    break;              // truncated or malformed record: stop instead of storing garbage
                exclusions[i].push_back(exclusion);
                if (exclusion >= 0 && (unsigned int) exclusion <= i)
                    total_exclusions++;
            }
        }
        cout << total_exclusions << " total exclusions.\n";
        gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions);
        gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
        return coulombs;
    }
    else
    {
        cout << "Error opening Coulomb parameter file " << fname << endl;
        exit(-1);
    }
    return 0;
}
// Installs per-atom charge and Lennard-Jones data plus the exclusion table.
//
//   epsfac     : stored in gpu->sim.epsfac
//   atom       : only its size is used (the number of real atoms)
//   c6, c12    : per-atom Lennard-Jones coefficients
//   q          : per-atom charge, stored in psPosq4[i].w
//   symbol     : optional per-atom symbol; entries beyond its length are left untouched
//   exclusions : exclusions[i] lists the atoms excluded from interacting with atom i
//
// For each atom psSigEps2 receives
//   .x = 0.5 * (c12/c6)^(1/6)   (0.5 when either coefficient is not positive)
//   .y = c6 * sqrt(1/c12)       (0   when either coefficient is not positive)
//
// Entries between the real atom count and paddedNumberOfAtoms are filled with
// far-away positions and zero charge/LJ values, and their rows and columns of
// gpu->pExclusion are cleared.  psPosq4 and psSigEps2 are uploaded; a warning
// is printed for every asymmetric pair in the exclusion table.
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
        const vector<char>& symbol, const vector<vector<int> >& exclusions)
{
    unsigned int coulombs = atom.size();
    gpu->sim.epsfac = epsfac;

    for (unsigned int i = 0; i < coulombs; i++)
    {
        float p0 = q[i];
        float p1 = 0.5f, p2 = 0.0f;
        if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
        {
            p1 = 0.5f * pow(c12[i] / c6[i], 1.0f / 6.0f);
            p2 = c6[i] * sqrt(1.0f / c12[i]);
        }
        // Callers may pass an empty symbol vector; never read past its end.
        if (i < symbol.size())
            gpu->pAtomSymbol[i] = symbol[i];
        gpu->psPosq4->_pSysStream[0][i].w = p0;
        gpu->psSigEps2->_pSysStream[0][i].x = p1;
        gpu->psSigEps2->_pSysStream[0][i].y = p2;

#if (DUMP_PARAMETERS == 1)
        cout <<
            i << " " <<
            gpu->psPosq4->_pSysStream[0][i].w << " " <<
            gpu->psSigEps2->_pSysStream[0][i].x << " " <<
            gpu->psSigEps2->_pSysStream[0][i].y << " " <<
            p0 << " " <<
            p1 << " " <<
            p2 << " " <<
            exclusions[i].size();
#endif
        for (int j = 0; j < (int) exclusions[i].size(); j++)
        {
#if (DUMP_PARAMETERS == 1)
            cout << " " << exclusions[i][j];
#endif
            // The table is written with 0 for an excluded pair.
            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
        }
#if (DUMP_PARAMETERS == 1)
        cout << endl;
#endif
    }

    // Dummy out extra atom data
    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
        gpu->psPosq4->_pSysStream[0][i].x = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].y = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].z = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].w = 0.0f;
        gpu->psSigEps2->_pSysStream[0][i].x = 0.0f;
        gpu->psSigEps2->_pSysStream[0][i].y = 0.0f;
    }

    // Add in remaining exclusions: every pair involving a padding atom.
    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
        for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
        {
            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
            gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
        }
    }

    gpu->psPosq4->Upload();
    gpu->psSigEps2->Upload();

    // Check for exclusion consistency
    for (unsigned int i = 0; i < coulombs; i++)
    {
        for (unsigned int j = i; j < coulombs; j++)
        {
            if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i])
                cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl;
        }
    }
}
// Installs the per-atom OBC data and the GB prefactor.
//
//   innerDielectric, solventDielectric : used only to compute gpu->sim.preFactor
//   atom   : only its size is used (the number of real atoms)
//   radius : per-atom radius; dielectricOffset is subtracted before storing in .x
//   scale  : per-atom factor; .y stores scale * (radius - dielectricOffset)
//
// Entries between the real atom count and paddedNumberOfAtoms get a Born
// radius of 0.2 and OBC values of 0.01.  psBornRadii and psObcData are
// uploaded.  Born radii of the real atoms are not written here.
extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, const vector<float>& scale)
{
    const unsigned int atoms = atom.size();

    for (unsigned int a = 0; a < atoms; ++a)
    {
        float2& obc = gpu->psObcData->_pSysStream[0][a];
        obc.x = radius[a] - dielectricOffset;
        obc.y = scale[a] * obc.x;
#if (DUMP_PARAMETERS == 1)
        cout << a << " " << obc.x << " " << obc.y;
#endif
    }

    // Dummy out extra atom data
    for (unsigned int a = atoms; a < gpu->sim.paddedNumberOfAtoms; ++a)
    {
        float2& obc = gpu->psObcData->_pSysStream[0][a];
        gpu->psBornRadii->_pSysStream[0][a] = 0.2f;
        obc.x = 0.01f;
        obc.y = 0.01f;
    }

    gpu->psBornRadii->Upload();
    gpu->psObcData->Upload();

    gpu->sim.preFactor = 2.0f * electricConstant * ((1.0f / innerDielectric) - (1.0f / solventDielectric)) * gpu->sim.forceConversionFactor;
}
// Reads SHAKE constraints from the text file `fname` and passes them to
// gpuSetShakeParameters() with a tolerance of 1e-4.
//
// Layout consumed: one leading word, then the constraint count (the rest of
// that line is discarded), followed by one record per constraint of the form
//     <ignored int> atom1 atom2 distance invMass1 invMass2
//
// Returns gpu->sim.ShakeConstraints as left by gpuSetShakeParameters().  If
// the file cannot be opened an error is printed and the process exits with
// status -1.
extern "C"
int gpuReadShakeParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening Shake parameter file " << fname << endl;
        exit(-1);
    }

    char scratch[512];
    int shake_constraints;
    infile >> scratch >> shake_constraints;
    infile.getline(scratch, 512);

    vector<int> atom1(shake_constraints);
    vector<int> atom2(shake_constraints);
    vector<float> distance(shake_constraints);
    vector<float> invMass1(shake_constraints);
    vector<float> invMass2(shake_constraints);

    for (int row = 0; row < shake_constraints; ++row)
    {
        int ignored;    // leading column of every record is read and dropped
        infile >> ignored >> atom1[row] >> atom2[row] >> distance[row] >> invMass1[row] >> invMass2[row];
    }

    gpuSetShakeParameters(gpu, atom1, atom2, distance, invMass1, invMass2, 1e-4f);
    return gpu->sim.ShakeConstraints;
}
// Groups pairwise distance constraints into SHAKE clusters and uploads them.
//
//   atom1, atom2       : the two atom indices of each constraint
//   distance           : constrained distance of each pair
//   invMass1, invMass2 : inverse masses of atom1[i] / atom2[i]
//   tolerance          : stored in gpu->sim.shakeTolerance
//
// Every constraint is assigned a "central" atom: the one that takes part in
// more than one constraint, or, when neither does, the one with the smaller
// index.  The other atom is "peripheral" and must appear in exactly one
// constraint.  Constraints sharing a central atom form one cluster; the
// uploaded ID stream holds the central atom and up to three peripheral atoms
// (-1 marks an unused slot).
//
// Throws OpenMMException when the input vectors differ in length, when an atom
// index lies outside [0, gpu->natoms), or when a peripheral atom takes part in
// more than one constraint.
extern "C"
void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
        const vector<float>& invMass1, const vector<float>& invMass2, float tolerance)
{
    const size_t numConstraints = atom1.size();
    if (atom2.size() != numConstraints || distance.size() != numConstraints ||
        invMass1.size() != numConstraints || invMass2.size() != numConstraints)
        throw OpenMMException("Constraint parameter vectors must all have the same length");

    // Find how many constraints each atom is involved in.
    vector<int> constraintCount(gpu->natoms, 0);
    for (size_t i = 0; i < numConstraints; i++)
    {
        // The indices are used as array subscripts below, so reject bad ones up front.
        if (atom1[i] < 0 || atom1[i] >= gpu->natoms || atom2[i] < 0 || atom2[i] >= gpu->natoms)
            throw OpenMMException("Constraint refers to an atom index outside the system");
        constraintCount[atom1[i]]++;
        constraintCount[atom2[i]]++;
    }

    // Find clusters consisting of a central atom with up to three peripheral atoms.
    map<int, ShakeCluster> clusters;
    for (size_t i = 0; i < numConstraints; i++)
    {
        // Determine which is the central atom.
        bool firstIsCentral;
        if (constraintCount[atom1[i]] > 1)
            firstIsCentral = true;
        else if (constraintCount[atom2[i]] > 1)
            firstIsCentral = false;
        else if (atom1[i] < atom2[i])
            firstIsCentral = true;
        else
            firstIsCentral = false;

        int centralID, peripheralID;
        float centralInvMass, peripheralInvMass;
        if (firstIsCentral)
        {
            centralID = atom1[i];
            peripheralID = atom2[i];
            centralInvMass = invMass1[i];
            peripheralInvMass = invMass2[i];
        }
        else
        {
            centralID = atom2[i];
            peripheralID = atom1[i];
            centralInvMass = invMass2[i];
            peripheralInvMass = invMass1[i];
        }
        if (constraintCount[peripheralID] != 1)
            throw OpenMMException("Only bonds to hydrogens may be constrained");

        // Add it to the cluster.
        if (clusters.find(centralID) == clusters.end())
        {
            clusters[centralID] = ShakeCluster(centralID, centralInvMass);
        }
        clusters[centralID].addAtom(peripheralID, distance[i], peripheralInvMass);
    }

    // Fill in the Cuda streams.
    CUDAStream<int4>* psShakeID = new CUDAStream<int4>((int) clusters.size(), 1);
    gpu->psShakeID = psShakeID;
    gpu->sim.pShakeID = psShakeID->_pDevStream[0];
    CUDAStream<float4>* psShakeParameter = new CUDAStream<float4>((int) clusters.size(), 1);
    gpu->psShakeParameter = psShakeParameter;
    gpu->sim.pShakeParameter = psShakeParameter->_pDevStream[0];
    gpu->sim.ShakeConstraints = clusters.size();
    int index = 0;
    for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter)
    {
        const ShakeCluster& cluster = iter->second;
        psShakeID->_pSysStream[0][index].x = cluster.centralID;
        psShakeID->_pSysStream[0][index].y = cluster.peripheralID[0];
        psShakeID->_pSysStream[0][index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
        psShakeID->_pSysStream[0][index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
        psShakeParameter->_pSysStream[0][index].x = cluster.centralInvMass;
        psShakeParameter->_pSysStream[0][index].y = 0.5f / (cluster.centralInvMass + cluster.peripheralInvMass);
        psShakeParameter->_pSysStream[0][index].z = cluster.distance * cluster.distance;
        psShakeParameter->_pSysStream[0][index].w = cluster.peripheralInvMass;
        ++index;
    }
    psShakeID->Upload();
    psShakeParameter->Upload();
    gpu->sim.shakeTolerance = tolerance;

    // Spread the clusters over the available blocks (rounded up), clamped to
    // the range [1, max_shake_threads_per_block].
    gpu->sim.shake_threads_per_block = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
    if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
        gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
    if (gpu->sim.shake_threads_per_block < 1)
        gpu->sim.shake_threads_per_block = 1;

#ifdef DeltaShake
    // count number of atoms w/o constraint
    int count = 0;
    for (int i = 0; i < gpu->natoms; i++)
        if (constraintCount[i] == 0)
            count++;

    // Allocate NonShake parameters
    gpu->sim.NonShakeConstraints = count;
    // NOTE(review): "|| true" makes this branch unconditional, so the else
    // below is never taken and a stream is allocated even when count is 0.
    // Left as found; confirm whether that is intended.
    if (count || true) {
        CUDAStream<int>* psNonShakeID = new CUDAStream<int>(count, 1);
        gpu->psNonShakeID = psNonShakeID;
        gpu->sim.pNonShakeID = psNonShakeID->_pDevStream[0];

        gpu->sim.nonshake_threads_per_block = (count + gpu->sim.blocks - 1) / gpu->sim.blocks;
        if (gpu->sim.nonshake_threads_per_block > gpu->sim.max_shake_threads_per_block)
            gpu->sim.nonshake_threads_per_block = gpu->sim.max_shake_threads_per_block;
        if (gpu->sim.nonshake_threads_per_block < 1)
            gpu->sim.nonshake_threads_per_block = 1;

        // load indices
        count = 0;
        for (int i = 0; i < gpu->natoms; i++) {
            if (constraintCount[i] == 0) {
                psNonShakeID->_pSysStream[0][count++] = i;
            }
        }
        psNonShakeID->Upload();
    }
    else {
        gpu->sim.nonshake_threads_per_block = 0;
    }
#endif
}
// Allocates the per-atom and random-number streams of the GPU context and
// derives the related size fields in gpu->sim from gpu->natoms.
//
// gpu->sim.blocks and gpu->sim.random_threads_per_block are read here and must
// already hold their final values.  gpu->gpAtomTable is reset to NULL and
// gpu->gAtomTypes to 0.  Always returns 1.
extern "C"
int gpuAllocateInitialBuffers(gpuContext gpu)
{
    // --- Sizes -------------------------------------------------------------
    gpu->sim.atoms = gpu->natoms;
    // Round the atom count up to a multiple of GRID.
    gpu->sim.paddedNumberOfAtoms = ((gpu->sim.atoms + GRID - 1) >> GRIDBITS) << GRIDBITS;
    gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6;
    gpu->gpAtomTable = NULL;
    gpu->gAtomTypes = 0;
    gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
    gpu->sim.totalNonbondOutputBuffers = 2 * gpu->sim.nonbondOutputBuffers;
    gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;

    // --- Per-atom streams --------------------------------------------------
    gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pPosq = gpu->psPosq4->_pDevStream[0];
    // Strides are taken from the position stream.
    gpu->sim.stride = gpu->psPosq4->_stride;
    gpu->sim.stride2 = 2 * gpu->sim.stride;
    gpu->sim.stride3 = 3 * gpu->sim.stride;
    gpu->sim.stride4 = 4 * gpu->sim.stride;
    gpu->sim.exclusionStride = gpu->sim.stride / GRID;

    gpu->psPosqP4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pPosqP = gpu->psPosqP4->_pDevStream[0];

    gpu->psOldPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pOldPosq = gpu->psOldPosq4->_pDevStream[0];

    gpu->psVelm4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pVelm4 = gpu->psVelm4->_pDevStream[0];

    gpu->psvVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pvVector4 = gpu->psvVector4->_pDevStream[0];

    gpu->psxVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pxVector4 = gpu->psxVector4->_pDevStream[0];

    gpu->psBornRadii = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pBornRadii = gpu->psBornRadii->_pDevStream[0];

    gpu->psObcChain = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0];

    gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0];

    gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];

    gpu->pAtomSymbol = new unsigned char[gpu->natoms];

    // --- Random number buffers ----------------------------------------------
    // Determine randoms
    gpu->seed = (unsigned long) time(NULL) & 0x000fffff;
    gpu->sim.randomFrames = 995;
    gpu->sim.randomIterations = gpu->sim.randomFrames;
    gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
    gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
    gpu->sim.totalRandomsTimesTwo = gpu->sim.totalRandoms * 2;

    gpu->psRandom4 = new CUDAStream<float4>(gpu->sim.totalRandomsTimesTwo, 1);
    gpu->psRandom2 = new CUDAStream<float2>(gpu->sim.totalRandomsTimesTwo, 1);
    gpu->psRandomPosition = new CUDAStream<int>(gpu->sim.blocks, 1);
    gpu->psRandomSeed = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1);

    // The "a" pointers address the first half of each random stream, the "b"
    // pointers the second half.
    gpu->sim.pRandom4a = gpu->psRandom4->_pDevStream[0];
    gpu->sim.pRandom2a = gpu->psRandom2->_pDevStream[0];
    gpu->sim.pRandom4b = gpu->psRandom4->_pDevStream[0] + gpu->sim.totalRandoms;
    gpu->sim.pRandom2b = gpu->psRandom2->_pDevStream[0] + gpu->sim.totalRandoms;
    gpu->sim.pRandomPosition = gpu->psRandomPosition->_pDevStream[0];
    gpu->sim.pRandomSeed = gpu->psRandomSeed->_pDevStream[0];

    for (int block = 0; block < (int) gpu->sim.blocks; ++block)
    {
        gpu->psRandomPosition->_pSysStream[0][block] = 0;
    }

    // Seed the host generator from gpu->seed, then draw four values per
    // generator thread (x, y, z, w in that order).
    int hostSeed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
    srand(hostSeed);
    for (int t = 0; t < (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block); ++t)
    {
        uint4& state = gpu->psRandomSeed->_pSysStream[0][t];
        state.x = rand();
        state.y = rand();
        state.z = rand();
        state.w = rand();
    }

    // Both random value streams start out filled with a constant.
    float randomValue = 0.0f;
    for (int r = 0; r < (int) gpu->sim.totalRandomsTimesTwo; ++r)
    {
        float4& quad = gpu->psRandom4->_pSysStream[0][r];
        float2& pair = gpu->psRandom2->_pSysStream[0][r];
        quad.x = randomValue;
        quad.y = randomValue;
        quad.z = randomValue;
        quad.w = randomValue;
        pair.x = randomValue;
        pair.y = randomValue;
    }

    gpu->psRandomSeed->Upload();
    gpu->psRandom4->Upload();
    gpu->psRandom2->Upload();
    gpu->psRandomPosition->Upload();

    // --- Linear momentum buffer ----------------------------------------------
    // Allocate and clear linear momentum buffer
    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1);
    gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
    for (int block = 0; block < (int) gpu->sim.blocks; ++block)
    {
        float4& momentum = gpu->psLinearMomentum->_pSysStream[0][block];
        momentum.x = 0.0f;
        momentum.y = 0.0f;
        momentum.z = 0.0f;
        momentum.w = 0.0f;
    }
    gpu->psLinearMomentum->Upload();

    return 1;
}
// Loads per-atom state from a whitespace-separated text file into the host
// copies of the position/charge, velocity/inverse-mass and xVector streams,
// then uploads all three streams to the device.
//
// Layout consumed by this routine:
//   first line : one word (ignored) followed by the atom count
//   per atom   : <int, ignored> x y z q vx vy vz w
// The value w is stored unchanged in psVelm4.w and is treated as an inverse
// mass when the total mass is accumulated (totalMass += 1 / w).
//
// gpu   - context whose natoms field and streams are (re)initialized
// fname - path of the file to read
//
// NOTE(review): a file that cannot be opened is not reported; natoms then
// stays 0 and the buffers are allocated for an empty system.
extern "C"
void gpuReadCoordinates(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    gpu->natoms = 0;
    char buff[512];

    // Limit the word extraction to the capacity of buff (width - 1 characters
    // plus the terminator); without this an over-long first token would write
    // past the end of the array.
    infile.width(sizeof(buff));
    infile >> buff >> gpu->natoms;
    infile.getline(buff, 511);      // discard the remainder of the header line

    float totalMass = 0.0f;
    gpuAllocateInitialBuffers(gpu);
    for (int i = 0; i < gpu->natoms; i++)
    {
        int junk;                   // leading per-atom integer, not used
        infile >> junk
               >> gpu->psPosq4->_pSysStream[0][i].x
               >> gpu->psPosq4->_pSysStream[0][i].y
               >> gpu->psPosq4->_pSysStream[0][i].z
               >> gpu->psPosq4->_pSysStream[0][i].w
               >> gpu->psVelm4->_pSysStream[0][i].x
               >> gpu->psVelm4->_pSysStream[0][i].y
               >> gpu->psVelm4->_pSysStream[0][i].z
               >> gpu->psVelm4->_pSysStream[0][i].w;

        gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].w = 0.0f;

        // Accumulate mass
        totalMass += 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
    }
    gpu->sim.inverseTotalMass = 1.0f / totalMass;

    gpu->psPosq4->Upload();
    gpu->psVelm4->Upload();
    gpu->psxVector4->Upload();
}
// Overwrites the x/y/z components of every atom in the host position stream
// with the supplied coordinates (the w component is left untouched), uploads
// the stream, and flags the Born radii for recalculation.
//
// x, y, z are indexed from 0 to gpu->natoms - 1; no size check is performed.
extern "C"
void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    float4* hostPosq = gpu->psPosq4->_pSysStream[0];
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        float4& p = hostPosq[atom];
        p.x = x[atom];
        p.y = y[atom];
        p.z = z[atom];
    }
    gpu->psPosq4->Upload();

    // Positions changed, so the Born radii have to be recomputed.
    gpu->bRecalculateBornRadii = true;
}
// Writes the supplied velocity components into the x/y/z fields of the host
// velocity stream for every atom (the w field is not modified) and uploads
// the stream to the device.
//
// x, y, z are indexed from 0 to gpu->natoms - 1; no size check is performed.
extern "C"
void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    int atom = 0;
    while (atom < gpu->natoms)
    {
        gpu->psVelm4->_pSysStream[0][atom].x = x[atom];
        gpu->psVelm4->_pSysStream[0][atom].y = y[atom];
        gpu->psVelm4->_pSysStream[0][atom].z = z[atom];
        ++atom;
    }
    gpu->psVelm4->Upload();
}
// Stores the reciprocal of each atom's mass in the w field of the host
// velocity stream, records the reciprocal of the summed mass in
// sim.inverseTotalMass, and uploads the velocity stream.
//
// mass is indexed from 0 to gpu->natoms - 1; no size check is performed.
extern "C"
void gpuSetMass(gpuContext gpu, const vector<float>& mass)
{
    float massSum = 0.0f;
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        const float m = mass[atom];
        gpu->psVelm4->_pSysStream[0][atom].w = 1.0f / m;
        massSum += m;
    }
    gpu->sim.inverseTotalMass = 1.0f / massSum;
    gpu->psVelm4->Upload();
}
// Resets the per-block random-position counters to zero, fills the per-thread
// seed table from the C library generator (seeded from gpu->seed), uploads
// both tables, refreshes the constant blocks and generates a batch of randoms.
extern "C"
void gpuInitializeRandoms(gpuContext gpu)
{
    // One position counter per block, all starting at 0.
    int* position = gpu->psRandomPosition->_pSysStream[0];
    const int blockCount = (int) gpu->sim.blocks;
    for (int block = 0; block < blockCount; ++block)
    {
        position[block] = 0;
    }

    // Derive the host generator seed from the context seed.
    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
    srand(seed);

    // One four-word seed per random-generating thread; the x, y, z, w draw
    // order determines which rand() value lands in which field.
    uint4* seeds = gpu->psRandomSeed->_pSysStream[0];
    const int generatorCount = (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block);
    for (int g = 0; g < generatorCount; ++g)
    {
        seeds[g].x = rand();
        seeds[g].y = rand();
        seeds[g].z = rand();
        seeds[g].w = rand();
    }

    gpu->psRandomPosition->Upload();
    gpu->psRandomSeed->Upload();
    gpuSetConstants(gpu);
    kGenerateRandoms(gpu);
}
extern
"C"
bool
gpuIsAvailable
()
{
int
deviceCount
;
cudaGetDeviceCount
(
&
deviceCount
);
return
(
deviceCount
>
0
);
}
// Creates a GPU context sized from a coordinate file and loads positions,
// velocities and masses from that file.
//
// Layout consumed by this routine:
//   first line : one word (ignored) followed by the atom count
//   per atom   : <int, ignored> x y z charge vx vy vz w
// The last column is inverted (1 / w) before being handed to gpuSetMass.
// The charge column is parsed but not forwarded anywhere by this function.
//
// fname - path of the file to read
// Returns the new context as an opaque pointer.
extern "C"
void* gpuInitFromFile(char* fname)
{
    ifstream infile(fname);
    int numAtoms = 0;
    char buff[512];

    // Limit the word extraction to the capacity of buff; an unbounded
    // extraction into a char array can overrun it on a long first token.
    infile.width(sizeof(buff));
    infile >> buff >> numAtoms;

    gpuContext gpu = (gpuContext) gpuInit(numAtoms);
    vector<float> x(numAtoms), y(numAtoms), z(numAtoms), charge(numAtoms),
                  vx(numAtoms), vy(numAtoms), vz(numAtoms), mass(numAtoms);
    infile.getline(buff, 511);      // discard the remainder of the header line

    for (int i = 0; i < gpu->natoms; i++)
    {
        int junk;                   // leading per-atom integer, not used
        infile >> junk
               >> x[i] >> y[i] >> z[i]
               >> charge[i]
               >> vx[i] >> vy[i] >> vz[i]
               >> mass[i];
        mass[i] = 1.0f / mass[i];   // file column is the reciprocal of the mass
    }

    gpuSetPositions(gpu, x, y, z);
    gpuSetVelocities(gpu, vx, vy, vz);
    gpuSetMass(gpu, mass);
    return (void*) gpu;
}
extern
"C"
void
*
gpuInit
(
int
numAtoms
)
{
gpuContext
gpu
=
new
_gpuContext
;
int
LRFSize
=
0
;
int
SMCount
=
0
;
int
SMMajor
=
0
;
int
SMMinor
=
0
;
// Get adapter
unsigned
int
device
=
0
;
char
*
pAdapter
;
pAdapter
=
getenv
(
"NV_FAH_DEVICE"
);
if
(
pAdapter
!=
NULL
)
{
sscanf
(
pAdapter
,
"%d"
,
&
device
);
}
cudaError_t
status
=
cudaSetDevice
(
device
);
RTERROR
(
status
,
"Error setting CUDA device"
)
// Determine which core to run on
#if 0
SYSTEM_INFO info;
GetSystemInfo(&info);
unsigned int cores = info.dwNumberOfProcessors;
if (cores > 1)
{
HANDLE hproc = GetCurrentProcess();
unsigned int core = (cores - 1) - (device % (cores - 1));
unsigned int mask = 1 << core;
SetProcessAffinityMask(hproc, mask);
}
#endif
// Determine kernel call configuration
cudaDeviceProp
deviceProp
;
cudaGetDeviceProperties
(
&
deviceProp
,
0
);
// Determine SM version
if
(
deviceProp
.
major
==
1
)
{
switch
(
deviceProp
.
minor
)
{
case
0
:
case
1
:
gpu
->
sm_version
=
SM_10
;
gpu
->
sim
.
workUnitsPerSM
=
G8X_NONBOND_WORKUNITS_PER_SM
;
break
;
default:
gpu
->
sm_version
=
SM_12
;
gpu
->
sim
.
workUnitsPerSM
=
GT2XX_NONBOND_WORKUNITS_PER_SM
;
break
;
}
}
gpu
->
sim
.
nonbond_blocks
=
deviceProp
.
multiProcessorCount
;
gpu
->
sim
.
bornForce2_blocks
=
deviceProp
.
multiProcessorCount
;
gpu
->
sim
.
blocks
=
deviceProp
.
multiProcessorCount
;
if
(
deviceProp
.
regsPerBlock
==
8192
)
{
gpu
->
sim
.
nonbond_threads_per_block
=
G8X_NONBOND_THREADS_PER_BLOCK
;
gpu
->
sim
.
bornForce2_threads_per_block
=
G8X_BORNFORCE2_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_shake_threads_per_block
=
G8X_SHAKE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_update_threads_per_block
=
G8X_UPDATE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_localForces_threads_per_block
=
G8X_LOCALFORCES_THREADS_PER_BLOCK
;
gpu
->
sim
.
threads_per_block
=
G8X_THREADS_PER_BLOCK
;
gpu
->
sim
.
random_threads_per_block
=
G8X_RANDOM_THREADS_PER_BLOCK
;
}
else
{
gpu
->
sim
.
nonbond_threads_per_block
=
GT2XX_NONBOND_THREADS_PER_BLOCK
;
gpu
->
sim
.
bornForce2_threads_per_block
=
GT2XX_BORNFORCE2_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_shake_threads_per_block
=
GT2XX_SHAKE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_update_threads_per_block
=
GT2XX_UPDATE_THREADS_PER_BLOCK
;
gpu
->
sim
.
max_localForces_threads_per_block
=
GT2XX_LOCALFORCES_THREADS_PER_BLOCK
;
gpu
->
sim
.
threads_per_block
=
GT2XX_NONBOND_THREADS_PER_BLOCK
;
gpu
->
sim
.
random_threads_per_block
=
GT2XX_RANDOM_THREADS_PER_BLOCK
;
}
gpu
->
sim
.
shake_threads_per_block
=
gpu
->
sim
.
max_shake_threads_per_block
;
gpu
->
sim
.
localForces_threads_per_block
=
gpu
->
sim
.
max_localForces_threads_per_block
;
gpu
->
natoms
=
numAtoms
;
gpuAllocateInitialBuffers
(
gpu
);
for
(
int
i
=
0
;
i
<
gpu
->
natoms
;
i
++
)
{
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
x
=
0.0
f
;
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
y
=
0.0
f
;
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
z
=
0.0
f
;
gpu
->
psxVector4
->
_pSysStream
[
0
][
i
].
w
=
0.0
f
;
}
gpu
->
psxVector4
->
Upload
();
gpu
->
iterations
=
0
;
gpu
->
sim
.
update_threads_per_block
=
(
gpu
->
natoms
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
if
(
gpu
->
sim
.
update_threads_per_block
>
gpu
->
sim
.
max_update_threads_per_block
)
gpu
->
sim
.
update_threads_per_block
=
gpu
->
sim
.
max_update_threads_per_block
;
if
(
gpu
->
sim
.
update_threads_per_block
<
1
)
gpu
->
sim
.
update_threads_per_block
=
1
;
gpu
->
sim
.
bf_reduce_threads_per_block
=
gpu
->
sim
.
update_threads_per_block
;
gpu
->
sim
.
bsf_reduce_threads_per_block
=
(
gpu
->
sim
.
stride4
+
gpu
->
natoms
+
gpu
->
sim
.
blocks
-
1
)
/
gpu
->
sim
.
blocks
;
gpu
->
sim
.
bsf_reduce_threads_per_block
=
((
gpu
->
sim
.
bsf_reduce_threads_per_block
+
(
GRID
-
1
))
/
GRID
)
*
GRID
;
if
(
gpu
->
sim
.
bsf_reduce_threads_per_block
>
gpu
->
sim
.
threads_per_block
)
gpu
->
sim
.
bsf_reduce_threads_per_block
=
gpu
->
sim
.
threads_per_block
;
if
(
gpu
->
sim
.
bsf_reduce_threads_per_block
<
1
)
gpu
->
sim
.
bsf_reduce_threads_per_block
=
1
;
// Initialize constants to reasonable values
gpu
->
sim
.
probeRadius
=
probeRadius
;
gpu
->
sim
.
surfaceAreaFactor
=
surfaceAreaFactor
;
gpu
->
sim
.
electricConstant
=
electricConstant
;
gpu
->
sim
.
bigFloat
=
99999999.0
f
;
gpu
->
sim
.
forceConversionFactor
=
forceConversionFactor
;
gpu
->
sim
.
preFactor
=
2.0
f
*
electricConstant
*
((
1.0
f
/
defaultInnerDielectric
)
-
(
1.0
f
/
defaultSolventDielectric
))
*
gpu
->
sim
.
forceConversionFactor
;
gpu
->
sim
.
dielectricOffset
=
dielectricOffset
;
gpu
->
sim
.
alphaOBC
=
alphaOBC
;
gpu
->
sim
.
betaOBC
=
betaOBC
;
gpu
->
sim
.
gammaOBC
=
gammaOBC
;
gpuSetIntegrationParameters
(
gpu
,
1.0
f
,
2.0e-3
f
,
300.0
f
);
gpu
->
sim
.
maxShakeIterations
=
15
;
gpu
->
sim
.
shakeTolerance
=
1.0e-04
f
*
2.0
f
;
gpu
->
sim
.
InvMassJ
=
9.920635e-001
f
;
gpu
->
grid
=
GRID
;
gpu
->
bCalculateCM
=
false
;
gpu
->
bRemoveCM
=
false
;
gpu
->
bRecalculateBornRadii
=
true
;
gpuInitializeRandoms
(
gpu
);
// To be determined later
gpu
->
psLJ14ID
=
NULL
;
gpu
->
psForce4
=
NULL
;
gpu
->
sim
.
pForce4
=
NULL
;
gpu
->
sim
.
pForce4a
=
NULL
;
gpu
->
sim
.
pForce4b
=
NULL
;
gpu
->
psBornForce
=
NULL
;
gpu
->
sim
.
pBornForce
=
NULL
;
gpu
->
psBornSum
=
NULL
;
gpu
->
sim
.
pBornSum
=
NULL
;
gpu
->
psBondID
=
NULL
;
gpu
->
psBondParameter
=
NULL
;
gpu
->
psBondAngleID1
=
NULL
;
gpu
->
psBondAngleID2
=
NULL
;
gpu
->
psBondAngleParameter
=
NULL
;
gpu
->
psDihedralID1
=
NULL
;
gpu
->
psDihedralID2
=
NULL
;
gpu
->
psDihedralParameter
=
NULL
;
gpu
->
psRbDihedralID1
=
NULL
;
gpu
->
psRbDihedralID2
=
NULL
;
gpu
->
psRbDihedralParameter1
=
NULL
;
gpu
->
psRbDihedralParameter2
=
NULL
;
gpu
->
psLJ14ID
=
NULL
;
gpu
->
psLJ14Parameter
=
NULL
;
gpu
->
psShakeID
=
NULL
;
gpu
->
psShakeParameter
=
NULL
;
gpu
->
psExclusion
=
NULL
;
gpu
->
psWorkUnit
=
NULL
;
// Initialize output buffer before reading parameters
gpu
->
pOutputBufferCounter
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
];
memset
(
gpu
->
pOutputBufferCounter
,
0
,
gpu
->
sim
.
paddedNumberOfAtoms
*
sizeof
(
unsigned
int
));
// Initialize exclusion array
gpu
->
pExclusion
=
new
unsigned
int
[
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
paddedNumberOfAtoms
];
for
(
unsigned
int
i
=
0
;
i
<
gpu
->
sim
.
paddedNumberOfAtoms
*
gpu
->
sim
.
paddedNumberOfAtoms
;
i
++
)
gpu
->
pExclusion
[
i
]
=
1
;
return
(
void
*
)
gpu
;
}
// Stores the time step, tau and temperature in gpu->sim and derives from them
// the coefficients held in the sim block: GDT = deltaT / tau, its
// exponentials (EPH, EMH, EP, EM), the B/C/D terms, and the V, X, Yv, Yx
// amplitudes scaled by kT = BOLTZ * temperature.
//
// B, C and D are evaluated from closed-form expressions when GDT >= 0.1 and
// from polynomial expansions in GDT / 2 otherwise.
extern "C"
void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature)
{
    // Step size, tau and their ratio.
    gpu->sim.deltaT = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f / deltaT;
    gpu->sim.tau = tau;
    gpu->sim.GDT = gpu->sim.deltaT / gpu->sim.tau;

    // exp(+-GDT/2) and exp(+-GDT).
    gpu->sim.EPH = exp(0.5f * gpu->sim.GDT);
    gpu->sim.EMH = exp(-0.5f * gpu->sim.GDT);
    gpu->sim.EP = exp(gpu->sim.GDT);
    gpu->sim.EM = exp(-gpu->sim.GDT);
    gpu->sim.OneMinusEM = 1.0f - gpu->sim.EM;
    gpu->sim.TauOneMinusEM = gpu->sim.tau * gpu->sim.OneMinusEM;

    if (!(gpu->sim.GDT >= 0.1f))
    {
        // Small GDT: polynomial expansions in h = GDT / 2.
        const float h = 0.5f * gpu->sim.GDT;
        const float h2 = h * h;
        const float h4 = h2 * h2;

        const float oneThird = 1.0f / 3.0f;
        const float sevenNinths = 7.0f / 9.0f;
        const float oneTwelfth = 1.0f / 12.0f;
        const float c17over90 = 17.0f / 90.0f;
        const float c7over30 = 7.0f / 30.0f;
        const float c31over1260 = 31.0f / 1260.0f;
        const float c1over360 = 1.0f / 360.0f;

        gpu->sim.B = h4 * (oneThird + h * (oneThird + h * (c17over90 + h * sevenNinths)));
        gpu->sim.C = h2 * h * (2.0f * oneThird + h * (-0.5f + h * (c7over30 + h * (-oneTwelfth + h * c31over1260))));
        gpu->sim.D = h2 * (-1.0f + h2 * (-oneTwelfth - h2 * c1over360));
    }
    else
    {
        // Closed-form expressions.
        const float ephMinusOne = gpu->sim.EPH - 1.0f;
        const float ephMinusOneSq = ephMinusOne * ephMinusOne;

        gpu->sim.B = gpu->sim.GDT * (gpu->sim.EP - 1.0f) - 4.0f * ephMinusOneSq;
        gpu->sim.C = gpu->sim.GDT - 3.0f + 4.0f * gpu->sim.EMH - gpu->sim.EM;
        gpu->sim.D = 2.0f - gpu->sim.EPH - gpu->sim.EMH;
    }

    // Combinations of the terms above.
    gpu->sim.TauDOverEMMinusOne = gpu->sim.tau * gpu->sim.D / (gpu->sim.EM - 1.0f);
    gpu->sim.DOverTauC = gpu->sim.D / (gpu->sim.tau * gpu->sim.C);
    gpu->sim.fix1 = gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH);
    gpu->sim.oneOverFix1 = 1.0f / (gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH));

    // Temperature-dependent amplitudes.
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;
    gpu->sim.V = sqrt(gpu->sim.kT * (1.0f - gpu->sim.EM));
    gpu->sim.X = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.C);
    gpu->sim.Yv = sqrt(gpu->sim.kT * gpu->sim.B / gpu->sim.C);
    gpu->sim.Yx = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.B / (1.0f - gpu->sim.EM));
}
// Records the time step and its reciprocal in the simulation block.
extern "C"
void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT)
{
    gpu->sim.oneOverDeltaT = 1.0f / deltaT;
    gpu->sim.deltaT = deltaT;
}
// Stores the time step, tau and temperature in the simulation block and
// derives GDT = deltaT * tau, kT = BOLTZ * temperature, and a common
// amplitude sqrt(2 * kT * deltaT * tau) written to both Yx and Yv.
extern "C"
void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature)
{
    // Step size and tau.
    gpu->sim.deltaT = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f / deltaT;
    gpu->sim.tau = tau;
    gpu->sim.GDT = gpu->sim.deltaT * gpu->sim.tau;

    // Thermal energy.
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;

    // Yx is assigned first and Yv takes the value stored in Yx.
    gpu->sim.Yx = sqrt(2.0f * gpu->sim.kT * deltaT * tau);
    gpu->sim.Yv = gpu->sim.Yx;
}
// Stores the thermostat temperature (and kT = BOLTZ * temperature) and the
// collision probability in the simulation block, and sets the Yv, Yx, V and X
// coefficients to 1.
extern "C"
void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability)
{
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;
    gpu->sim.collisionProbability = collisionProbability;

    // Each pair is assigned right-to-left: the second member first, then the
    // first member takes the stored value.
    gpu->sim.Yx = 1.0f;
    gpu->sim.Yv = gpu->sim.Yx;
    gpu->sim.X = 1.0f;
    gpu->sim.V = gpu->sim.X;
}
// Releases everything owned by the context: the host-side arrays, every
// CUDAStream object, and finally the context itself. The context pointer must
// not be used after this call.
extern "C"
void gpuShutDown(gpuContext gpu)
{
    // Host arrays allocated with new[].
    delete[] gpu->pOutputBufferCounter;
    delete[] gpu->pExclusion;
    delete[] gpu->gpAtomTable;
    delete[] gpu->pAtomSymbol;

    // Per-atom state streams.
    delete gpu->psPosq4;
    delete gpu->psPosqP4;
    delete gpu->psOldPosq4;
    delete gpu->psVelm4;
    delete gpu->psForce4;
    delete gpu->psxVector4;
    delete gpu->psvVector4;

    // Nonbonded / OBC parameter and accumulator streams.
    delete gpu->psSigEps2;
    delete gpu->psObcData;
    delete gpu->psObcChain;
    delete gpu->psBornForce;
    delete gpu->psBornRadii;
    delete gpu->psBornSum;

    // Bonded-term streams.
    delete gpu->psBondID;
    delete gpu->psBondParameter;
    delete gpu->psBondAngleID1;
    delete gpu->psBondAngleID2;
    delete gpu->psBondAngleParameter;
    delete gpu->psDihedralID1;
    delete gpu->psDihedralID2;
    delete gpu->psDihedralParameter;
    delete gpu->psRbDihedralID1;
    delete gpu->psRbDihedralID2;
    delete gpu->psRbDihedralParameter1;
    delete gpu->psRbDihedralParameter2;
    delete gpu->psLJ14ID;
    delete gpu->psLJ14Parameter;

    // Constraint, exclusion and work-list streams.
    delete gpu->psShakeID;
    delete gpu->psShakeParameter;
    delete gpu->psExclusion;
    delete gpu->psWorkUnit;

    // Random-number and momentum streams.
    delete gpu->psRandom4;
    delete gpu->psRandom2;
    delete gpu->psRandomPosition;
    delete gpu->psRandomSeed;
    delete gpu->psLinearMomentum;

    // Wrap up
    delete gpu;
}
// Sizes and allocates the force, Born-force and Born-sum output streams,
// computes the cumulative parameter offsets of the bonded interaction types,
// derives the thread count for the local-forces kernel, and mirrors every
// output-buffer index stored in the bonded ID streams (index -> last - index)
// before uploading those ID streams.
//
// Always returns 1.
extern "C"
int gpuBuildOutputBuffers(gpuContext gpu)
{
    // Number of force output buffers: the nonbond total, raised to the
    // largest per-atom counter if any counter exceeds it.
    const unsigned int* counters = gpu->pOutputBufferCounter;
    unsigned int outputBuffers = gpu->sim.totalNonbondOutputBuffers;
    for (unsigned int atom = 0; atom < gpu->sim.paddedNumberOfAtoms; ++atom)
    {
        if (counters[atom] > outputBuffers)
        {
            outputBuffers = counters[atom];
        }
    }
    gpu->sim.outputBuffers = outputBuffers;

    // Allocate the output streams and publish their device pointers.
    gpu->psForce4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers);
    gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
    gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
    gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0];
    gpu->sim.pForce4a = gpu->sim.pForce4;
    gpu->sim.pForce4b = gpu->sim.pForce4 + gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
    gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
    gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0];

    // Determine local energy parameter offsets for bonded interactions; each
    // offset is the previous one plus the stride of the next parameter stream.
    gpu->sim.bond_offset = gpu->psBondParameter->_stride;
    gpu->sim.bond_angle_offset = gpu->sim.bond_offset + gpu->psBondAngleParameter->_stride;
    gpu->sim.dihedral_offset = gpu->sim.bond_angle_offset + gpu->psDihedralParameter->_stride;
    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset + gpu->psRbDihedralParameter1->_stride;
    gpu->sim.LJ14_offset = gpu->sim.rb_dihedral_offset + gpu->psLJ14Parameter->_stride;

    // Threads per block for local forces: work per block plus 15 with the low
    // four bits cleared, clamped to [1, max_localForces_threads_per_block].
    gpu->sim.localForces_threads_per_block = (gpu->sim.LJ14_offset / gpu->sim.blocks + 15) & 0xfffffff0;
    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
    if (gpu->sim.localForces_threads_per_block < 1)
        gpu->sim.localForces_threads_per_block = 1;

    // Flip local force output buffers
    int flip = outputBuffers - 1;

    for (int b = 0; b < (int) gpu->sim.bonds; ++b)
    {
        gpu->psBondID->_pSysStream[0][b].z = flip - gpu->psBondID->_pSysStream[0][b].z;
        gpu->psBondID->_pSysStream[0][b].w = flip - gpu->psBondID->_pSysStream[0][b].w;
    }

    for (int a = 0; a < (int) gpu->sim.bond_angles; ++a)
    {
        gpu->psBondAngleID1->_pSysStream[0][a].w = flip - gpu->psBondAngleID1->_pSysStream[0][a].w;
        gpu->psBondAngleID2->_pSysStream[0][a].x = flip - gpu->psBondAngleID2->_pSysStream[0][a].x;
        gpu->psBondAngleID2->_pSysStream[0][a].y = flip - gpu->psBondAngleID2->_pSysStream[0][a].y;
    }

    for (int d = 0; d < (int) gpu->sim.dihedrals; ++d)
    {
        gpu->psDihedralID2->_pSysStream[0][d].x = flip - gpu->psDihedralID2->_pSysStream[0][d].x;
        gpu->psDihedralID2->_pSysStream[0][d].y = flip - gpu->psDihedralID2->_pSysStream[0][d].y;
        gpu->psDihedralID2->_pSysStream[0][d].z = flip - gpu->psDihedralID2->_pSysStream[0][d].z;
        gpu->psDihedralID2->_pSysStream[0][d].w = flip - gpu->psDihedralID2->_pSysStream[0][d].w;
    }

    for (int r = 0; r < (int) gpu->sim.rb_dihedrals; ++r)
    {
        gpu->psRbDihedralID2->_pSysStream[0][r].x = flip - gpu->psRbDihedralID2->_pSysStream[0][r].x;
        gpu->psRbDihedralID2->_pSysStream[0][r].y = flip - gpu->psRbDihedralID2->_pSysStream[0][r].y;
        gpu->psRbDihedralID2->_pSysStream[0][r].z = flip - gpu->psRbDihedralID2->_pSysStream[0][r].z;
        gpu->psRbDihedralID2->_pSysStream[0][r].w = flip - gpu->psRbDihedralID2->_pSysStream[0][r].w;
    }

    for (int p = 0; p < (int) gpu->sim.LJ14s; ++p)
    {
        gpu->psLJ14ID->_pSysStream[0][p].z = flip - gpu->psLJ14ID->_pSysStream[0][p].z;
        gpu->psLJ14ID->_pSysStream[0][p].w = flip - gpu->psLJ14ID->_pSysStream[0][p].w;
    }

    // Push the modified ID streams to the device.
    gpu->psBondID->Upload();
    gpu->psBondAngleID1->Upload();
    gpu->psBondAngleID2->Upload();
    gpu->psDihedralID2->Upload();
    gpu->psRbDihedralID2->Upload();
    gpu->psLJ14ID->Upload();

    return 1;
}
// Builds the list of grid x grid tile pairs ("work units") covering the upper
// triangle (x >= y) of the padded atom-atom matrix, balances block and thread
// counts for the nonbond and bornForce2 kernels against that list, uploads it
// and refreshes the constant blocks.
//
// Each work unit is encoded as (x << 17) | (y << 2), with bit 0 set when at
// least one entry of the host exclusion matrix inside the tile is zero.
//
// Returns the number of work units.
extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu)
{
    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int grid = gpu->grid;
    const unsigned int dim = (atoms + (grid - 1)) / grid;   // tiles per side
    const unsigned int cells = dim * (dim + 1) / 2;         // tiles with x >= y
    const unsigned int* pExclusion = gpu->pExclusion;

    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
    unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
    gpu->psWorkUnit = psWorkUnit;
    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
    gpu->sim.nonbond_workBlock = gpu->sim.nonbond_threads_per_block / GRID;
    gpu->sim.bornForce2_workBlock = gpu->sim.bornForce2_threads_per_block / GRID;
    gpu->sim.workUnits = cells;

    // Increase block count if necessary for extra large molecules that would
    // otherwise overflow the SM workunit buffers. The block count is raised
    // to a multiple of its current value.
    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
    {
        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
    }
    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
    {
        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
    }

    // Even share of work units per block, plus the leftover.
    gpu->sim.nbWorkUnitsPerBlock = cells / gpu->sim.nonbond_blocks;
    gpu->sim.nbWorkUnitsPerBlockRemainder = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
    gpu->sim.bf2WorkUnitsPerBlock = cells / gpu->sim.bornForce2_blocks;
    gpu->sim.bf2WorkUnitsPerBlockRemainder = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;

    // Decrease thread count for extra small molecules to spread computation
    // across entire chip
    int activeWorkUnits = gpu->sim.nonbond_blocks * gpu->sim.nonbond_workBlock;
    if (activeWorkUnits > (int) cells)
    {
        int balancedWorkBlock = (cells + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks;
        gpu->sim.nonbond_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.nonbond_workBlock = balancedWorkBlock;
    }
    activeWorkUnits = gpu->sim.bornForce2_blocks * gpu->sim.bornForce2_workBlock;
    if (activeWorkUnits > (int) cells)
    {
        int balancedWorkBlock = (cells + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks;
        gpu->sim.bornForce2_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.bornForce2_workBlock = balancedWorkBlock;
    }

    // Emit one work unit per tile, row by row.
    unsigned int next = 0;
    for (unsigned int tileY = 0; tileY < dim; ++tileY)
    {
        const unsigned int rowBegin = tileY * grid;
        const unsigned int rowEnd = tileY * grid + grid;
        for (unsigned int tileX = tileY; tileX < dim; ++tileX)
        {
            const unsigned int colBegin = tileX * grid;
            const unsigned int colEnd = tileX * grid + grid;
            unsigned int unit = (tileX << 17) | (tileY << 2);

            // Check for exclusions: count zero entries inside this tile.
            int excluded = 0;
            for (unsigned int row = rowBegin; row < rowEnd; ++row)
            {
                for (unsigned int col = colBegin; col < colEnd; ++col)
                {
                    if (!pExclusion[row * atoms + col])
                    {
                        excluded++;
                    }
                }
            }

            // Signal exclusions if they exist
            if (excluded > 0)
                unit |= 0x1;

            pWorkList[next] = unit;
            ++next;
        }
    }

    psWorkUnit->Upload();
    gpuSetConstants(gpu);
    return cells;
}
// Packs the host exclusion matrix into per-tile bit masks and uploads them.
//
// The padded atom-atom matrix is walked tile by tile (grid x grid). For each
// row x1 of a tile one 32-bit mask is produced: the mask is shifted right once
// per column y1 and bit 31 is set when pExclusion[x1 * atoms + y1] is
// non-zero. Masks are written consecutively to the exclusion stream.
//
// Returns the number of zero entries of the matrix that have x1 >= y1.
extern "C"
int gpuBuildExclusionList(gpuContext gpu)
{
    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int grid = gpu->grid;

    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u);
    gpu->psExclusion = psExclusion;
    gpu->sim.pExclusion = psExclusion->_pDevStream[0];
    unsigned int* pExList = psExclusion->_pSysStream[0];

    int exclusions = 0;
    unsigned int pos = 0;
    for (unsigned int tileX = 0; tileX < atoms; tileX += grid)
    {
        for (unsigned int tileY = 0; tileY < atoms; tileY += grid)
        {
            for (unsigned int x1 = tileX; x1 < tileX + grid; ++x1)
            {
                unsigned int mask = 0;
                for (unsigned int y1 = tileY; y1 < tileY + grid; ++y1)
                {
                    mask >>= 1;
                    if (gpu->pExclusion[x1 * atoms + y1] != 0)
                    {
                        mask |= 0x80000000;
                    }
                    else if (x1 >= y1)
                    {
                        exclusions++;
                    }
                }
                pExList[pos] = mask;
                ++pos;
            }
        }
    }

    psExclusion->Upload();
    gpuSetConstants(gpu);
    return exclusions;
}
// Calls the Set...Sim routine of every kernel module with the context, and
// additionally the "_12" variants when the device's SM version is at least
// SM_12.
//
// Always returns 1.
extern "C"
int gpuSetConstants(gpuContext gpu)
{
    // Modules used on every device.
    SetCalculateCDLJForcesSim(gpu);
    SetCalculateCDLJObcGbsaForces1Sim(gpu);
    SetCalculateLocalForcesSim(gpu);
    SetCalculateObcGbsaBornSumSim(gpu);
    SetCalculateObcGbsaForces1Sim(gpu);
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
    SetForcesSim(gpu);
    SetUpdateShakeHSim(gpu);
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
    SetRandomSim(gpu);

    if (gpu->sm_version < SM_12)
    {
        return 1;
    }

    // SM 1.2+ specific modules.
    SetCalculateCDLJForces_12Sim(gpu);
    SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
    SetCalculateObcGbsaForces1_12Sim(gpu);
    SetCalculateObcGbsaForces2_12Sim(gpu);
    return 1;
}
// Downloads the position and velocity streams and prints, for every atom, its
// index followed by the four position-stream components and the four
// velocity-stream components.
extern "C"
void gpuDumpCoordinates(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psVelm4->Download();

    (void) printf("\n\nCoordinates and velocities\n");

    const float4* posq = gpu->psPosq4->_pSysStream[0];
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        const float4& p = posq[atom];
        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               atom,
               p.x, p.y, p.z, p.w,
               gpu->psVelm4->_pSysStream[0][atom].x,
               gpu->psVelm4->_pSysStream[0][atom].y,
               gpu->psVelm4->_pSysStream[0][atom].z,
               gpu->psVelm4->_pSysStream[0][atom].w);
    }
}
// Returns true when f is NaN, i.e. when it does not compare equal to itself.
bool ISNAN(float f)
{
    return f != f;
}
// Downloads positions, velocities, forces and Born forces and scans every
// atom for NaN components.
//
// Returns true when no NaN is found. Otherwise each offending atom is
// printed, followed by the violation count, every atom's closest neighbour
// and its distance, the atom data, and then the forces obtained by re-running
// the individual force kernels one group at a time; false is returned.
extern "C"
bool gpuCheckData(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psVelm4->Download();
    gpu->psForce4->Download();
    gpu->psBornForce->Download();

    // Pass 1: report every atom that carries a NaN.
    int violations = 0;
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        const float4 pos = gpu->psPosq4->_pSysStream[0][atom];
        const float4 frc = gpu->psForce4->_pSysStream[0][atom];
        const float born = gpu->psBornForce->_pSysStream[0][atom];

        const bool bad =
            ISNAN(pos.x) || ISNAN(pos.y) || ISNAN(pos.z) ||
            ISNAN(gpu->psVelm4->_pSysStream[0][atom].x) ||
            ISNAN(gpu->psVelm4->_pSysStream[0][atom].y) ||
            ISNAN(gpu->psVelm4->_pSysStream[0][atom].z) ||
            ISNAN(frc.x) || ISNAN(frc.y) || ISNAN(frc.z) ||
            ISNAN(born);
        if (!bad)
        {
            continue;
        }

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               atom,
               pos.x, pos.y, pos.z,
               gpu->psVelm4->_pSysStream[0][atom].x,
               gpu->psVelm4->_pSysStream[0][atom].y,
               gpu->psVelm4->_pSysStream[0][atom].z,
               frc.x, frc.y, frc.z,
               born);
        violations++;
    }

    if (violations == 0)
    {
        return true;
    }

    printf("%d total violations\n", violations);

    // Pass 2: for every atom, find the nearest other atom.
    const float4* posq = gpu->psPosq4->_pSysStream[0];
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        float dmin = 99999999.0f;
        int closest = -9999;
        const float x = posq[atom].x;
        const float y = posq[atom].y;
        const float z = posq[atom].z;
        for (int other = 0; other < gpu->natoms; ++other)
        {
            if (other == atom)
            {
                continue;
            }
            float dx = posq[other].x - x;
            float dy = posq[other].y - y;
            float dz = posq[other].z - z;
            float r = sqrt(dx * dx + dy * dy + dz * dz);
            if (r < dmin)
            {
                dmin = r;
                closest = other;
            }
        }
        printf("Atom %4d: Closest neighbor is Atom %4d, %11.5e\n", atom, closest, dmin);
    }

    gpuDumpAtomData(gpu);
    kClearBornForces(gpu);
    kClearForces(gpu);
    kCPUCalculateLocalForces(gpu);

    // Determine which forces have gone awry: recompute each force group on a
    // cleared buffer and dump the result.
    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateCDLJForces(gpu);
    kReduceForces(gpu);
    printf("Nonbond Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateObcGbsaForces1(gpu);
    kReduceObcGbsaBornForces(gpu);
    kCalculateObcGbsaForces2(gpu);
    kReduceForces(gpu);
    printf("OBC Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateLocalForces(gpu);
    kReduceForces(gpu);
    printf("Local Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kReduceForces(gpu);
    printf("Cleared Forces\n");
    gpuDumpForces(gpu);

    return false;
}
// Host-side evaluation of the 1-4 pair terms. After downloading positions and
// forces, each pair's force is computed from its stored parameters, added to
// (atom x) and subtracted from (atom y) in the host copy of the force stream,
// and a diagnostic line is printed per pair.
//
// The pair ID and parameter streams are read from their host copies as they
// currently are; they are not downloaded here, and the modified force stream
// is not uploaded.
extern "C"
void kCPUCalculate14(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psForce4->Download();

    const float4* posq = gpu->psPosq4->_pSysStream[0];
    float4* force = gpu->psForce4->_pSysStream[0];

    for (int pair = 0; pair < (int) gpu->sim.LJ14s; ++pair)
    {
        int4 atom = gpu->psLJ14ID->_pSysStream[0][pair];
        float4 LJ14 = gpu->psLJ14Parameter->_pSysStream[0][pair];
        float4 a1 = posq[atom.x];
        float4 a2 = posq[atom.y];

        // Separation vector and squared distance.
        float dx = a1.x - a2.x;
        float dy = a1.y - a2.y;
        float dz = a1.z - a2.z;
        float r2 = dx * dx + dy * dy + dz * dz;
        float inverseR = 1.0f / sqrt(r2);

        // (LJ14.y / r)^2 and its cube.
        float sigOverR = inverseR * LJ14.y;
        float sig2 = sigOverR * sigOverR;
        float sig6 = sig2 * sig2 * sig2;

        // Lennard-Jones part scaled by LJ14.x, plus the LJ14.z / r term, all
        // divided by r^2 so it can scale the separation vector directly.
        float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
        dEdR += LJ14.z * inverseR;
        dEdR *= inverseR * inverseR;

        // Output slots: atom index plus buffer index times the stride.
        unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
        unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
        float4 forceA = force[offsetA];
        float4 forceB = force[offsetB];

        dx *= dEdR;
        dy *= dEdR;
        dz *= dEdR;

        forceA.x += dx;
        forceA.y += dy;
        forceA.z += dz;
        forceB.x -= dx;
        forceB.y -= dy;
        forceB.z -= dz;

        force[offsetA] = forceA;
        force[offsetB] = forceB;

        printf("%4d: %4d - %4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               pair, atom.x, atom.y, r2, dEdR, sig2, sig6, LJ14.x, LJ14.z);
    }
}
// Downloads the psPosqP4 stream and prints, for every atom, its index and the
// four components stored for it.
extern "C"
void gpuDumpPrimeCoordinates(gpuContext gpu)
{
    gpu->psPosqP4->Download();

    int atom = 0;
    while (atom < gpu->natoms)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n",
               atom,
               gpu->psPosqP4->_pSysStream[0][atom].x,
               gpu->psPosqP4->_pSysStream[0][atom].y,
               gpu->psPosqP4->_pSysStream[0][atom].z,
               gpu->psPosqP4->_pSysStream[0][atom].w);
        ++atom;
    }
}
// Downloads the force and Born-force streams and prints, for every atom, its
// index, the x/y/z force components and the Born force.
//
// The lines were previously formatted into a local buffer whose only consumer
// (a Windows debugger call) had been commented out, so nothing was emitted.
// They are now written to stdout like the other dump routines in this file.
extern "C"
void gpuDumpForces(gpuContext gpu)
{
    gpu->psForce4->Download();
    gpu->psBornForce->Download();

    const float4* force = gpu->psForce4->_pSysStream[0];
    const float* bornForce = gpu->psBornForce->_pSysStream[0];
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n",
               i,
               force[i].x,
               force[i].y,
               force[i].z,
               bornForce[i]);
    }
}
// Downloads the position/charge, SigEps, Born-radii and ObcChain streams and
// prints, for every atom, its index followed by eight values: the four
// position-stream components, the two SigEps components, the Born radius and
// the ObcChain entry.
//
// Two fixes relative to the previous version:
//  - the format string had seven float conversions for eight float arguments,
//    so the ObcChain value was never formatted;
//  - the line was formatted into a local buffer whose only consumer had been
//    commented out, so nothing was emitted. Lines now go to stdout like the
//    other dump routines in this file.
extern "C"
void gpuDumpAtomData(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psSigEps2->Download();
    gpu->psBornRadii->Download();
    gpu->psObcChain->Download();

    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               i,
               gpu->psPosq4->_pSysStream[0][i].x,
               gpu->psPosq4->_pSysStream[0][i].y,
               gpu->psPosq4->_pSysStream[0][i].z,
               gpu->psPosq4->_pSysStream[0][i].w,
               gpu->psSigEps2->_pSysStream[0][i].x,
               gpu->psSigEps2->_pSysStream[0][i].y,
               gpu->psBornRadii->_pSysStream[0][i],
               gpu->psObcChain->_pSysStream[0][i]);
    }
}
// Full stand-alone setup of a context from the parameter files under Data/:
// reads every parameter set (echoing the value each reader returns), builds
// the work list, exclusion list and output buffers, refreshes the constant
// blocks, seeds the random streams, and runs the Born-sum kernels once before
// clearing the force buffers.
//
// pVoid - the context, passed as an opaque pointer.
extern "C"
void gpuSetup(void* pVoid)
{
    gpuContext gpu = (gpuContext) pVoid;

    // Read parameters
    cout << gpuReadAtomicParameters(gpu, "Data/atomicradii.txt")
         << " atom types\n";
    cout << gpuReadBondParameters(gpu, "Data/GromacsHarmonicBondParameter.txt")
         << " bond parameters.\n";
    cout << gpuReadBondAngleParameters(gpu, "Data/GromacsAngleBondParameter.txt")
         << " bond angle parameters.\n";
    cout << gpuReadDihedralParameters(gpu, "Data/GromacsProperDihedralParameter.txt")
         << " proper dihedral parameters.\n";
    cout << gpuReadRbDihedralParameters(gpu, "Data/GromacsRbDihedralParameter.txt")
         << " Ryckaert-Bellemans dihedral parameters.\n";
    cout << gpuReadLJ14Parameters(gpu, "Data/GromacsLJ14Parameter.txt")
         << " Lennard-Jones 1-4 parameters.\n";
    cout << gpuReadCoulombParameters(gpu, "Data/GromacsLJCoulombParameter.txt")
         << " Coulomb parameters.\n";
    cout << gpuReadShakeParameters(gpu, "Data/GromacsShakeParameters.txt")
         << " shake parameters.\n";

    // Build thread block work list
    gpuBuildThreadBlockWorkList(gpu);

    // Build exclusion list
    gpuBuildExclusionList(gpu);

    // Create output buffers
    gpuBuildOutputBuffers(gpu);

    // Set constant blocks
    gpuSetConstants(gpu);

    // Initialize randoms
    gpuInitializeRandoms(gpu);

    // Initialize Born radii, then start from cleared force buffers.
    kCalculateObcGbsaBornSum(gpu);
    kReduceObcGbsaBornSum(gpu);
    kClearForces(gpu);
    kClearBornForces(gpu);
}
// Dot product of the .x/.y/.z members of two vector-like operands.
// Arguments are parenthesized so expression operands (e.g. *ptr) expand
// correctly; each argument is evaluated three times, so avoid side effects.
#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)
// Stores in dp the dot product of v1 and v2 divided by the product of their
// lengths, clamped to [-1, 1]. dp must be an assignable float-like lvalue.
// No guard against zero-length inputs (the division is unconditional).
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
    dp = DOT3(v1, v2); \
    float lengthSquared1 = DOT3(v1, v1); \
    float lengthSquared2 = DOT3(v2, v2); \
    dp /= sqrt(lengthSquared1 * lengthSquared2); \
    dp = max(min(dp, 1.0f), -1.0f); \
}
// c = v1 x v2, component by component.
// The three assignments are wrapped in a block (matching the other macros
// here) so that `if (cond) CROSS_PRODUCT(a, b, c);` guards all of them rather
// than only the first. c is written before v1/v2 are fully read, so c must
// not alias either input.
#define CROSS_PRODUCT(v1, v2, c) \
{ \
    (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
    (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
    (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
}
// Given the cosine of an angle and a parameter pair, stores in dEdR
//   param.y * (acos(cosine) - param.x * pi/180).
// param.x is therefore converted from degrees to radians before the
// subtraction; cosine is passed to acos() unclamped.
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
    float theta = acos(cosine); \
    float offsetFromIdeal = theta - (param.x * (3.14159265f / 180.0f)); \
    dEdR = param.y * offsetFromIdeal; \
}
// Stores in angle the acos() of the clamped, normalized dot product of v1
// and v2 (as computed by GETNORMEDDOTPRODUCT).
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
    float normedDot; \
    GETNORMEDDOTPRODUCT(v1, v2, normedDot); \
    angle = acos(normedDot); \
}
// Like GETANGLEBETWEENTWOVECTORS, but also hands back the cosine:
// cosine receives the clamped, normalized dot product of v1 and v2
// (via GETNORMEDDOTPRODUCT) and angle receives acos(cosine).
#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
{ \
    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
    angle = acos(cosine); \
}
// Signed dihedral angle from three vectors.
//   cp0 = vector1 x vector2, cp1 = vector2 x vector3 (both written as outputs),
//   angle = angle between cp0 and cp1, negated unless
//   DOT3(signVector, cp1) >= 0.
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
    float signDot = DOT3(signVector, cp1); \
    if (!(signDot >= 0)) \
    { \
        angle = -angle; \
    } \
}
// Signed dihedral angle plus its cosine, from three vectors.
//   cp0 = vector1 x vector2, cp1 = vector2 x vector3 (both written as outputs),
//   cosine = clamped normalized dot product of cp0 and cp1,
//   angle  = acos(cosine), negated unless DOT3(signVector, cp1) >= 0.
// The sign flip applies to angle only; cosine is left as computed.
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
    float signDot = DOT3(signVector, cp1); \
    if (!(signDot >= 0)) \
    { \
        angle = -angle; \
    } \
}
/**
 * Calculate local forces on the CPU (debugging/reference path).
 *
 * Downloads positions, forces and all local-interaction ID/parameter streams,
 * then accumulates harmonic bond forces into the host copy of psForce4.
 * Any bond whose length deviates from its reference length (bond.x) by more
 * than 1.0 is printed and counted as a violation; if at least one violation
 * was seen, coordinates and forces are dumped.
 *
 * Only the bond loop is compiled. The bond-angle, dihedral, RB-dihedral and
 * LJ 1-4 loops are kept under #if 0 as disabled reference code; the extra
 * downloads and the Vectors scratch object exist for that code.
 *
 * The modified host force buffer is not uploaded by this function.
 *
 * @param gpu  context whose streams are downloaded, read and (psForce4,
 *             host side only) updated
 */
extern "C"
void kCPUCalculateLocalForces(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psForce4->Download();
    gpu->psBondID->Download();
    gpu->psBondParameter->Download();
    gpu->psBondAngleID1->Download();
    gpu->psBondAngleID2->Download();
    gpu->psBondAngleParameter->Download();
    gpu->psDihedralID1->Download();
    gpu->psDihedralID2->Download();
    gpu->psDihedralParameter->Download();
    gpu->psRbDihedralID1->Download();
    gpu->psRbDihedralID2->Download();
    gpu->psRbDihedralParameter1->Download();
    gpu->psRbDihedralParameter2->Download();
    gpu->psLJ14ID->Download();
    gpu->psLJ14Parameter->Download();

    unsigned int pos = 0;
    Vectors V;              // scratch vectors, used only by the disabled code below
    Vectors* A = &V;
    int violations = 0;

    // pos runs up to bond_offset, but only the first sim.bonds slots hold bonds.
    while (pos < gpu->sim.bond_offset)
    {
        if (pos < gpu->sim.bonds)
        {
            // atom.x / atom.y index the two atoms; atom.z / atom.w select the
            // output-buffer slice (multiplied by sim.stride) for each atom.
            int4   atom  = gpu->psBondID->_pSysStream[0][pos];
            float4 atomA = gpu->psPosq4->_pSysStream[0][atom.x];
            float4 atomB = gpu->psPosq4->_pSysStream[0][atom.y];
            float2 bond  = gpu->psBondParameter->_pSysStream[0][pos];

            float dx = atomB.x - atomA.x;
            float dy = atomB.y - atomA.y;
            float dz = atomB.z - atomA.z;
            float r2 = dx * dx + dy * dy + dz * dz;
            float r  = sqrt(r2);
            float deltaIdeal = r - bond.x;
            float dEdR = bond.y * deltaIdeal;
            // Divide by r to turn the separation vector into a unit direction;
            // coincident atoms contribute no force.
            dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;

            if (fabs(deltaIdeal) > 1.0f)
            {
                // pos is unsigned, so it is printed with %u (was %d).
                printf("Bond %4u: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", pos, dx, dy, dz, r, deltaIdeal, dEdR);
                violations++;
            }

            dx *= dEdR;
            dy *= dEdR;
            dz *= dEdR;

            unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
            unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
            float4 forceA = gpu->psForce4->_pSysStream[0][offsetA];
            float4 forceB = gpu->psForce4->_pSysStream[0][offsetB];
            forceA.x += dx;
            forceA.y += dy;
            forceA.z += dz;
            forceB.x -= dx;
            forceB.y -= dy;
            forceB.z -= dz;
            gpu->psForce4->_pSysStream[0][offsetA] = forceA;
            gpu->psForce4->_pSysStream[0][offsetB] = forceB;
        }
        pos++;
    }

    // Disabled reference implementations of the remaining local interactions.
    // Kept as found; this code is not compiled and has not been reviewed.
#if 0
    while (pos < gpu->sim.bond_angle_offset)
    {
        unsigned int pos1 = pos - gpu->sim.bond_offset;
        if (pos1 < gpu->sim.bond_angles)
        {
            int4 atom1 = gpu->psBondAngleID1->_pSysStream[0][pos1];
            float2 bond_angle = gpu->psBondAngleParameter->_pSysStream[0][pos1];
            float4 a1 = gpu->psPosq4->_pSysStream[0][atom1.x];
            float4 a2 = gpu->psPosq4->_pSysStream[0][atom1.y];
            float4 a3 = gpu->psPosq4->_pSysStream[0][atom1.z];
            A->v0.x = a2.x - a1.x;
            A->v0.y = a2.y - a1.y;
            A->v0.z = a2.z - a1.z;
            A->v1.x = a2.x - a3.x;
            A->v1.y = a2.y - a3.y;
            A->v1.z = a2.z - a3.z;
            float3 cp;
            CROSS_PRODUCT(A->v0, A->v1, cp);
            float rp = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
            rp = max(sqrt(rp), 1.0e-06f);
            float r21 = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
            float r23 = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
            float dot = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
            float cosine = dot / sqrt(r21 * r23);
            float dEdR;
            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
            printf("Bond angle %4d %11.4f %11.4f\n", pos1, cosine, dEdR);
            float termA = dEdR / (r21 * rp);
            float termC = -dEdR / (r23 * rp);
            float3 c21;
            float3 c23;
            CROSS_PRODUCT(A->v0, cp, c21);
            CROSS_PRODUCT(A->v1, cp, c23);
            c21.x *= termA;
            c21.y *= termA;
            c21.z *= termA;
            c23.x *= termC;
            c23.y *= termC;
            c23.z *= termC;
            int2 atom2 = gpu->psBondAngleID2->_pSysStream[0][pos1];
            unsigned int offset = atom1.x + atom1.w * gpu->sim.stride;
            float4 force = gpu->psForce4->_pSysStream[0][offset];
            force.x += c21.x;
            force.y += c21.y;
            force.z += c21.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            offset = atom1.y + atom2.x * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x -= (c21.x + c23.x);
            force.y -= (c21.y + c23.y);
            force.z -= (c21.z + c23.z);
            gpu->psForce4->_pSysStream[0][offset] = force;
            offset = atom1.z + atom2.y * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += c23.x;
            force.y += c23.y;
            force.z += c23.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
        }
        pos++;
    }
    while (pos < gpu->sim.dihedral_offset)
    {
        unsigned int pos1 = pos - gpu->sim.bond_angle_offset;
        if (pos1 < gpu->sim.dihedrals)
        {
            int4 atom1 = gpu->psDihedralID1->_pSysStream[0][pos1];
            float4 atomA = gpu->psPosq4->_pSysStream[0][atom1.x];
            float4 atomB = gpu->psPosq4->_pSysStream[0][atom1.y];
            float4 atomC = gpu->psPosq4->_pSysStream[0][atom1.z];
            float4 atomD = gpu->psPosq4->_pSysStream[0][atom1.w];
            A->v0.x = atomA.x - atomB.x;
            A->v0.y = atomA.y - atomB.y;
            A->v0.z = atomA.z - atomB.z;
            A->v1.x = atomC.x - atomB.x;
            A->v1.y = atomC.y - atomB.y;
            A->v1.z = atomC.z - atomB.z;
            A->v2.x = atomC.x - atomD.x;
            A->v2.y = atomC.y - atomD.y;
            A->v2.z = atomC.z - atomD.z;
            float3 cp0, cp1;
            float dihedralAngle;
            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
            float4 dihedral = gpu->psDihedralParameter->_pSysStream[0][pos1];
            float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
            float sinDeltaAngle = sin(deltaAngle);
            float dEdAngle = -dihedral.x * dihedral.z * sinDeltaAngle;
            float normCross1 = DOT3(cp0, cp0);
            float normBC = sqrt(DOT3(A->v1, A->v1));
            float4 ff;
            ff.x = (-dEdAngle * normBC) / normCross1;
            float normCross2 = DOT3(cp1, cp1);
            ff.w = (dEdAngle * normBC) / normCross2;
            float dp = 1.0f / DOT3(A->v1, A->v1);
            ff.y = DOT3(A->v0, A->v1) * dp;
            ff.z = DOT3(A->v2, A->v1) * dp;
            int4 atom2 = gpu->psDihedralID2->_pSysStream[0][pos1];
            float3 internalF0;
            float3 internalF3;
            float3 s;
            // printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
            unsigned int offset = atom1.x + atom2.x * gpu->sim.stride;
            float4 force = gpu->psForce4->_pSysStream[0][offset];
            internalF0.x = ff.x * cp0.x;
            force.x += internalF0.x;
            internalF0.y = ff.x * cp0.y;
            force.y += internalF0.y;
            internalF0.z = ff.x * cp0.z;
            force.z += internalF0.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.w + atom2.w * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            internalF3.x = ff.w * cp1.x;
            force.x += internalF3.x;
            internalF3.y = ff.w * cp1.y;
            force.y += internalF3.y;
            internalF3.z = ff.w * cp1.z;
            force.z += internalF3.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            s.x = ff.y * internalF0.x - ff.z * internalF3.x;
            s.y = ff.y * internalF0.y - ff.z * internalF3.y;
            s.z = ff.y * internalF0.z - ff.z * internalF3.z;
            offset = atom1.y + atom2.y * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF0.x + s.x;
            force.y += -internalF0.y + s.y;
            force.z += -internalF0.z + s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.z + atom2.z * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF3.x - s.x;
            force.y += -internalF3.y - s.y;
            force.z += -internalF3.z - s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("Dihedral %4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
        }
        pos++;
    }
    while (pos < gpu->sim.rb_dihedral_offset)
    {
        unsigned int pos1 = pos - gpu->sim.dihedral_offset;
        if (pos1 < gpu->sim.rb_dihedrals)
        {
            int4 atom1 = gpu->psRbDihedralID1->_pSysStream[0][pos1];
            float4 atomA = gpu->psPosq4->_pSysStream[0][atom1.x];
            float4 atomB = gpu->psPosq4->_pSysStream[0][atom1.y];
            float4 atomC = gpu->psPosq4->_pSysStream[0][atom1.z];
            float4 atomD = gpu->psPosq4->_pSysStream[0][atom1.w];
            A->v0.x = atomA.x - atomB.x;
            A->v0.y = atomA.y - atomB.y;
            A->v0.z = atomA.z - atomB.z;
            A->v1.x = atomC.x - atomB.x;
            A->v1.y = atomC.y - atomB.y;
            A->v1.z = atomC.z - atomB.z;
            A->v2.x = atomC.x - atomD.x;
            A->v2.y = atomC.y - atomD.y;
            A->v2.z = atomC.z - atomD.z;
            float3 cp0, cp1;
            float dihedralAngle, cosPhi;
            // printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
            // printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
            // printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
            if (dihedralAngle < 0.0f )
            {
                dihedralAngle += 3.14159265f;
            }
            else
            {
                dihedralAngle -= 3.14159265f;
            }
            cosPhi = -cosPhi;
            // printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
            float4 dihedral1 = gpu->psRbDihedralParameter1->_pSysStream[0][pos1];
            float2 dihedral2 = gpu->psRbDihedralParameter2->_pSysStream[0][pos1];
            float cosFactor = cosPhi;
            float dEdAngle = -dihedral1.y;
            // printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
            dEdAngle -= 2.0f * dihedral1.z * cosFactor;
            // printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            cosFactor *= cosPhi;
            dEdAngle -= 3.0f * dihedral1.w * cosFactor;
            // printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            cosFactor *= cosPhi;
            dEdAngle -= 4.0f * dihedral2.x * cosFactor;
            // printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            cosFactor *= cosPhi;
            dEdAngle -= 5.0f * dihedral2.y * cosFactor;
            // printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
            dEdAngle *= sin(dihedralAngle);
            // printf("%4d - f: %9.4f\n", pos1, dEdAngle);
            float normCross1 = DOT3(cp0, cp0);
            float normBC = sqrt(DOT3(A->v1, A->v1));
            float4 ff;
            ff.x = (-dEdAngle * normBC) / normCross1;
            float normCross2 = DOT3(cp1, cp1);
            ff.w = (dEdAngle * normBC) / normCross2;
            float dp = 1.0f / DOT3(A->v1, A->v1);
            ff.y = DOT3(A->v0, A->v1) * dp;
            ff.z = DOT3(A->v2, A->v1) * dp;
            int4 atom2 = gpu->psRbDihedralID2->_pSysStream[0][pos1];
            float3 internalF0;
            float3 internalF3;
            float3 s;
            printf("RB Dihedral %4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
            unsigned int offset = atom1.x + atom2.x * gpu->sim.stride;
            float4 force = gpu->psForce4->_pSysStream[0][offset];
            internalF0.x = ff.x * cp0.x;
            force.x += internalF0.x;
            internalF0.y = ff.x * cp0.y;
            force.y += internalF0.y;
            internalF0.z = ff.x * cp0.z;
            force.z += internalF0.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("RB Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.w + atom2.w * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            internalF3.x = ff.w * cp1.x;
            force.x += internalF3.x;
            internalF3.y = ff.w * cp1.y;
            force.y += internalF3.y;
            internalF3.z = ff.w * cp1.z;
            force.z += internalF3.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("RB Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            s.x = ff.y * internalF0.x - ff.z * internalF3.x;
            s.y = ff.y * internalF0.y - ff.z * internalF3.y;
            s.z = ff.y * internalF0.z - ff.z * internalF3.z;
            offset = atom1.y + atom2.y * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF0.x + s.x;
            force.y += -internalF0.y + s.y;
            force.z += -internalF0.z + s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            printf("RB Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
            offset = atom1.z + atom2.z * gpu->sim.stride;
            force = gpu->psForce4->_pSysStream[0][offset];
            force.x += -internalF3.x - s.x;
            force.y += -internalF3.y - s.y;
            force.z += -internalF3.z - s.z;
            gpu->psForce4->_pSysStream[0][offset] = force;
            // printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
        }
        pos++;
    }
    while (pos < gpu->sim.LJ14_offset)
    {
        unsigned int pos1 = pos - gpu->sim.rb_dihedral_offset;
        if (pos1 < gpu->sim.LJ14s)
        {
            int4 atom = gpu->psLJ14ID->_pSysStream[0][pos1];
            float4 LJ14 = gpu->psLJ14Parameter->_pSysStream[0][pos1];
            float4 a1 = gpu->psPosq4->_pSysStream[0][atom.x];
            float4 a2 = gpu->psPosq4->_pSysStream[0][atom.y];
            float3 d;
            d.x = a1.x - a2.x;
            d.y = a1.y - a2.y;
            d.z = a1.z - a2.z;
            float r2 = DOT3(d, d);
            float inverseR = 1.0f / sqrt(r2);
            float sig2 = inverseR * LJ14.y;
            sig2 *= sig2;
            float sig6 = sig2 * sig2 * sig2;
            float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
            dEdR += LJ14.z * inverseR;
            dEdR *= inverseR * inverseR;
            unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
            unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
            float4 forceA = gpu->psForce4->_pSysStream[0][offsetA];
            float4 forceB = gpu->psForce4->_pSysStream[0][offsetB];
            d.x *= dEdR;
            d.y *= dEdR;
            d.z *= dEdR;
            forceA.x += d.x;
            forceA.y += d.y;
            forceA.z += d.z;
            forceB.x -= d.x;
            forceB.y -= d.y;
            forceB.z -= d.z;
            printf("LJ14 %d: %11.4f %11.4f %11.4f\n", pos1, d.x, d.y, d.z);
            gpu->psForce4->_pSysStream[0][offsetA] = forceA;
            gpu->psForce4->_pSysStream[0][offsetB] = forceB;
        }
        pos++;
    }
#endif

    if (violations > 0)
    {
        gpuDumpCoordinates(gpu);
        gpuDumpForces(gpu);
    }
}
/**
 * Opens "<fname>_<step>.txt" for writing and returns the stream.
 *
 * On failure a diagnostic is written to stderr and the process exits with
 * status -1, so the returned pointer is never NULL.
 *
 * @param fname  file name prefix
 * @param step   number appended to the prefix after an underscore
 * @return       open FILE* in "w" mode; the caller is responsible for fclose()
 */
static FILE* getWriteToFilePtr(char* fname, int step)
{
    std::stringstream fileName;
    fileName << fname << "_" << step << ".txt";

    FILE* filePtr = fopen(fileName.str().c_str(), "w");
    if (filePtr == NULL)
    {
        // Message typo fixed ("writitng") and newline added so the diagnostic
        // ends its own line before the process exits.
        (void) fprintf(stderr, "Could not open file=<%s> for writing.\n", fileName.str().c_str());
        exit(-1);
    }
    return filePtr;
}
extern "C" {
/**
 * Writes one row to filePtr: the index as "%5d " followed by each value as
 * " %18.10e", then a newline. The stream is flushed after every row.
 *
 * @param filePtr         open output stream
 * @param index           row label printed first
 * @param numberOfValues  how many entries of values to print
 * @param values          array holding at least numberOfValues floats
 */
static void printValues(FILE* filePtr, int index, int numberOfValues, float* values)
{
    (void) fprintf(filePtr, "%5d ", index);

    int column = 0;
    while (column < numberOfValues)
    {
        (void) fprintf(filePtr, " %18.10e", values[column]);
        ++column;
    }

    (void) fprintf(filePtr, "\n");
    (void) fflush(filePtr);
}
}
/**
 * Downloads a float stream and writes its first and last numPrint entries
 * (one row per entry) to "<fname>_<step>.txt".
 *
 * If numPrint is not in (0, natoms) every entry is written. Entries that fall
 * in both the leading and the trailing window are written only once
 * (previously they were duplicated, so "print all" emitted every row twice).
 *
 * @param gpu       context supplying natoms
 * @param fname     output file name prefix
 * @param step      number appended to the file name
 * @param psPos     stream to download and print
 * @param numPrint  number of leading and trailing entries to print
 */
extern "C"
void WriteArrayToFile1(gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint)
{
    static const int numberOfValues = 1;
    FILE* filePtr = getWriteToFilePtr(fname, step);
    float values[numberOfValues];

    psPos->Download();
    numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;

    // Leading window: [0, numPrint)
    for (int i = 0; i < numPrint; i++)
    {
        values[0] = psPos->_pSysStream[0][i];
        printValues(filePtr, i, numberOfValues, values);
    }

    // Trailing window: [natoms - numPrint, natoms), minus what was already printed.
    int tailStart = gpu->natoms - numPrint;
    if (tailStart < numPrint)
    {
        tailStart = numPrint;
    }
    for (int i = tailStart; i < gpu->natoms; i++)
    {
        values[0] = psPos->_pSysStream[0][i];
        printValues(filePtr, i, numberOfValues, values);
    }

    (void) fclose(filePtr);
}
/**
 * Downloads a float2 stream and writes the x/y components of its first and
 * last numPrint entries (one row per entry) to "<fname>_<step>.txt".
 *
 * If numPrint is not in (0, natoms) every entry is written. Entries that fall
 * in both the leading and the trailing window are written only once
 * (previously they were duplicated, so "print all" emitted every row twice).
 *
 * @param gpu       context supplying natoms
 * @param fname     output file name prefix
 * @param step      number appended to the file name
 * @param psPos     stream to download and print
 * @param numPrint  number of leading and trailing entries to print
 */
extern "C"
void WriteArrayToFile2(gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint)
{
    static const int numberOfValues = 2;
    FILE* filePtr = getWriteToFilePtr(fname, step);
    float values[numberOfValues];

    psPos->Download();
    numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;

    // Leading window: [0, numPrint)
    for (int i = 0; i < numPrint; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        printValues(filePtr, i, numberOfValues, values);
    }

    // Trailing window: [natoms - numPrint, natoms), minus what was already printed.
    int tailStart = gpu->natoms - numPrint;
    if (tailStart < numPrint)
    {
        tailStart = numPrint;
    }
    for (int i = tailStart; i < gpu->natoms; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        printValues(filePtr, i, numberOfValues, values);
    }

    (void) fclose(filePtr);
}
/**
 * Downloads a float4 stream and writes the x/y/z/w components of its first
 * and last numPrint entries (one row per entry) to "<fname>_<step>.txt".
 *
 * If numPrint is not in (0, natoms) every entry is written. Entries that fall
 * in both the leading and the trailing window are written only once
 * (previously they were duplicated, so "print all" emitted every row twice).
 *
 * @param gpu       context supplying natoms
 * @param fname     output file name prefix
 * @param step      number appended to the file name
 * @param psPos     stream to download and print
 * @param numPrint  number of leading and trailing entries to print
 */
extern "C"
void WriteArrayToFile4(gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint)
{
    static const int numberOfValues = 4;
    FILE* filePtr = getWriteToFilePtr(fname, step);
    float values[numberOfValues];

    psPos->Download();
    numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;

    // Leading window: [0, numPrint)
    for (int i = 0; i < numPrint; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        values[2] = psPos->_pSysStream[0][i].z;
        values[3] = psPos->_pSysStream[0][i].w;
        printValues(filePtr, i, numberOfValues, values);
    }

    // Trailing window: [natoms - numPrint, natoms), minus what was already printed.
    int tailStart = gpu->natoms - numPrint;
    if (tailStart < numPrint)
    {
        tailStart = numPrint;
    }
    for (int i = tailStart; i < gpu->natoms; i++)
    {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        values[2] = psPos->_pSysStream[0][i].z;
        values[3] = psPos->_pSysStream[0][i].w;
        printValues(filePtr, i, numberOfValues, values);
    }

    (void) fclose(filePtr);
}
/**
 * Debug helper: downloads psPosq4, psBornRadii, psObcData and psBornSum and
 * prints one line per atom to stdout with, in order, psPosq4 x/y/z/w, the
 * psBornRadii entry, the psBornSum entry and psObcData x/y.
 *
 * @param gpu  context whose streams are downloaded and printed
 */
extern "C"
void gpuDumpObcInfo(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psBornRadii->Download();
    gpu->psObcData->Download();
    gpu->psBornSum->Download();

    printf("\n\nObc Info xyzw Brad atomR scaledAtomR\n");

    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        const float4& posq       = gpu->psPosq4->_pSysStream[0][atom];
        const float2& obcData    = gpu->psObcData->_pSysStream[0][atom];
        const float   bornRadius = gpu->psBornRadii->_pSysStream[0][atom];
        const float   bornSum    = gpu->psBornSum->_pSysStream[0][atom];

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               atom,
               posq.x, posq.y, posq.z, posq.w,
               bornRadius,
               bornSum,
               obcData.x, obcData.y);
    }
}
/**
 * Debug helper: downloads psForce4, psBornRadii, psBornForce, psObcChain and
 * psBornSum and prints one line per atom to stdout with, in order,
 * psForce4 x/y/z, the Born radius, a derived value
 *     bornForce / (bornRadius * bornRadius * obcChain),
 * the raw Born force and the OBC chain entry.
 *
 * psBornSum is downloaded but not printed (its column is commented out).
 *
 * @param gpu  context whose streams are downloaded and printed
 */
extern "C"
void gpuDumpObcLoop1(gpuContext gpu)
{
    gpu->psForce4->Download();
    gpu->psBornRadii->Download();
    gpu->psBornForce->Download();
    gpu->psObcChain->Download();
    gpu->psBornSum->Download();

    printf("\n\nObc F3 BrnR BrnF Chn\n");

    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        const float4& force      = gpu->psForce4->_pSysStream[0][atom];
        const float   bornRadius = gpu->psBornRadii->_pSysStream[0][atom];
        const float   bornForce  = gpu->psBornForce->_pSysStream[0][atom];
        const float   obcChain   = gpu->psObcChain->_pSysStream[0][atom];

        const float compF = bornForce / (bornRadius * bornRadius * obcChain);

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n",
               atom,
               force.x, force.y, force.z,
               //force.w,
               bornRadius,
               compF,
               bornForce,
               //gpu->psBornSum->_pSysStream[0][atom],
               obcChain);
    }
}
platforms/cuda/src/kernels/gputypes.h
0 → 100755
View file @
38f6c8f8
#ifndef __GPUTYPES_H__
#define __GPUTYPES_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include <string>
#include <vector>
// One entry of the atom-type table (see _gpuContext::gpAtomTable).
// `name` is qualified as std::string so this header no longer depends on the
// including file having a `using namespace std;` in effect.
struct gpuAtomType
{
    std::string name;   // atom type name
    char symbol;        // single-character symbol for the type
    float r;            // presumably the atomic radius (cf. gpuGetAtomicRadius) -- confirm
};
enum
SM_VERSION
{
SM_10
,
SM_11
,
SM_12
};
/* Pointer to this structure will be given
 * to gromacs functions*/
// Aggregates everything the GPU code needs for one simulation: host-side
// scalars and tables, the cudaGmxSimulation block (sim), and one CUDAStream
// per data array. Streams are accessed elsewhere through Download() and
// their host copy _pSysStream[0].
struct _gpuContext {

    //Cache this here so that it doesn't
    //have to be repeatedly passed around
    int natoms;
    gpuAtomType* gpAtomTable;           // atom-type table
    int gAtomTypes;                     // presumably the number of entries in gpAtomTable -- confirm
    cudaGmxSimulation sim;              // simulation parameter block (copied to device constants by the kernels' Set*Sim functions)
    unsigned int* pOutputBufferCounter;
    unsigned int* pExclusion;
    unsigned char* pAtomSymbol;
    float iterations;
    float epsfac;
    float solventDielectric;
    float soluteDielectric;
    int grid;
    bool bCalculateCM;
    bool bRemoveCM;
    bool bRecalculateBornRadii;
    unsigned long seed;
    SM_VERSION sm_version;

    // Per-atom float4 streams
    CUDAStream<float4>* psPosq4;
    CUDAStream<float4>* psPosqP4;
    CUDAStream<float4>* psOldPosq4;
    CUDAStream<float4>* psVelm4;
    CUDAStream<float4>* psForce4;
    CUDAStream<float4>* psxVector4;
    CUDAStream<float4>* psvVector4;

    // Nonbonded / OBC per-atom data
    CUDAStream<float2>* psSigEps2;
    CUDAStream<float2>* psObcData;
    CUDAStream<float>* psObcChain;
    CUDAStream<float>* psBornForce;
    CUDAStream<float>* psBornRadii;
    CUDAStream<float>* psBornSum;

    // Local (bonded) interactions: atom/output-slot IDs and parameters
    CUDAStream<int4>* psBondID;
    CUDAStream<float2>* psBondParameter;
    CUDAStream<int4>* psBondAngleID1;
    CUDAStream<int2>* psBondAngleID2;
    CUDAStream<float2>* psBondAngleParameter;
    CUDAStream<int4>* psDihedralID1;
    CUDAStream<int4>* psDihedralID2;
    CUDAStream<float4>* psDihedralParameter;
    CUDAStream<int4>* psRbDihedralID1;
    CUDAStream<int4>* psRbDihedralID2;
    CUDAStream<float4>* psRbDihedralParameter1;
    CUDAStream<float2>* psRbDihedralParameter2;
    CUDAStream<int4>* psLJ14ID;
    CUDAStream<float4>* psLJ14Parameter;

    // Constraints, exclusions and work units
    CUDAStream<int>* psNonShakeID;
    CUDAStream<int4>* psShakeID;
    CUDAStream<float4>* psShakeParameter;
    CUDAStream<unsigned int>* psExclusion;
    CUDAStream<unsigned int>* psWorkUnit;

    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
};
typedef
struct
_gpuContext
*
gpuContext
;
// Function prototypes
//
// Standard-library types are fully qualified (std::string, std::vector) so
// the header does not rely on the including file's `using namespace std;`.

extern "C"
bool gpuIsAvailable();

// ---- Force-field parameters: gpuRead* load from a file and return an int
// ---- (gpuSetup prints it as a count); gpuSet* take the values directly.

extern "C"
int gpuReadBondParameters(gpuContext gpu, char* fname);

extern "C"
void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);

extern "C"
int gpuReadBondAngleParameters(gpuContext gpu, char* fname);

extern "C"
void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
        const std::vector<float>& angle, const std::vector<float>& k);

extern "C"
int gpuReadDihedralParameters(gpuContext gpu, char* fname);

extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
        const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);

extern "C"
int gpuReadRbDihedralParameters(gpuContext gpu, char* fname);

extern "C"
void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
        const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);

extern "C"
int gpuReadLJ14Parameters(gpuContext gpu, char* fname);

extern "C"
void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);

extern "C"
float gpuGetAtomicRadius(gpuContext gpu, std::string s);

extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);

extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname);

extern "C"
int gpuReadCoulombParameters(gpuContext gpu, char* fname);

extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions);

extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);

extern "C"
int gpuReadShakeParameters(gpuContext gpu, char* fname);

extern "C"
void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float tolerance);

// ---- Context creation, state upload and integrator settings

extern "C"
int gpuAllocateInitialBuffers(gpuContext gpu);

extern "C"
void gpuReadCoordinates(gpuContext gpu, char* fname);

extern "C"
void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);

extern "C"
void gpuSetVelocities(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);

extern "C"
void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);

extern "C"
void gpuInitializeRandoms(gpuContext gpu);

extern "C"
void* gpuInitFromFile(char* fname);

extern "C"
void* gpuInit(int numAtoms);

extern "C"
void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);

extern "C"
void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT);

extern "C"
void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);

extern "C"
void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability);

extern "C"
void gpuShutDown(gpuContext gpu);

// ---- Derived data built from the parameters above

extern "C"
int gpuBuildOutputBuffers(gpuContext gpu);

extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu);

extern "C"
int gpuBuildExclusionList(gpuContext gpu);

extern "C"
int gpuSetConstants(gpuContext gpu);

// ---- Debugging and CPU reference helpers

extern "C"
void gpuDumpCoordinates(gpuContext gpu);

extern "C"
void gpuDumpPrimeCoordinates(gpuContext gpu);

extern "C"
void gpuDumpForces(gpuContext gpu);

extern "C"
void gpuDumpAtomData(gpuContext gpu);

extern "C"
bool gpuCheckData(gpuContext gpu);

// Takes the context as void* and casts it to gpuContext internally.
extern "C"
void gpuSetup(void* pVoid);

extern "C"
void kCPUCalculate14(gpuContext gpu);

extern "C"
void kCPUCalculateLocalForces(gpuContext gpu);

// WriteArrayToFileN dumps a stream of N-component floats to "<fname>_<step>.txt".
// NOTE(review): confirm that WriteArrayToFile3 has a definition; only the
// 1-, 2- and 4-component variants were seen next to each other in gpu.cpp.
extern "C"
void WriteArrayToFile1(gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint);

extern "C"
void WriteArrayToFile2(gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint);

extern "C"
void WriteArrayToFile3(gpuContext gpu, char* fname, int step, CUDAStream<float3>* psPos, int numPrint);

extern "C"
void WriteArrayToFile4(gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint);

extern "C"
void gpuDumpObcInfo(gpuContext gpu);

extern "C"
void gpuDumpObcLoop1(gpuContext gpu);
#endif //__GPUTYPES_H__
platforms/cuda/src/kernels/kBrownianUpdate.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Compile-time switch for the two Brownian update kernels in this file.
// With DeltaShake defined, their "#else" branches are compiled:
//   - kBrownianUpdatePart1_kernel stores only the displacement
//     (force * GDT + random) in pPosqP instead of the new absolute position;
//   - kBrownianUpdatePart2_kernel derives the velocity from that displacement
//     and then adds the old position back before writing pPosq.
#define DeltaShake
// File-local copy of the simulation parameters in constant memory.
// Written by SetBrownianUpdateSim and read back by GetBrownianUpdateSim;
// every kernel in this file reads its inputs through it.
static
__constant__
cudaGmxSimulation
cSim
;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim, so the Brownian update kernels can read them.
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).
void SetBrownianUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory copy, cSim, back into the host-side
// simulation parameters (gpu->sim).
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).  The message names the Get path; it previously
// said "SetSim", which pointed at the wrong function when this copy failed.
void GetBrownianUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// First half of the Brownian update.
// For every atom: saves the current position in pOldPosq, then writes the
// proposed move to pPosqP.  The move is force * GDT plus a per-atom random
// vector taken from pRandom4a at this block's current offset.
//   - without DeltaShake: pPosqP receives the new absolute position;
//   - with DeltaShake:    pPosqP receives only the displacement.
// The .w component of the position is carried through unchanged.
__global__ void kBrownianUpdatePart1_kernel()
{
    // Offset of this block's window into the random-number pool.
    const unsigned int randomBase = cSim.pRandomPosition[blockIdx.x];
    __syncthreads();

    // Grid-stride loop over all atoms.
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
         atom < cSim.atoms;
         atom += blockDim.x * gridDim.x)
    {
        float4 noise    = cSim.pRandom4a[randomBase + atom];
        float4 position = cSim.pPosq[atom];
        float4 force    = cSim.pForce4[atom];

        cSim.pOldPosq[atom] = position;
#ifndef DeltaShake
        position.x += force.x * cSim.GDT + noise.x;
        position.y += force.y * cSim.GDT + noise.y;
        position.z += force.z * cSim.GDT + noise.z;
#else
        position.x = force.x * cSim.GDT + noise.x;
        position.y = force.y * cSim.GDT + noise.y;
        position.z = force.z * cSim.GDT + noise.z;
#endif
        cSim.pPosqP[atom] = position;
    }
}
// Host-side launcher for kBrownianUpdatePart1_kernel.
// Grid and block sizes come from gpu->sim (blocks, update_threads_per_block);
// LAUNCHERROR (a macro defined outside this file) is invoked with the kernel
// label right after the launch.
void kBrownianUpdatePart1(gpuContext gpu)
{
    kBrownianUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kBrownianUpdatePart1");
}
// Second half of the Brownian update.
// For every atom: turns the move stored in pPosqP into a velocity
// (displacement * oneOverDeltaT) and commits the new position to pPosq.
//   - without DeltaShake: pPosqP holds the absolute position, so the
//     displacement is pPosqP - pPosq;
//   - with DeltaShake:    pPosqP holds the displacement itself, and the old
//     position is added back before the store.
// Finally thread 0 of each block advances the block's random-pool offset by
// paddedNumberOfAtoms, wrapping by cSim.randoms once it exceeds that value.
__global__ void kBrownianUpdatePart2_kernel()
{
    unsigned int randomOffset = cSim.pRandomPosition[blockIdx.x];
    // All threads of the block have loaded the offset before thread 0
    // overwrites it at the bottom of the kernel.
    __syncthreads();

    // Grid-stride loop over all atoms.
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
         atom < cSim.atoms;
         atom += blockDim.x * gridDim.x)
    {
        float4 vel      = cSim.pVelm4[atom];
        float4 oldPos   = cSim.pPosq[atom];
        float4 proposed = cSim.pPosqP[atom];
#ifndef DeltaShake
        vel.x = cSim.oneOverDeltaT * (proposed.x - oldPos.x);
        vel.y = cSim.oneOverDeltaT * (proposed.y - oldPos.y);
        vel.z = cSim.oneOverDeltaT * (proposed.z - oldPos.z);
#else
        vel.x = cSim.oneOverDeltaT * (proposed.x);
        vel.y = cSim.oneOverDeltaT * (proposed.y);
        vel.z = cSim.oneOverDeltaT * (proposed.z);
        proposed.x += oldPos.x;
        proposed.y += oldPos.y;
        proposed.z += oldPos.z;
#endif
        cSim.pPosq[atom]  = proposed;
        cSim.pVelm4[atom] = vel;
    }

    // Advance this block's position in the random-number pool.
    if (threadIdx.x == 0)
    {
        randomOffset += cSim.paddedNumberOfAtoms;
        if (randomOffset > cSim.randoms)
        {
            randomOffset -= cSim.randoms;
        }
        cSim.pRandomPosition[blockIdx.x] = randomOffset;
    }
}
// Defined in another source file; called below once the random pool has been
// used for gpu->sim.randomIterations launches.
extern void kGenerateRandoms(gpuContext gpu);

// Host-side launcher for kBrownianUpdatePart2_kernel.
// After the launch it counts calls in a function-local static and, on every
// gpu->sim.randomIterations-th call, invokes kGenerateRandoms and resets the
// counter.  The counter is one per process, not one per gpuContext.
void kBrownianUpdatePart2(gpuContext gpu)
{
    kBrownianUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kBrownianUpdatePart2");

    // Number of launches since the random pool was last regenerated.
    static int iteration = 0;
    if (++iteration != gpu->sim.randomIterations)
    {
        return;
    }
    kGenerateRandoms(gpu);
    iteration = 0;
}
platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// File-local copy of the simulation parameters in constant memory.
// Written by SetCalculateAndersenThermostatSim and read back by
// GetCalculateAndersenThermostatSim; the thermostat kernel reads its inputs
// through it.
static
__constant__
cudaGmxSimulation
cSim
;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim, so the Andersen thermostat kernel can read them.
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).
void SetCalculateAndersenThermostatSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory copy, cSim, back into the host-side
// simulation parameters (gpu->sim).
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).  The message names the Get path; it previously
// said "SetSim", which pointed at the wrong function when this copy failed.
void GetCalculateAndersenThermostatSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Andersen thermostat step.
// For every atom one random float4 is read from pRandom4a at this block's
// current offset.  If its .w component is below cSim.collisionProbability the
// atom's velocity is replaced by sqrt(kT * velocity.w) * random.xyz; otherwise
// the velocity is left unchanged.  velocity.w itself is never modified.
// NOTE(review): presumably velocity.w holds the inverse mass and random.w is
// uniform on [0,1) -- confirm against the code that fills those buffers.
// Finally thread 0 of each block advances the block's random-pool offset by
// paddedNumberOfAtoms, wrapping by cSim.randoms once it exceeds that value.
__global__ void kCalculateAndersenThermostat_kernel()
{
    unsigned int pos  = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    // All threads of the block have loaded the offset before thread 0
    // overwrites it at the bottom of the kernel.
    __syncthreads();

    // Grid-stride loop over all atoms.
    while (pos < cSim.atoms)
    {
        float4 velocity = cSim.pVelm4[pos];
        float4 random4a = cSim.pRandom4a[rpos + pos];

        // scale is exactly 0 (collision) or 1 (no collision).  Float literals
        // keep the arithmetic in single precision; the former double literals
        // promoted it to double without changing any result.
        float scale = (random4a.w < cSim.collisionProbability ? 0.0f : 1.0f);
        float add   = (1.0f - scale) * sqrt(cSim.kT * velocity.w);

        velocity.x = scale * velocity.x + add * random4a.x;
        velocity.y = scale * velocity.y + add * random4a.y;
        velocity.z = scale * velocity.z + add * random4a.z;
        cSim.pVelm4[pos] = velocity;
        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }
}
// Defined in another source file; called below once the random pool has been
// used for gpu->sim.randomIterations launches.
extern void kGenerateRandoms(gpuContext gpu);

// Host-side launcher for kCalculateAndersenThermostat_kernel.
// After the launch it counts calls in a function-local static and, on every
// gpu->sim.randomIterations-th call, invokes kGenerateRandoms and resets the
// counter.  The counter is one per process, not one per gpuContext.
void kCalculateAndersenThermostat(gpuContext gpu)
{
    kCalculateAndersenThermostat_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kCalculateAndersenThermostat");

    // Number of launches since the random pool was last regenerated.
    static int iteration = 0;
    if (++iteration != gpu->sim.randomIterations)
    {
        return;
    }
    kGenerateRandoms(gpu);
    iteration = 0;
}
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread shared-memory record used by kCalculateCDLJForces_kernel.
// Field order is part of the layout and must not change.
struct Atom
{
    float x, y, z;      // position, copied from pPosq.xyz
    float q;            // charge, copied from pPosq.w
    float sig, eps;     // Lennard-Jones attributes, copied from pAttr.x / pAttr.y
    float fx, fy, fz;   // force accumulated on this atom by the other tile
    float eps2, sig2;   // the owning thread's own pAttr.y / pAttr.x
};
// Shared-memory atom records; the kernel writes entry [threadIdx.x] and reads
// the GRID entries belonging to its own thread group.
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
// This block's slice of cSim.pWorkUnit, copied once at kernel start.
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
// Lookup table filled at kernel start: sNext[k] = (k + 1) & (GRID - 1).
__shared__
unsigned
int
sNext
[
GRID
];
// File-local copy of the simulation parameters in constant memory.
// Written by SetCalculateCDLJForcesSim and read back by
// GetCalculateCDLJForcesSim.
static
__constant__
cudaGmxSimulation
cSim
;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim, so the CDLJ force kernel can read them.
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).
void SetCalculateCDLJForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory copy, cSim, back into the host-side
// simulation parameters (gpu->sim).
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).  The message names the Get path; it previously
// said "SetSim", which pointed at the wrong function when this copy failed.
void GetCalculateCDLJForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Coulomb + Lennard-Jones force kernel.
//
// Work distribution
//   cSim.pWorkUnit is split across blocks: every block gets
//   nbWorkUnitsPerBlock entries and the first nbWorkUnitsPerBlockRemainder
//   blocks get one more.  The block's slice is copied to sWorkUnit once.
//   Each group of GRID consecutive threads then walks that slice backwards,
//   starting at a group-specific entry and stepping by cSim.nonbond_workBlock.
//
// Work-unit encoding (as decoded below)
//   bit 0       exclusion flag
//   bits 2..16  y tile index
//   bits 17..   x tile index
//   Tile indices are shifted left by GRIDBITS to get the first atom index.
//
// Per tile
//   Thread tgx owns atom x + tgx and accumulates its force in registers (af).
//   Diagonal tiles (x == y) loop over all GRID partners in order.
//   Off-diagonal tiles start at partner tgx and rotate through sNext, so at
//   every step each thread of the group updates a different sA[].f{x,y,z}.
//   When the exclusion flag is set, one mask word per atom is read from
//   pExclusion; a clear bit zeroes dEdR for that pair.
//   Results go to pForce4a at atomIndex + partnerTile * cSim.stride.
//
// NOTE(review): there is no barrier between writing sA and reading psA; this
// presumably relies on the GRID threads of a group running in lock-step
// (GRID equal to the warp size) -- confirm before porting.
__global__ void kCalculateCDLJForces_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        bool bExclusionFlag = (x & 0x1);
        x = (x >> 17) << GRIDBITS;
        float4 apos;    // Local atom x, y, z, q
        float3 af;      // Local atom fx, fy, fz
        float dx;
        float dy;
        float dz;
        float r2;
        float invR;
        float sig;
        float sig2;
        float sig6;
        float eps;
        float dEdR;
        unsigned int tgx = threadIdx.x & (GRID - 1);    // index within the thread group
        unsigned int tbx = threadIdx.x - tgx;           // first thread of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // this group's GRID records

        if (!bExclusionFlag)
        {
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // NOTE(review): this loop includes the self pair (j == tgx, r2 == 0)
                // with no masking; presumably diagonal tiles always carry the
                // exclusion flag so this path is not taken -- confirm.

                // Read fixed atom data into registers and GRF
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                apos.w *= cSim.epsfac;
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[j].q * invR;
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else        // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                sA[threadIdx.x].sig2 = a.x;
                sA[threadIdx.x].eps2 = a.y;
                apos.w *= cSim.epsfac;
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }
        else  // bExclusion
        {
            // Read exclusion data
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // Read fixed atom data into registers and GRF
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                sA[threadIdx.x].sig2 = a.x;
                sA[threadIdx.x].eps2 = a.y;
                apos.w *= cSim.epsfac;
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = psA[tgx].sig2 + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = psA[tgx].eps2 * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[j].q * invR;
                    dEdR *= invR * invR;
                    // A clear mask bit drops the pair.  Overwriting dEdR (rather
                    // than scaling it) also discards any inf/NaN produced above,
                    // e.g. for the self pair where r2 == 0.
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    excl >>= 1;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else        // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                // Rotate the mask right by tgx so that bit 0 matches the first
                // partner this thread visits (tj starts at tgx).  The left-shift
                // count is masked: for tgx == 0 the unmasked count is GRID, a
                // full-width shift, which is undefined behaviour.
                excl = (excl >> tgx) | (excl << ((GRID - tgx) & (GRID - 1)));
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                sA[threadIdx.x].sig2 = a.x;
                sA[threadIdx.x].eps2 = a.y;
                apos.w *= cSim.epsfac;
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = psA[tgx].sig2 + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = psA[tgx].eps2 * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    excl >>= 1;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }

        pos -= cSim.nonbond_workBlock;
    }
}
// Kernel variant selected below for sm_version >= SM_12; declared here only.
__global__ extern void kCalculateCDLJForces_12_kernel();

// Host-side launcher for the CDLJ force kernels.
// Picks the kernel by gpu->sm_version: kCalculateCDLJForces_kernel below
// SM_12, kCalculateCDLJForces_12_kernel otherwise.  Both use the same launch
// configuration (gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block).
// LAUNCHERROR (a macro defined outside this file) is invoked after the launch.
void kCalculateCDLJForces(gpuContext gpu)
{
    if (gpu->sm_version < SM_12)
    {
        kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    LAUNCHERROR("kCalculateCDLJForces");
}
\ No newline at end of file
platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread shared-memory record used by kCalculateCDLJForces_12_kernel.
// Field order is part of the layout and must not change.
struct Atom
{
    float x, y, z;      // position, copied from pPosq.xyz
    float q;            // charge, copied from pPosq.w
    float sig, eps;     // Lennard-Jones attributes, copied from pAttr.x / pAttr.y
    float fx, fy, fz;   // force accumulated on this atom by the other tile
};
// Shared-memory atom records; the kernel writes entry [threadIdx.x] and reads
// the GRID entries belonging to its own thread group.
__shared__
Atom
sA
[
GT2XX_NONBOND_THREADS_PER_BLOCK
];
// This block's slice of cSim.pWorkUnit, copied once at kernel start.
__shared__
unsigned
int
sWorkUnit
[
GT2XX_NONBOND_WORKUNITS_PER_SM
];
// Lookup table filled at kernel start: sNext[k] = (k + 1) & (GRID - 1).
__shared__
unsigned
int
sNext
[
GRID
];
// File-local copy of the simulation parameters in constant memory.
// Written by SetCalculateCDLJForces_12Sim and read back by
// GetCalculateCDLJForces_12Sim.
static
__constant__
cudaGmxSimulation
cSim
;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim, so the SM 1.2 CDLJ force kernel can read them.
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).
void SetCalculateCDLJForces_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory copy, cSim, back into the host-side
// simulation parameters (gpu->sim).
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).  The message names the Get path; it previously
// said "SetSim", which pointed at the wrong function when this copy failed.
void GetCalculateCDLJForces_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Coulomb + Lennard-Jones force kernel (variant launched for SM 1.2 and up).
//
// Work distribution
//   cSim.pWorkUnit is split across blocks: every block gets
//   nbWorkUnitsPerBlock entries and the first nbWorkUnitsPerBlockRemainder
//   blocks get one more.  The block's slice is copied to sWorkUnit once.
//   Each group of GRID consecutive threads then walks that slice backwards,
//   starting at a group-specific entry and stepping by cSim.nonbond_workBlock.
//
// Work-unit encoding (as decoded below)
//   bit 0       exclusion flag
//   bits 2..16  y tile index
//   bits 17..   x tile index
//   Tile indices are shifted left by GRIDBITS to get the first atom index.
//
// Per tile
//   Thread tgx owns atom x + tgx and accumulates its force in registers (af).
//   Diagonal tiles (x == y) loop over all GRID partners in order.
//   Off-diagonal tiles start at partner tgx and rotate through sNext, so at
//   every step each thread of the group updates a different sA[].f{x,y,z}.
//   When the exclusion flag is set, one mask word per atom is read from
//   pExclusion; a clear bit zeroes dEdR for that pair.
//   Results go to pForce4a at atomIndex + partnerTile * cSim.stride.
//
// NOTE(review): there is no barrier between writing sA and reading psA; this
// presumably relies on the GRID threads of a group running in lock-step
// (GRID equal to the warp size) -- confirm before porting.
__global__ void kCalculateCDLJForces_12_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        bool bExclusionFlag = (x & 0x1);
        x = (x >> 17) << GRIDBITS;
        float4 apos;    // Local atom x, y, z, q
        float3 af;      // Local atom fx, fy, fz
        float dx;
        float dy;
        float dz;
        float r2;
        float invR;
        float sig;
        float sig2;
        float sig6;
        float eps;
        float dEdR;
        unsigned int tgx = threadIdx.x & (GRID - 1);    // index within the thread group
        unsigned int tbx = threadIdx.x - tgx;           // first thread of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // this group's GRID records

        if (!bExclusionFlag)
        {
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // NOTE(review): this loop includes the self pair (j == tgx, r2 == 0)
                // with no masking; presumably diagonal tiles always carry the
                // exclusion flag so this path is not taken -- confirm.

                // Read fixed atom data into registers and GRF
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                apos.w *= cSim.epsfac;
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[j].q * invR;
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else        // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                apos.w *= cSim.epsfac;
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }
        else  // bExclusion
        {
            // Read exclusion data
            if (x == y) // Handle diagonals uniquely at 50% efficiency
            {
                // Read fixed atom data into registers and GRF
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                unsigned int i = x + tgx;
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = apos.x;
                sA[threadIdx.x].y = apos.y;
                sA[threadIdx.x].z = apos.z;
                sA[threadIdx.x].q = apos.w;
                sA[threadIdx.x].sig = a.x;
                sA[threadIdx.x].eps = a.y;
                af.x = 0.0f;
                af.y = 0.0f;
                af.z = 0.0f;
                apos.w *= cSim.epsfac;
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx = psA[j].x - apos.x;
                    dy = psA[j].y - apos.y;
                    dz = psA[j].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[j].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[j].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[j].q * invR;
                    dEdR *= invR * invR;
                    // A clear mask bit drops the pair.  Overwriting dEdR (rather
                    // than scaling it) also discards any inf/NaN produced above,
                    // e.g. for the self pair where r2 == 0.
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    excl >>= 1;
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
            else        // 100% utilization
            {
                // Read fixed atom data into registers and GRF
                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
                // Rotate the mask right by tgx so that bit 0 matches the first
                // partner this thread visits (tj starts at tgx).  The left-shift
                // count is masked: for tgx == 0 the unmasked count is GRID, a
                // full-width shift, which is undefined behaviour.
                excl = (excl >> tgx) | (excl << ((GRID - tgx) & (GRID - 1)));
                int j = y + tgx;
                unsigned int i = x + tgx;
                float4 temp = cSim.pPosq[j];
                float2 temp1 = cSim.pAttr[j];
                apos = cSim.pPosq[i];
                float2 a = cSim.pAttr[i];
                sA[threadIdx.x].x = temp.x;
                sA[threadIdx.x].y = temp.y;
                sA[threadIdx.x].z = temp.z;
                sA[threadIdx.x].q = temp.w;
                sA[threadIdx.x].sig = temp1.x;
                sA[threadIdx.x].eps = temp1.y;
                sA[threadIdx.x].fx = af.x = 0.0f;
                sA[threadIdx.x].fy = af.y = 0.0f;
                sA[threadIdx.x].fz = af.z = 0.0f;
                apos.w *= cSim.epsfac;
                for (j = 0; j < GRID; j++)
                {
                    dx = psA[tj].x - apos.x;
                    dy = psA[tj].y - apos.y;
                    dz = psA[tj].z - apos.z;
                    r2 = dx * dx + dy * dy + dz * dz;
                    invR = 1.0f / sqrt(r2);
                    sig = a.x + psA[tj].sig;
                    sig2 = invR * sig;
                    sig2 *= sig2;
                    sig6 = sig2 * sig2 * sig2;
                    eps = a.y * psA[tj].eps;
                    dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
                    dEdR += apos.w * psA[tj].q * invR;
                    dEdR *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
                    }
                    dx *= dEdR;
                    dy *= dEdR;
                    dz *= dEdR;
                    af.x -= dx;
                    af.y -= dy;
                    af.z -= dz;
                    psA[tj].fx += dx;
                    psA[tj].fy += dy;
                    psA[tj].fz += dz;
                    excl >>= 1;
                    tj = sNext[tj];
                }

                // Write results
                float4 of;
                of.x = af.x;
                of.y = af.y;
                of.z = af.z;
                of.w = 0.0f;
                int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
                of.x = sA[threadIdx.x].fx;
                of.y = sA[threadIdx.x].fy;
                of.z = sA[threadIdx.x].fz;
                offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
                cSim.pForce4a[offset] = of;
            }
        }

        pos -= cSim.nonbond_workBlock;
    }
}
// Host-side launcher for kCalculateCDLJForces_12_kernel.
// Grid and block sizes come from gpu->sim (nonbond_blocks,
// nonbond_threads_per_block); LAUNCHERROR (a macro defined outside this file)
// is invoked with the kernel label right after the launch.
void kCalculateCDLJForces_12(gpuContext gpu)
{
    kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateCDLJForces_12");
}
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#include "cudaKernels.h"
// Per-thread shared-memory record used by kCalculateCDLJObcGbsaForces1_kernel.
// Field order is part of the layout and must not change.
struct Atom
{
    float x, y, z;      // position, copied from pPosq.xyz
    float q;            // charge scaled by cSim.epsfac
    float sig, eps;     // Lennard-Jones attributes, copied from pAttr.x / pAttr.y
    float br;           // Born radius, copied from pBornRadii
    float fx, fy, fz;   // NOTE(review): presumably per-atom force accumulators -- confirm
    float fb;           // NOTE(review): presumably the Born-force accumulator -- confirm
    float q2;           // charge scaled by cSim.preFactor
    float junk;         // NOTE(review): presumably padding -- confirm
};
// Shared-memory atom records; the kernel writes entry [threadIdx.x] and reads
// the GRID entries belonging to its own thread group.
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
// This block's slice of cSim.pWorkUnit, copied once at kernel start.
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
// Lookup table filled at kernel start: sNext[k] = (k + 1) & (GRID - 1).
__shared__
unsigned
int
sNext
[
GRID
];
// File-local copy of the simulation parameters in constant memory.
// Written by SetCalculateCDLJObcGbsaForces1Sim and read back by
// GetCalculateCDLJObcGbsaForces1Sim.
static
__constant__
cudaGmxSimulation
cSim
;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim, so the CDLJ + OBC force kernel can read them.
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).
void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory copy, cSim, back into the host-side
// simulation parameters (gpu->sim).
// The CUDA status and a diagnostic message are handed to RTERROR (a macro
// defined outside this file).  The message names the Get path; it previously
// said "SetSim", which pointed at the wrong function when this copy failed.
void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
__global__
void
kCalculateCDLJObcGbsaForces1_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
float
br
=
cSim
.
pBornRadii
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
apos
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
apos
.
w
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
j
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
cSim
.
epsfac
*
temp
.
w
;
sA
[
threadIdx
.
x
].
q2
=
cSim
.
preFactor
*
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
apos
.
w
*
psA
[
tj
].
q2
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
// Kernel variant selected below when gpu->sm_version is not less than SM_12;
// it is defined in kCalculateCDLJObcGbsaForces1_12.cu.
__global__ extern void kCalculateCDLJObcGbsaForces1_12_kernel();

// Host entry point for the combined Coulomb/LJ + ObcGbsa loop-1 force kernel.
//
// If gpu->bRecalculateBornRadii is set, the Born sums are computed and reduced
// first. The kernel variant is then chosen from gpu->sm_version and launched
// with gpu->sim.nonbond_blocks blocks of gpu->sim.nonbond_threads_per_block
// threads; LAUNCHERROR checks the launch afterwards.
//
// The former `if (0) { ... }` debug block (unreachable, with an otherwise
// unused static step counter) has been removed.
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
    // check if Born radii need to be calculated
    if (gpu->bRecalculateBornRadii)
    {
        kCalculateObcGbsaBornSum(gpu);
        kReduceObcGbsaBornSum(gpu);
    }

    if (gpu->sm_version < SM_12)
    {
        kCalculateCDLJObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }

    LAUNCHERROR("kCalculateCDLJObcGbsaForces1");
}
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread shared-memory record used by kCalculateCDLJObcGbsaForces1_12_kernel.
// Field order is part of the layout and must not change.
struct Atom
{
    // Coordinates copied from the x/y/z components of a cSim.pPosq entry.
    float x, y, z;
    // w component of the pPosq entry, stored unscaled.
    float q;
    // Copied from the x and y components of a cSim.pAttr entry.
    float sig, eps;
    // Copied from cSim.pBornRadii.
    float br;
    // Force accumulators updated by the other threads of the group
    // while an off-diagonal tile is processed.
    float fx, fy, fz;
    // Born-force accumulator, updated the same way as fx/fy/fz.
    float fb;
};
// One Atom slot per thread of the block; the kernel indexes it with threadIdx.x.
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// This block's slice of cSim.pWorkUnit, copied in once at kernel start.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Successor table filled by the kernel: sNext[i] == (i + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];

// Device-side copy of the simulation parameters, written by
// SetCalculateCDLJObcGbsaForces1_12Sim and read back by
// GetCalculateCDLJObcGbsaForces1_12Sim.
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// __constant__ copy, cSim. Any CUDA error is handed to RTERROR together with
// the message below.
void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's __constant__ cSim back into the host-side gpu->sim.
// Any CUDA error is handed to RTERROR. The diagnostic now names the Get path;
// it previously said "SetSim", which pointed at the wrong function.
void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
__global__
void
kCalculateCDLJObcGbsaForces1_12_kernel
()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int
pos
=
cSim
.
nbWorkUnitsPerBlock
*
blockIdx
.
x
+
min
(
blockIdx
.
x
,
cSim
.
nbWorkUnitsPerBlockRemainder
);
int
end
=
cSim
.
nbWorkUnitsPerBlock
*
(
blockIdx
.
x
+
1
)
+
min
((
blockIdx
.
x
+
1
),
cSim
.
nbWorkUnitsPerBlockRemainder
);
if
(
threadIdx
.
x
<
end
-
pos
)
{
sWorkUnit
[
threadIdx
.
x
]
=
cSim
.
pWorkUnit
[
pos
+
threadIdx
.
x
];
}
if
(
threadIdx
.
x
<
GRID
)
{
sNext
[
threadIdx
.
x
]
=
(
threadIdx
.
x
+
1
)
&
(
GRID
-
1
);
}
__syncthreads
();
// Now change pos and end to reflect work queue just read
// into shared memory
end
=
end
-
pos
;
pos
=
end
-
(
threadIdx
.
x
>>
GRIDBITS
)
-
1
;
while
(
pos
>=
0
)
{
// Extract cell coordinates from appropriate work unit
unsigned
int
x
=
sWorkUnit
[
pos
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
bool
bExclusionFlag
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
unsigned
int
tgx
=
threadIdx
.
x
&
(
GRID
-
1
);
unsigned
int
i
=
x
+
tgx
;
float4
apos
=
cSim
.
pPosq
[
i
];
float2
a
=
cSim
.
pAttr
[
i
];
float
br
=
cSim
.
pBornRadii
[
i
];
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
int
tj
=
tgx
;
Atom
*
psA
=
&
sA
[
tbx
];
if
(
!
bExclusionFlag
)
{
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
float
q2
=
cSim
.
preFactor
*
apos
.
w
;
apos
.
w
*=
cSim
.
epsfac
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
j
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
int
j
=
y
+
tgx
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
float
q2
=
apos
.
w
*
cSim
.
preFactor
;
apos
.
w
*=
cSim
.
epsfac
;
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
tj
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
else
// bExclusion
{
// Read exclusion data
if
(
x
==
y
)
// Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
af
.
x
=
0.0
f
;
af
.
y
=
0.0
f
;
af
.
z
=
0.0
f
;
af
.
w
=
0.0
f
;
sA
[
threadIdx
.
x
].
x
=
apos
.
x
;
sA
[
threadIdx
.
x
].
y
=
apos
.
y
;
sA
[
threadIdx
.
x
].
z
=
apos
.
z
;
sA
[
threadIdx
.
x
].
q
=
apos
.
w
;
float
q2
=
cSim
.
preFactor
*
apos
.
w
;
apos
.
w
*=
cSim
.
epsfac
;
sA
[
threadIdx
.
x
].
sig
=
a
.
x
;
sA
[
threadIdx
.
x
].
eps
=
a
.
y
;
sA
[
threadIdx
.
x
].
br
=
br
;
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
j
].
x
-
apos
.
x
;
float
dy
=
psA
[
j
].
y
-
apos
.
y
;
float
dz
=
psA
[
j
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
j
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
j
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
j
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
j
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
j
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
j
].
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add Forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
excl
>>=
1
;
}
// Write results
int
offset
=
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned
int
excl
=
cSim
.
pExclusion
[
x
*
cSim
.
exclusionStride
+
y
+
tgx
];
float4
af
;
sA
[
threadIdx
.
x
].
fx
=
af
.
x
=
0.0
f
;
sA
[
threadIdx
.
x
].
fy
=
af
.
y
=
0.0
f
;
sA
[
threadIdx
.
x
].
fz
=
af
.
z
=
0.0
f
;
sA
[
threadIdx
.
x
].
fb
=
af
.
w
=
0.0
f
;
int
j
=
y
+
tgx
;
float
q2
=
cSim
.
preFactor
*
apos
.
w
;
apos
.
w
*=
cSim
.
epsfac
;
float4
temp
=
cSim
.
pPosq
[
j
];
float2
temp1
=
cSim
.
pAttr
[
j
];
sA
[
threadIdx
.
x
].
br
=
cSim
.
pBornRadii
[
j
];
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
GRID
-
tgx
));
sA
[
threadIdx
.
x
].
x
=
temp
.
x
;
sA
[
threadIdx
.
x
].
y
=
temp
.
y
;
sA
[
threadIdx
.
x
].
z
=
temp
.
z
;
sA
[
threadIdx
.
x
].
q
=
temp
.
w
;
sA
[
threadIdx
.
x
].
sig
=
temp1
.
x
;
sA
[
threadIdx
.
x
].
eps
=
temp1
.
y
;
for
(
j
=
0
;
j
<
GRID
;
j
++
)
{
float
dx
=
psA
[
tj
].
x
-
apos
.
x
;
float
dy
=
psA
[
tj
].
y
-
apos
.
y
;
float
dz
=
psA
[
tj
].
z
-
apos
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
// CDLJ part
float
invR
=
1.0
f
/
sqrt
(
r2
);
float
sig
=
a
.
x
+
psA
[
tj
].
sig
;
float
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
a
.
y
*
psA
[
tj
].
eps
;
float
dEdR
=
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
apos
.
w
*
psA
[
tj
].
q
*
invR
;
dEdR
*=
invR
*
invR
;
if
(
!
(
excl
&
0x1
))
{
dEdR
=
0.0
f
;
}
// ObcGbsaForce1 part
float
alpha2_ij
=
br
*
psA
[
tj
].
br
;
float
D_ij
=
r2
/
(
4.0
f
*
alpha2_ij
);
float
expTerm
=
exp
(
-
D_ij
);
float
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
float
denominator
=
sqrt
(
denominator2
);
float
Gpol
=
(
q2
*
psA
[
tj
].
q
)
/
(
denominator
*
denominator2
);
float
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
af
.
w
+=
dGpol_dalpha2_ij
*
psA
[
tj
].
br
;
psA
[
tj
].
fb
+=
dGpol_dalpha2_ij
*
br
;
dEdR
+=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
// Add forces
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
af
.
x
-=
dx
;
af
.
y
-=
dy
;
af
.
z
-=
dz
;
psA
[
tj
].
fx
+=
dx
;
psA
[
tj
].
fy
+=
dy
;
psA
[
tj
].
fz
+=
dz
;
excl
>>=
1
;
tj
=
sNext
[
tj
];
}
// Write results
int
offset
=
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cSim
.
stride
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
af
.
w
;
offset
=
y
+
tgx
+
(
x
>>
GRIDBITS
)
*
cSim
.
stride
;
af
.
x
=
sA
[
threadIdx
.
x
].
fx
;
af
.
y
=
sA
[
threadIdx
.
x
].
fy
;
af
.
z
=
sA
[
threadIdx
.
x
].
fz
;
cSim
.
pForce4a
[
offset
]
=
af
;
cSim
.
pBornForce
[
offset
]
=
sA
[
threadIdx
.
x
].
fb
;
}
}
pos
-=
cSim
.
nonbond_workBlock
;
}
}
// Host wrapper: launches kCalculateCDLJObcGbsaForces1_12_kernel with the
// nonbond grid/block configuration stored in gpu->sim, then lets LAUNCHERROR
// check the launch.
void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu)
{
    kCalculateCDLJObcGbsaForces1_12_kernel<<<
        gpu->sim.nonbond_blocks,
        gpu->sim.nonbond_threads_per_block
    >>>();

    LAUNCHERROR("kCalculateCDLJObcGbsaForces1_12");
}
platforms/cuda/src/kernels/kCalculateLocalForces.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Unsized shared-memory scratch array; the local-forces kernel gives each
// thread the element sV[threadIdx.x] and uses its v0/v1/v2 members as
// temporary vectors.
extern __shared__ Vectors sV[];

// Device-side copy of the simulation parameters, written by
// SetCalculateLocalForcesSim and read back by GetCalculateLocalForcesSim.
static __constant__ cudaGmxSimulation cSim;
// Geometry helper macros for the local (bonded) force kernel.
//
// Vector arguments may be any expression whose value has .x/.y/.z members;
// every macro argument is parenthesized so expressions such as *p or a ? b : c
// expand correctly. The statement-like macros expand to one braced block, so
// they can be used with or without a trailing semicolon and cannot leak
// statements out of an unbraced `if`. Output arguments must be lvalues.

// Dot product of two 3-vectors.
#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)

// dp = cosine of the angle between v1 and v2, clamped to [-1, 1].
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
    (dp)            = DOT3(v1, v2); \
    float norm1     = DOT3(v1, v1); \
    float norm2     = DOT3(v2, v2); \
    (dp)           /= sqrt(norm1 * norm2); \
    (dp)            = min((dp), 1.0f); \
    (dp)            = max((dp), -1.0f); \
}

// c = v1 x v2. c must not alias v1 or v2: its components are written one at
// a time while the inputs are still being read.
#define CROSS_PRODUCT(v1, v2, c) \
{ \
    (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
    (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
    (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
}

// dEdR = param.y * (acos(cosine) - param.x converted from degrees to radians).
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
    float angle         = acos(cosine); \
    float deltaIdeal    = angle - ((param).x * (3.14159265f / 180.0f)); \
    (dEdR)              = (param).y * deltaIdeal; \
}

// angle = unsigned angle (radians) between v1 and v2.
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
    float dp; \
    GETNORMEDDOTPRODUCT(v1, v2, dp); \
    (angle) = acos(dp); \
}

// As GETANGLEBETWEENTWOVECTORS, additionally returning the clamped cosine.
#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
{ \
    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
    (angle) = acos(cosine); \
}

// Dihedral angle defined by three vectors:
//   cp0 = vector1 x vector2, cp1 = vector2 x vector3 (both returned),
//   angle = angle between cp0 and cp1, negated when signVector . cp1 < 0.
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
    float dp = DOT3(signVector, cp1); \
    (angle) = (dp >= 0) ? (angle) : -(angle); \
}

// As GETDIHEDRALANGLEBETWEENTHREEVECTORS, additionally returning the clamped
// cosine of the angle between cp0 and cp1.
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
    float dp = DOT3(signVector, cp1); \
    (angle) = (dp >= 0) ? (angle) : -(angle); \
}
// Uploads the host-side simulation parameters (gpu->sim) into this file's
// __constant__ copy, cSim. Any CUDA error is handed to RTERROR together with
// the message below.
void SetCalculateLocalForcesSim(gpuContext gpu)
{
    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's __constant__ cSim back into the host-side gpu->sim.
// Any CUDA error is handed to RTERROR. The diagnostic now names the Get path;
// it previously said "SetSim", which pointed at the wrong function.
void GetCalculateLocalForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
__global__
void
kCalculateLocalForces_kernel
()
{
unsigned
int
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
Vectors
*
A
=
&
sV
[
threadIdx
.
x
];
while
(
pos
<
cSim
.
bond_offset
)
{
if
(
pos
<
cSim
.
bonds
)
{
int4
atom
=
cSim
.
pBondID
[
pos
];
float4
atomA
=
cSim
.
pPosq
[
atom
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom
.
y
];
float2
bond
=
cSim
.
pBondParameter
[
pos
];
float
dx
=
atomB
.
x
-
atomA
.
x
;
float
dy
=
atomB
.
y
-
atomA
.
y
;
float
dz
=
atomB
.
z
-
atomA
.
z
;
float
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
float
r
=
sqrt
(
r2
);
float
deltaIdeal
=
r
-
bond
.
x
;
float
dEdR
=
bond
.
y
*
deltaIdeal
;
dEdR
=
(
r
>
0.0
f
)
?
(
dEdR
/
r
)
:
0.0
f
;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
dx
*=
dEdR
;
dy
*=
dEdR
;
dz
*=
dEdR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
forceA
.
x
+=
dx
;
forceA
.
y
+=
dy
;
forceA
.
z
+=
dz
;
forceB
.
x
-=
dx
;
forceB
.
y
-=
dy
;
forceB
.
z
-=
dz
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
bond_angle_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
bond_offset
;
if
(
pos1
<
cSim
.
bond_angles
)
{
int4
atom1
=
cSim
.
pBondAngleID1
[
pos1
];
float2
bond_angle
=
cSim
.
pBondAngleParameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom1
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom1
.
y
];
float4
a3
=
cSim
.
pPosq
[
atom1
.
z
];
A
->
v0
.
x
=
a2
.
x
-
a1
.
x
;
A
->
v0
.
y
=
a2
.
y
-
a1
.
y
;
A
->
v0
.
z
=
a2
.
z
-
a1
.
z
;
A
->
v1
.
x
=
a2
.
x
-
a3
.
x
;
A
->
v1
.
y
=
a2
.
y
-
a3
.
y
;
A
->
v1
.
z
=
a2
.
z
-
a3
.
z
;
float3
cp
;
CROSS_PRODUCT
(
A
->
v0
,
A
->
v1
,
cp
);
float
rp
=
DOT3
(
cp
,
cp
);
//cx * cx + cy * cy + cz * cz;
rp
=
max
(
sqrt
(
rp
),
1.0e-06
f
);
float
r21
=
DOT3
(
A
->
v0
,
A
->
v0
);
// dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
float
r23
=
DOT3
(
A
->
v1
,
A
->
v1
);
// dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
float
dot
=
DOT3
(
A
->
v0
,
A
->
v1
);
// dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
float
cosine
=
dot
/
sqrt
(
r21
*
r23
);
float
dEdR
;
GETPREFACTORSGIVENANGLECOSINE
(
cosine
,
bond_angle
,
dEdR
);
//printf("%11.4f %11.4f\n", cosine, dEdR);
float
termA
=
dEdR
/
(
r21
*
rp
);
float
termC
=
-
dEdR
/
(
r23
*
rp
);
float3
c21
;
float3
c23
;
CROSS_PRODUCT
(
A
->
v0
,
cp
,
c21
);
CROSS_PRODUCT
(
A
->
v1
,
cp
,
c23
);
c21
.
x
*=
termA
;
c21
.
y
*=
termA
;
c21
.
z
*=
termA
;
c23
.
x
*=
termC
;
c23
.
y
*=
termC
;
c23
.
z
*=
termC
;
int2
atom2
=
cSim
.
pBondAngleID2
[
pos1
];
unsigned
int
offset
=
atom1
.
x
+
atom1
.
w
*
cSim
.
stride
;
float4
force
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom1
.
w
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
c21
.
x
;
force
.
y
+=
c21
.
y
;
force
.
z
+=
c21
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
offset
=
atom1
.
y
+
atom2
.
x
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
x
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
-=
(
c21
.
x
+
c23
.
x
);
force
.
y
-=
(
c21
.
y
+
c23
.
y
);
force
.
z
-=
(
c21
.
z
+
c23
.
z
);
cSim
.
pForce4
[
offset
]
=
force
;
offset
=
atom1
.
z
+
atom2
.
y
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
y
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
c23
.
x
;
force
.
y
+=
c23
.
y
;
force
.
z
+=
c23
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
dihedral_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
bond_angle_offset
;
if
(
pos1
<
cSim
.
dihedrals
)
{
int4
atom1
=
cSim
.
pDihedralID1
[
pos1
];
float4
atomA
=
cSim
.
pPosq
[
atom1
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom1
.
y
];
float4
atomC
=
cSim
.
pPosq
[
atom1
.
z
];
float4
atomD
=
cSim
.
pPosq
[
atom1
.
w
];
A
->
v0
.
x
=
atomA
.
x
-
atomB
.
x
;
A
->
v0
.
y
=
atomA
.
y
-
atomB
.
y
;
A
->
v0
.
z
=
atomA
.
z
-
atomB
.
z
;
A
->
v1
.
x
=
atomC
.
x
-
atomB
.
x
;
A
->
v1
.
y
=
atomC
.
y
-
atomB
.
y
;
A
->
v1
.
z
=
atomC
.
z
-
atomB
.
z
;
A
->
v2
.
x
=
atomC
.
x
-
atomD
.
x
;
A
->
v2
.
y
=
atomC
.
y
-
atomD
.
y
;
A
->
v2
.
z
=
atomC
.
z
-
atomD
.
z
;
float3
cp0
,
cp1
;
float
dihedralAngle
;
GETDIHEDRALANGLEBETWEENTHREEVECTORS
(
A
->
v0
,
A
->
v1
,
A
->
v2
,
A
->
v0
,
cp0
,
cp1
,
dihedralAngle
);
float4
dihedral
=
cSim
.
pDihedralParameter
[
pos1
];
float
deltaAngle
=
dihedral
.
z
*
dihedralAngle
-
(
dihedral
.
y
*
3.14159265
f
/
180.0
f
);
float
sinDeltaAngle
=
sin
(
deltaAngle
);
float
dEdAngle
=
-
dihedral
.
x
*
dihedral
.
z
*
sinDeltaAngle
;
float
normCross1
=
DOT3
(
cp0
,
cp0
);
float
normBC
=
sqrt
(
DOT3
(
A
->
v1
,
A
->
v1
));
float4
ff
;
ff
.
x
=
(
-
dEdAngle
*
normBC
)
/
normCross1
;
float
normCross2
=
DOT3
(
cp1
,
cp1
);
ff
.
w
=
(
dEdAngle
*
normBC
)
/
normCross2
;
float
dp
=
1.0
f
/
DOT3
(
A
->
v1
,
A
->
v1
);
ff
.
y
=
DOT3
(
A
->
v0
,
A
->
v1
)
*
dp
;
ff
.
z
=
DOT3
(
A
->
v2
,
A
->
v1
)
*
dp
;
int4
atom2
=
cSim
.
pDihedralID2
[
pos1
];
float3
internalF0
;
float3
internalF3
;
float3
s
;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned
int
offset
=
atom1
.
x
+
atom2
.
x
*
cSim
.
stride
;
float4
force
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom2
.
x
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF0
.
x
=
ff
.
x
*
cp0
.
x
;
force
.
x
+=
internalF0
.
x
;
internalF0
.
y
=
ff
.
x
*
cp0
.
y
;
force
.
y
+=
internalF0
.
y
;
internalF0
.
z
=
ff
.
x
*
cp0
.
z
;
force
.
z
+=
internalF0
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
w
+
atom2
.
w
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
w
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF3
.
x
=
ff
.
w
*
cp1
.
x
;
force
.
x
+=
internalF3
.
x
;
internalF3
.
y
=
ff
.
w
*
cp1
.
y
;
force
.
y
+=
internalF3
.
y
;
internalF3
.
z
=
ff
.
w
*
cp1
.
z
;
force
.
z
+=
internalF3
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s
.
x
=
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
;
s
.
y
=
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
;
s
.
z
=
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
;
offset
=
atom1
.
y
+
atom2
.
y
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
y
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF0
.
x
+
s
.
x
;
force
.
y
+=
-
internalF0
.
y
+
s
.
y
;
force
.
z
+=
-
internalF0
.
z
+
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
z
+
atom2
.
z
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
z
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF3
.
x
-
s
.
x
;
force
.
y
+=
-
internalF3
.
y
-
s
.
y
;
force
.
z
+=
-
internalF3
.
z
-
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
//printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
rb_dihedral_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
dihedral_offset
;
if
(
pos1
<
cSim
.
rb_dihedrals
)
{
int4
atom1
=
cSim
.
pRbDihedralID1
[
pos1
];
float4
atomA
=
cSim
.
pPosq
[
atom1
.
x
];
float4
atomB
=
cSim
.
pPosq
[
atom1
.
y
];
float4
atomC
=
cSim
.
pPosq
[
atom1
.
z
];
float4
atomD
=
cSim
.
pPosq
[
atom1
.
w
];
A
->
v0
.
x
=
atomA
.
x
-
atomB
.
x
;
A
->
v0
.
y
=
atomA
.
y
-
atomB
.
y
;
A
->
v0
.
z
=
atomA
.
z
-
atomB
.
z
;
A
->
v1
.
x
=
atomC
.
x
-
atomB
.
x
;
A
->
v1
.
y
=
atomC
.
y
-
atomB
.
y
;
A
->
v1
.
z
=
atomC
.
z
-
atomB
.
z
;
A
->
v2
.
x
=
atomC
.
x
-
atomD
.
x
;
A
->
v2
.
y
=
atomC
.
y
-
atomD
.
y
;
A
->
v2
.
z
=
atomC
.
z
-
atomD
.
z
;
float3
cp0
,
cp1
;
float
dihedralAngle
,
cosPhi
;
// printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
// printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
// printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS
(
A
->
v0
,
A
->
v1
,
A
->
v2
,
A
->
v0
,
cp0
,
cp1
,
dihedralAngle
,
cosPhi
);
if
(
dihedralAngle
<
0.0
f
)
{
dihedralAngle
+=
3.14159265
f
;
}
else
{
dihedralAngle
-=
3.14159265
f
;
}
cosPhi
=
-
cosPhi
;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
float4
dihedral1
=
cSim
.
pRbDihedralParameter1
[
pos1
];
float2
dihedral2
=
cSim
.
pRbDihedralParameter2
[
pos1
];
float
cosFactor
=
cosPhi
;
float
dEdAngle
=
-
dihedral1
.
y
;
// printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
dEdAngle
-=
2.0
f
*
dihedral1
.
z
*
cosFactor
;
// printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
3.0
f
*
dihedral1
.
w
*
cosFactor
;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
4.0
f
*
dihedral2
.
x
*
cosFactor
;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor
*=
cosPhi
;
dEdAngle
-=
5.0
f
*
dihedral2
.
y
*
cosFactor
;
// printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
dEdAngle
*=
sin
(
dihedralAngle
);
// printf("%4d - f: %9.4f\n", pos1, dEdAngle);
float
normCross1
=
DOT3
(
cp0
,
cp0
);
float
normBC
=
sqrt
(
DOT3
(
A
->
v1
,
A
->
v1
));
float4
ff
;
ff
.
x
=
(
-
dEdAngle
*
normBC
)
/
normCross1
;
float
normCross2
=
DOT3
(
cp1
,
cp1
);
ff
.
w
=
(
dEdAngle
*
normBC
)
/
normCross2
;
float
dp
=
1.0
f
/
DOT3
(
A
->
v1
,
A
->
v1
);
ff
.
y
=
DOT3
(
A
->
v0
,
A
->
v1
)
*
dp
;
ff
.
z
=
DOT3
(
A
->
v2
,
A
->
v1
)
*
dp
;
int4
atom2
=
cSim
.
pRbDihedralID2
[
pos1
];
float3
internalF0
;
float3
internalF3
;
float3
s
;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned
int
offset
=
atom1
.
x
+
atom2
.
x
*
cSim
.
stride
;
float4
force
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom2
.
x
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF0
.
x
=
ff
.
x
*
cp0
.
x
;
force
.
x
+=
internalF0
.
x
;
internalF0
.
y
=
ff
.
x
*
cp0
.
y
;
force
.
y
+=
internalF0
.
y
;
internalF0
.
z
=
ff
.
x
*
cp0
.
z
;
force
.
z
+=
internalF0
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
w
+
atom2
.
w
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
w
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
internalF3
.
x
=
ff
.
w
*
cp1
.
x
;
force
.
x
+=
internalF3
.
x
;
internalF3
.
y
=
ff
.
w
*
cp1
.
y
;
force
.
y
+=
internalF3
.
y
;
internalF3
.
z
=
ff
.
w
*
cp1
.
z
;
force
.
z
+=
internalF3
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s
.
x
=
ff
.
y
*
internalF0
.
x
-
ff
.
z
*
internalF3
.
x
;
s
.
y
=
ff
.
y
*
internalF0
.
y
-
ff
.
z
*
internalF3
.
y
;
s
.
z
=
ff
.
y
*
internalF0
.
z
-
ff
.
z
*
internalF3
.
z
;
offset
=
atom1
.
y
+
atom2
.
y
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
y
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF0
.
x
+
s
.
x
;
force
.
y
+=
-
internalF0
.
y
+
s
.
y
;
force
.
z
+=
-
internalF0
.
z
+
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
offset
=
atom1
.
z
+
atom2
.
z
*
cSim
.
stride
;
force
.
x
=
force
.
y
=
force
.
z
=
0.0
f
;
if
(
atom2
.
z
<
cSim
.
totalNonbondOutputBuffers
)
force
=
cSim
.
pForce4
[
offset
];
force
.
x
+=
-
internalF3
.
x
-
s
.
x
;
force
.
y
+=
-
internalF3
.
y
-
s
.
y
;
force
.
z
+=
-
internalF3
.
z
-
s
.
z
;
cSim
.
pForce4
[
offset
]
=
force
;
// printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
while
(
pos
<
cSim
.
LJ14_offset
)
{
unsigned
int
pos1
=
pos
-
cSim
.
rb_dihedral_offset
;
if
(
pos1
<
cSim
.
LJ14s
)
{
int4
atom
=
cSim
.
pLJ14ID
[
pos1
];
float4
LJ14
=
cSim
.
pLJ14Parameter
[
pos1
];
float4
a1
=
cSim
.
pPosq
[
atom
.
x
];
float4
a2
=
cSim
.
pPosq
[
atom
.
y
];
float3
d
;
d
.
x
=
a1
.
x
-
a2
.
x
;
d
.
y
=
a1
.
y
-
a2
.
y
;
d
.
z
=
a1
.
z
-
a2
.
z
;
float
r2
=
DOT3
(
d
,
d
);
float
inverseR
=
1.0
f
/
sqrt
(
r2
);
float
sig2
=
inverseR
*
LJ14
.
y
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
dEdR
=
LJ14
.
x
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
+=
LJ14
.
z
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
unsigned
int
offsetA
=
atom
.
x
+
atom
.
z
*
cSim
.
stride
;
unsigned
int
offsetB
=
atom
.
y
+
atom
.
w
*
cSim
.
stride
;
float4
forceA
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
z
<
cSim
.
totalNonbondOutputBuffers
)
forceA
=
cSim
.
pForce4
[
offsetA
];
float4
forceB
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
if
(
atom
.
w
<
cSim
.
totalNonbondOutputBuffers
)
forceB
=
cSim
.
pForce4
[
offsetB
];
d
.
x
*=
dEdR
;
d
.
y
*=
dEdR
;
d
.
z
*=
dEdR
;
forceA
.
x
+=
d
.
x
;
forceA
.
y
+=
d
.
y
;
forceA
.
z
+=
d
.
z
;
forceB
.
x
-=
d
.
x
;
forceB
.
y
-=
d
.
y
;
forceB
.
z
-=
d
.
z
;
cSim
.
pForce4
[
offsetA
]
=
forceA
;
cSim
.
pForce4
[
offsetB
]
=
forceB
;
}
pos
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
/**
 * Host-side launcher for kCalculateLocalForces_kernel.
 *
 * The grid uses gpu->sim.blocks blocks of gpu->sim.localForces_threads_per_block
 * threads, and the launch requests sizeof(Vectors) bytes of dynamic shared
 * memory for every thread in a block. Any launch failure is reported through
 * the LAUNCHERROR macro.
 *
 * @param gpu  context whose `sim` member supplies the launch configuration
 */
void kCalculateLocalForces(gpuContext gpu)
{
    kCalculateLocalForces_kernel
        <<<gpu->sim.blocks,
           gpu->sim.localForces_threads_per_block,
           gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();

    LAUNCHERROR("kCalculateLocalForces");
}
platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread record used by the Born-sum kernel in this file.
// Field order and count are kept exactly as before (seven floats).
struct Atom
{
    float x, y, z;  // position, copied from cSim.pPosq
    float r;        // first component of the atom's cSim.pObcData entry
    float sr;       // second component of the atom's cSim.pObcData entry
    float sum;      // Born-sum contribution accumulated for this atom
    float junk;     // not referenced by the kernel in this file; presumably padding - TODO confirm
};
// Shared array of Atom records, sized GT2XX_NONBOND_THREADS_PER_BLOCK;
// the Born-sum kernel indexes it with threadIdx.x.
__shared__
Atom
sA
[
GT2XX_NONBOND_THREADS_PER_BLOCK
];
// Shared copy of this block's slice of cSim.pWorkUnit (packed tile coordinates).
__shared__
unsigned
int
sWorkUnit
[
GT2XX_NONBOND_WORKUNITS_PER_SM
];
// Lookup table of GRID entries giving the next slot to visit when a thread
// walks the other tile's atoms; filled at kernel start.
__shared__
unsigned
int
sNext
[
GRID
];
// Constant-memory copy of the simulation parameters; written by
// SetCalculateObcGbsaBornSumSim and read back by GetCalculateObcGbsaBornSumSim.
static
__constant__
cudaGmxSimulation
cSim
;
/**
 * Upload the host-side simulation parameters into this file's constant-memory
 * copy (cSim).
 *
 * @param gpu  context whose `sim` member is copied to the device symbol
 */
void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
    const cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
/**
 * Read this file's constant-memory simulation parameters (cSim) back into
 * the host-side context.
 *
 * The failure message previously said "SetSim" (copied from the Set routine),
 * which pointed at the wrong function when a read-back failed.
 *
 * @param gpu  context whose `sim` member is overwritten with the device copy
 */
void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
/**
 * For every atom, add up its partial Born sums across the output buffers and
 * convert the total into a Born radius (cSim.pBornRadii) and an OBC chain
 * term (cSim.pObcChain).
 *
 * Each thread starts at its global thread index and advances by the total
 * number of threads in the grid until it passes cSim.atoms.
 */
__global__ void kReduceObcGbsaBornSum_kernel()
{
    const unsigned int threadsInGrid = gridDim.x * blockDim.x;

    for (unsigned int atomIndex = blockIdx.x * blockDim.x + threadIdx.x;
         atomIndex < cSim.atoms;
         atomIndex += threadsInGrid)
    {
        const float2 obc = cSim.pObcData[atomIndex];

        // Partial sums for one atom sit cSim.stride elements apart, one per buffer.
        float bornSum   = 0.0f;
        float* pPartial = cSim.pBornSum + atomIndex;
        for (int buffer = 0; buffer < cSim.nonbondOutputBuffers; buffer++)
        {
            bornSum  += *pPartial;
            pPartial += cSim.stride;
        }

        // Born radius and OBC chain term from the scaled sum.
        bornSum *= 0.5f * obc.x;
        const float bornSum2        = bornSum * bornSum;
        const float bornSum3        = bornSum * bornSum2;
        const float tanhSum         = tanh(cSim.alphaOBC * bornSum - cSim.betaOBC * bornSum2 + cSim.gammaOBC * bornSum3);
        const float nonOffsetRadius = obc.x + cSim.dielectricOffset;
        const float chainPolynomial = obc.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * bornSum + 3.0f * cSim.gammaOBC * bornSum2);

        cSim.pBornRadii[atomIndex] = 1.0f / (1.0f / obc.x - tanhSum / nonOffsetRadius);
        cSim.pObcChain[atomIndex]  = (1.0f - tanhSum * tanhSum) * chainPolynomial / nonOffsetRadius;
    }
}
/**
 * Host-side launcher for kReduceObcGbsaBornSum_kernel.
 *
 * Launches gpu->sim.blocks blocks of 384 threads, clears
 * gpu->bRecalculateBornRadii, and reports launch failures through LAUNCHERROR.
 * The `if (0)` section is a disabled debugging aid that dumps several device
 * arrays to files; it is kept so it can be re-enabled by hand.
 *
 * @param gpu  context supplying the launch configuration and device arrays
 */
void kReduceObcGbsaBornSum(gpuContext gpu)
{
    kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
    gpu->bRecalculateBornRadii = false;

    if (0)
    {
        // Disabled diagnostics: the counter tags each dump with a call number.
        static int dumpStep = 0;
        int printCount      = -1;
        dumpStep++;
        WriteArrayToFile1(gpu, "ObcGbsaBornBRad", dumpStep, gpu->psBornRadii, printCount);
        WriteArrayToFile1(gpu, "ObcGbsaBornSum", dumpStep, gpu->psBornSum, printCount);
        WriteArrayToFile2(gpu, "ObcGbsaObcData", dumpStep, gpu->psObcData, printCount);
        WriteArrayToFile4(gpu, "ObcGbsaBornPos", dumpStep, gpu->psPosq4, printCount);
        gpuDumpObcInfo(gpu);
    }

    LAUNCHERROR("kReduceObcGbsaBornSum");
}
/**
 * Accumulate pairwise Born-sum (descreening) contributions tile by tile.
 *
 * Each block copies its slice of cSim.pWorkUnit into shared memory. Every
 * group of GRID threads then processes work units from that slice: a work
 * unit packs the x tile index in bits 17 and up and the y tile index in
 * bits 2..16. For a diagonal tile (x == y) each thread sums the contributions
 * of the other atoms of the tile onto its own atom. For an off-diagonal tile
 * each thread keeps atom i = x + tgx in registers and stages atom
 * j = y + tgx in shared memory, accumulating both directions of every pair.
 * Results go to cSim.pBornSum, in the output buffer selected by the partner
 * tile index times cSim.stride.
 *
 * Fix: the "atom i engulfed by j" test in the diagonal branch compared
 * against psA[j].r (atom j's unscaled radius); it now uses psA[j].sr, the
 * scaled radius, matching the off-diagonal branch and the l_ij/u_ij terms.
 *
 * NOTE(review): threads read other threads' sA entries without a
 * __syncthreads() after writing them; presumably this relies on GRID
 * matching the hardware warp width - confirm.
 */
__global__ void kCalculateObcGbsaBornSum_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
    int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Cyclic "previous slot" table used to rotate through the partner tile.
        sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x   = sWorkUnit[pos];
        unsigned int y   = ((x >> 2) & 0x7fff) << GRIDBITS;
        x                = (x >> 17) << GRIDBITS;
        float dx;
        float dy;
        float dz;
        float r2;
        float r;
        unsigned int tgx = threadIdx.x & (GRID - 1);    // index within the GRID-thread group
        unsigned int tbx = threadIdx.x - tgx;           // first sA slot of this group
        int tj           = tgx;
        Atom* psA        = &sA[tbx];

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            unsigned int i = x + tgx;
            float4 apos    = cSim.pPosq[i];     // Local atom x, y, z, sum
            float2 ar      = cSim.pObcData[i];  // Local atom vr, sr
            sA[threadIdx.x].x  = apos.x;
            sA[threadIdx.x].y  = apos.y;
            sA[threadIdx.x].z  = apos.z;
            sA[threadIdx.x].r  = ar.x;
            sA[threadIdx.x].sr = ar.y;
            apos.w = 0.0f;                      // w is reused as the running sum

            for (unsigned int j = 0; j < GRID; j++)
            {
                dx = psA[j].x - apos.x;
                dy = psA[j].y - apos.y;
                dz = psA[j].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r  = sqrt(r2);
                float rInverse       = 1.0f / r;
                float rScaledRadiusJ = r + psA[j].sr;
                // Skip the self pair (r == 0) and atoms too far away to descreen.
                if ((j != tgx) && (ar.x < rScaledRadiusJ))
                {
                    float l_ij  = 1.0f / max(ar.x, fabs(r - psA[j].sr));
                    float u_ij  = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    apos.w += l_ij - u_ij + 0.25f * r * (u_ij2 - l_ij2) + (0.50f * rInverse * ratio) + (0.25f * psA[j].sr * psA[j].sr * rInverse) * (l_ij2 - u_ij2);
                    // Extra term when atom i lies inside atom j's scaled sphere.
                    if (ar.x < (psA[j].sr - r))
                    {
                        apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                }
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
        }
        else // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            int j          = y + tgx;
            unsigned int i = x + tgx;
            float4 temp    = cSim.pPosq[j];
            float2 temp1   = cSim.pObcData[j];
            float4 apos    = cSim.pPosq[i];     // Local atom x, y, z, sum
            float2 ar      = cSim.pObcData[i];  // Local atom vr, sr
            sA[threadIdx.x].x   = temp.x;
            sA[threadIdx.x].y   = temp.y;
            sA[threadIdx.x].z   = temp.z;
            sA[threadIdx.x].r   = temp1.x;
            sA[threadIdx.x].sr  = temp1.y;
            sA[threadIdx.x].sum = apos.w = 0.0f;

            for (unsigned int k = 0; k < GRID; k++)
            {
                dx = psA[tj].x - apos.x;
                dy = psA[tj].y - apos.y;
                dz = psA[tj].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r  = sqrt(r2);
                float rInverse       = 1.0f / r;

                // Contribution of atom tj onto the register atom i.
                float rScaledRadiusJ = r + psA[tj].sr;
                if (ar.x < rScaledRadiusJ)
                {
                    float l_ij  = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
                    float u_ij  = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term  = l_ij - u_ij + 0.25f * r * (u_ij2 - l_ij2) + (0.50f * rInverse * ratio) + (0.25f * psA[tj].sr * psA[tj].sr * rInverse) * (l_ij2 - u_ij2);
                    if (ar.x < (psA[tj].sr - r))
                    {
                        term += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                    apos.w += term;
                }

                // Contribution of the register atom i onto atom tj.
                float rScaledRadiusI = r + ar.y;
                if (psA[tj].r < rScaledRadiusI)
                {
                    float l_ij  = 1.0f / max(psA[tj].r, fabs(r - ar.y));
                    float u_ij  = 1.0f / rScaledRadiusI;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term  = l_ij - u_ij + 0.25f * r * (u_ij2 - l_ij2) + (0.50f * rInverse * ratio) + (0.25f * ar.y * ar.y * rInverse) * (l_ij2 - u_ij2);
                    if (psA[tj].r < (ar.y - r))
                    {
                        term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
                    }
                    psA[tj].sum += term;
                }
                tj = sNext[tj];
            }

            // Write results
            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
        }
        pos -= cSim.nonbond_workBlock;
    }
}
/**
 * Host-side launcher for kCalculateObcGbsaBornSum_kernel.
 *
 * Uses gpu->sim.nonbond_blocks blocks of gpu->sim.nonbond_threads_per_block
 * threads and reports launch failures through LAUNCHERROR. The error label
 * now carries this function's real name (it used to read
 * "kCalculateBornSum", which matches nothing in the file).
 *
 * @param gpu  context whose `sim` member supplies the launch configuration
 */
void kCalculateObcGbsaBornSum(gpuContext gpu)
{
    kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateObcGbsaBornSum");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Per-thread record used by kCalculateObcGbsaForces1_kernel.
// Field order and count are kept exactly as before (nine floats).
struct Atom
{
    float x, y, z;      // position, copied from cSim.pPosq (.x/.y/.z)
    float q;            // fourth component (.w) of the atom's cSim.pPosq entry
    float br;           // the atom's cSim.pBornRadii entry
    float fx, fy, fz;   // force accumulated on this atom by the partner tile
    float fb;           // accumulated Born-force term for this atom
};
// Shared array of Atom records, sized G8X_NONBOND_THREADS_PER_BLOCK;
// kCalculateObcGbsaForces1_kernel indexes it with threadIdx.x.
__shared__
Atom
sA
[
G8X_NONBOND_THREADS_PER_BLOCK
];
// Shared copy of this block's slice of cSim.pWorkUnit (packed tile coordinates).
__shared__
unsigned
int
sWorkUnit
[
G8X_NONBOND_WORKUNITS_PER_SM
];
// Lookup table of GRID entries giving the next slot to visit when a thread
// walks the partner tile; filled at kernel start.
__shared__
unsigned
int
sNext
[
GRID
];
// Constant-memory copy of the simulation parameters; written by
// SetCalculateObcGbsaForces1Sim and read back by GetCalculateObcGbsaForces1Sim.
static
__constant__
cudaGmxSimulation
cSim
;
/**
 * Upload the host-side simulation parameters into this file's constant-memory
 * copy (cSim).
 *
 * @param gpu  context whose `sim` member is copied to the device symbol
 */
void SetCalculateObcGbsaForces1Sim(gpuContext gpu)
{
    const cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
/**
 * Read this file's constant-memory simulation parameters (cSim) back into
 * the host-side context.
 *
 * The failure message previously said "SetSim" (copied from the Set routine),
 * which pointed at the wrong function when a read-back failed.
 *
 * @param gpu  context whose `sim` member is overwritten with the device copy
 */
void GetCalculateObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
/**
 * For every atom, add up its partial Born forces across the output buffers,
 * add a surface-area term, scale by bornRadius^2 * obcChain, and store the
 * result in the first buffer of cSim.pBornForce.
 *
 * The partial sums are read four, then two, then one at a time; the grouping
 * of the additions is kept exactly so floating-point results do not change.
 */
__global__ void kReduceObcGbsaBornForces_kernel()
{
    for (unsigned int atomIndex = blockIdx.x * blockDim.x + threadIdx.x;
         atomIndex < cSim.atoms;
         atomIndex += gridDim.x * blockDim.x)
    {
        const float bornRadius = cSim.pBornRadii[atomIndex];
        const float obcChain   = cSim.pObcChain[atomIndex];
        const float2 obcData   = cSim.pObcData[atomIndex];

        // Walk this atom's entries, cSim.stride apart, one per output buffer.
        float totalForce = 0.0f;
        float* pPartial  = cSim.pBornForce + atomIndex;
        int remaining    = cSim.nonbondOutputBuffers;

        for (; remaining >= 4; remaining -= 4)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            float f3 = *pPartial;
            pPartial += cSim.stride;
            float f4 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2 + f3 + f4;
        }
        if (remaining >= 2)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            totalForce += *pPartial;
        }

        // Surface-area term built from the probe-extended radius and
        // (radius / bornRadius)^6.
        const float r      = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
        const float ratio6 = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
        const float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;
        totalForce += saTerm / bornRadius;

        totalForce *= bornRadius * bornRadius * obcChain;
        cSim.pBornForce[atomIndex] = totalForce;
    }
}
/**
 * Variant of the Born-force reduction without the surface-area term: for
 * every atom, add up its partial Born forces across the output buffers,
 * scale by bornRadius^2 * obcChain, and store the result in the first
 * buffer of cSim.pBornForce.
 *
 * The grouping of the additions (four, two, one) is kept exactly so
 * floating-point results do not change.
 */
__global__ void kReduceObcGbsaBornForces1_kernel()
{
    for (unsigned int atomIndex = blockIdx.x * blockDim.x + threadIdx.x;
         atomIndex < cSim.atoms;
         atomIndex += gridDim.x * blockDim.x)
    {
        const float bornRadius = cSim.pBornRadii[atomIndex];
        const float obcChain   = cSim.pObcChain[atomIndex];

        // Walk this atom's entries, cSim.stride apart, one per output buffer.
        float totalForce = 0.0f;
        float* pPartial  = cSim.pBornForce + atomIndex;
        int remaining    = cSim.nonbondOutputBuffers;

        for (; remaining >= 4; remaining -= 4)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            float f3 = *pPartial;
            pPartial += cSim.stride;
            float f4 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2 + f3 + f4;
        }
        if (remaining >= 2)
        {
            float f1 = *pPartial;
            pPartial += cSim.stride;
            float f2 = *pPartial;
            pPartial += cSim.stride;
            totalForce += f1 + f2;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            totalForce += *pPartial;
        }

        totalForce *= bornRadius * bornRadius * obcChain;
        cSim.pBornForce[atomIndex] = totalForce;
    }
}
/**
 * For every atom, take the value already stored in cSim.pBornForce, add a
 * surface-area term, scale by bornRadius^2 * obcChain, and write it back
 * in place.
 *
 * Each thread starts at its global thread index and advances by the total
 * number of threads in the grid until it passes cSim.atoms.
 */
__global__ void kAceGbsa_kernel()
{
    for (unsigned int atomIndex = blockIdx.x * blockDim.x + threadIdx.x;
         atomIndex < cSim.atoms;
         atomIndex += gridDim.x * blockDim.x)
    {
        const float bornRadius  = cSim.pBornRadii[atomIndex];
        const float obcChain    = cSim.pObcChain[atomIndex];
        const float2 obcData    = cSim.pObcData[atomIndex];
        const float storedForce = cSim.pBornForce[atomIndex];

        // Surface-area term built from the probe-extended radius and
        // (radius / bornRadius)^6.
        const float r      = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
        const float ratio6 = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
        const float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;

        const float withSurfaceArea = storedForce + saTerm / bornRadius;
        cSim.pBornForce[atomIndex]  = withSurfaceArea * (bornRadius * bornRadius * obcChain);
    }
}
/**
 * Host-side launcher for kReduceObcGbsaBornForces_kernel.
 *
 * Uses gpu->sim.blocks blocks of gpu->sim.bf_reduce_threads_per_block threads.
 * The launch is now followed by LAUNCHERROR, like the other launchers in this
 * file; previously a failed launch here went unreported.
 *
 * kReduceObcGbsaBornForces1_kernel and kAceGbsa_kernel, defined in this file,
 * are not launched from here.
 *
 * @param gpu  context whose `sim` member supplies the launch configuration
 */
void kReduceObcGbsaBornForces(gpuContext gpu)
{
    kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
    LAUNCHERROR("kReduceObcGbsaBornForces");
}
/**
 * First OBC/GBSA force pass: pairwise polarization forces and Born-force
 * terms, computed tile by tile.
 *
 * Each block copies its range of cSim.pWorkUnit into shared memory. Every
 * group of GRID threads then processes work units from that range; a work
 * unit packs the x tile index in bits 17 and up and the y tile index in
 * bits 2..16. Diagonal tiles (x == y) accumulate only onto the thread's own
 * atom. Off-diagonal tiles keep atom x + lane in registers, stage atom
 * y + lane in shared memory, and accumulate equal and opposite forces on
 * both. Results are written to cSim.pForce4a and cSim.pBornForce in the
 * output buffer selected by the partner tile index times cSim.stride.
 *
 * NOTE(review): threads read and update other threads' sA entries without
 * __syncthreads() inside the pair loops; presumably this relies on GRID
 * matching the hardware warp width - confirm.
 */
__global__ void kCalculateObcGbsaForces1_kernel()
{
    // Range of work units owned by this block.
    int firstUnit = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int endUnit   = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);

    // Stage the work queue once so the rest of the kernel needs no further block-wide sync.
    if (threadIdx.x < endUnit - firstUnit)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[firstUnit + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Cyclic "next slot" table used to rotate through the partner tile.
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // From here on, indices refer to the staged queue in shared memory.
    const int unitCount = endUnit - firstUnit;

    for (int unit = unitCount - (threadIdx.x >> GRIDBITS) - 1;
         unit >= 0;
         unit -= cSim.nonbond_workBlock)
    {
        // Unpack the tile coordinates of this work unit.
        const unsigned int packed = sWorkUnit[unit];
        const unsigned int y      = ((packed >> 2) & 0x7fff) << GRIDBITS;
        const unsigned int x      = (packed >> 17) << GRIDBITS;

        const unsigned int lane      = threadIdx.x & (GRID - 1);  // index within the GRID-thread group
        const unsigned int groupBase = threadIdx.x - lane;        // first sA slot of this group
        Atom* tile                   = sA + groupBase;
        Atom& mine                   = sA[threadIdx.x];

        float4 self;    // own atom: x, y, z, q (q later scaled by cSim.preFactor)
        float4 accum;   // own atom: fx, fy, fz, fb

        if (x == y)
        {
            // Diagonal tile: every thread interacts its atom with all GRID atoms of the same tile.
            const unsigned int i = x + lane;
            self                 = cSim.pPosq[i];
            const float selfBr   = cSim.pBornRadii[i];
            mine.x  = self.x;
            mine.y  = self.y;
            mine.z  = self.z;
            mine.q  = self.w;
            mine.br = selfBr;
            accum.x = 0.0f;
            accum.y = 0.0f;
            accum.z = 0.0f;
            accum.w = 0.0f;
            self.w *= cSim.preFactor;

            for (unsigned int j = 0; j < GRID; j++)
            {
                float dx               = tile[j].x - self.x;
                float dy               = tile[j].y - self.y;
                float dz               = tile[j].z - self.z;
                float r2               = dx * dx + dy * dy + dz * dz;
                float alpha2_ij        = selfBr * tile[j].br;
                float D_ij             = r2 / (4.0f * alpha2_ij);
                float expTerm          = exp(-D_ij);
                float denominator2     = r2 + alpha2_ij * expTerm;
                float denominator      = sqrt(denominator2);
                float Gpol             = (self.w * tile[j].q) / (denominator * denominator2);
                float dGpol_dr         = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                accum.x -= dx;
                accum.y -= dy;
                accum.z -= dz;
                accum.w += dGpol_dalpha2_ij * tile[j].br;
            }

            // Store into the buffer indexed by this tile's own index.
            int offset = x + lane + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset]   = accum;
            cSim.pBornForce[offset] = accum.w;
        }
        else
        {
            // Off-diagonal tile: own atom from tile x in registers, partner atom from tile y in shared memory.
            const int partnerIndex = y + lane;
            const unsigned int i   = x + lane;
            const float4 partner   = cSim.pPosq[partnerIndex];
            const float partnerBr  = cSim.pBornRadii[partnerIndex];
            self                   = cSim.pPosq[i];
            const float selfBr     = cSim.pBornRadii[i];
            mine.x  = partner.x;
            mine.y  = partner.y;
            mine.z  = partner.z;
            mine.q  = partner.w;
            mine.br = partnerBr;
            accum.x = 0.0f;
            accum.y = 0.0f;
            accum.z = 0.0f;
            accum.w = 0.0f;
            mine.fx = 0.0f;
            mine.fy = 0.0f;
            mine.fz = 0.0f;
            mine.fb = 0.0f;
            self.w *= cSim.preFactor;

            // Start at this thread's own slot and follow sNext around the tile.
            int tj = lane;
            for (int step = 0; step < GRID; step++)
            {
                float dx               = tile[tj].x - self.x;
                float dy               = tile[tj].y - self.y;
                float dz               = tile[tj].z - self.z;
                float r2               = dx * dx + dy * dy + dz * dz;
                float alpha2_ij        = selfBr * tile[tj].br;
                float D_ij             = r2 / (4.0f * alpha2_ij);
                float expTerm          = exp(-D_ij);
                float denominator2     = r2 + alpha2_ij * expTerm;
                float denominator      = sqrt(denominator2);
                float Gpol             = (self.w * tile[tj].q) / (denominator * denominator2);
                float dGpol_dr         = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                accum.x -= dx;
                accum.y -= dy;
                accum.z -= dz;
                tile[tj].fx += dx;
                tile[tj].fy += dy;
                tile[tj].fz += dz;
                accum.w      += dGpol_dalpha2_ij * tile[tj].br;
                tile[tj].fb  += dGpol_dalpha2_ij * selfBr;
                tj = sNext[tj];
            }

            // Own atom: buffer indexed by the partner tile (y).
            int offset = x + lane + (y >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset]   = accum;
            cSim.pBornForce[offset] = accum.w;

            // Partner atom: buffer indexed by the own tile (x).
            accum.x = mine.fx;
            accum.y = mine.fy;
            accum.z = mine.fz;
            accum.w = mine.fb;
            offset  = y + lane + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset]   = accum;
            cSim.pBornForce[offset] = accum.w;
        }
    }
}
// Kernel variant launched when gpu->sm_version is not below SM_12; declared
// here, defined outside this translation unit.
__global__ extern void kCalculateObcGbsaForces1_12_kernel();

/**
 * Host-side launcher for the first OBC/GBSA force pass.
 *
 * Picks kCalculateObcGbsaForces1_kernel when gpu->sm_version < SM_12 and
 * kCalculateObcGbsaForces1_12_kernel otherwise. Both use
 * gpu->sim.nonbond_blocks blocks of gpu->sim.nonbond_threads_per_block
 * threads. Launch failures are reported through LAUNCHERROR; the label now
 * spells this function's name correctly (it used to read
 * "kCalculateObcGbsaForce1").
 *
 * @param gpu  context supplying the device version and launch configuration
 */
void kCalculateObcGbsaForces1(gpuContext gpu)
{
    if (gpu->sm_version < SM_12)
    {
        kCalculateObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    LAUNCHERROR("kCalculateObcGbsaForces1");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Per-thread staging record held in the shared array sA. The kernel in this
// file fills x/y/z/q from a pPosq entry, br from pBornRadii, and uses
// fx/fy/fz/fb as accumulators for the results of the partner atom.
struct Atom {
    float x, y, z;      // position (pPosq .x/.y/.z)
    float q;            // pPosq .w
    float br;           // Born radius (pBornRadii)
    float fx, fy, fz;   // accumulated force components
    float fb;           // accumulated Born-force term
};
// Per-block staging area in shared memory; thread t writes sA[t].
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];

// This block's slice of the work-unit queue, copied from cSim.pWorkUnit at
// kernel start.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];

// Successor table: the kernel sets sNext[k] = (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];

// File-local copy of the simulation parameters in constant memory; written by
// SetCalculateObcGbsaForces1_12Sim() and read by the kernel below.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory parameters (cSim) back into gpu->sim.
// The copy status is handed to RTERROR; the message now names the Get
// direction (it previously said "SetSim", copied from the Set routine).
void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// First OBC/GBSA pair pass over the work units assigned to this block.
//
// Work distribution:
//   * Block b owns the queue slice starting at
//     nbWorkUnitsPerBlock * b + min(b, nbWorkUnitsPerBlockRemainder), so the
//     first nbWorkUnitsPerBlockRemainder blocks take one extra unit.
//   * Thread group g = threadIdx.x >> GRIDBITS starts at the (g + 1)-th unit
//     from the end of the slice and steps back by cSim.nonbond_workBlock.
//   * A work unit packs the tile coordinates: bits 17 and up give x, bits
//     2..16 give y (both scaled by GRID). The low two bits are not used here.
//
// Output: for every tile the per-atom force (x, y, z) and Born-force term (w)
// are stored to cSim.pForce4a, and the Born-force term also to
// cSim.pBornForce, at index atom + tileIndex * cSim.stride.
//
// NOTE(review): the inner loops read and update sA entries owned by other
// threads of the same GRID-sized group without any barrier. This presumably
// relies on GRID being the warp size so the group runs in lockstep -- confirm
// before porting to hardware without implicit warp synchrony.
__global__ void kCalculateObcGbsaForces1_12_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // Successor table used to rotate through a tile: k -> (k + 1) mod GRID.
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;

    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        float4 apos;    // Local atom x, y, z, q
        float4 af;      // Local atom fx, fy, fz, fb
        unsigned int tgx = threadIdx.x & (GRID - 1);    // index within the thread group
        unsigned int tbx = threadIdx.x - tgx;           // first thread of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // this group's GRID staged atoms

        if (x == y)     // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            unsigned int i = x + tgx;
            apos = cSim.pPosq[i];
            float br = cSim.pBornRadii[i];
            sA[threadIdx.x].x = apos.x;
            sA[threadIdx.x].y = apos.y;
            sA[threadIdx.x].z = apos.z;
            sA[threadIdx.x].q = apos.w;
            sA[threadIdx.x].br = br;
            af.x = 0.0f;
            af.y = 0.0f;
            af.z = 0.0f;
            af.w = 0.0f;
            apos.w *= cSim.preFactor;

            // Every thread visits all GRID staged atoms, including its own
            // (for j == tgx the displacement is zero, so only af.w changes).
            for (unsigned int j = 0; j < GRID; j++)
            {
                float dx = psA[j].x - apos.x;
                float dy = psA[j].y - apos.y;
                float dz = psA[j].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float alpha2_ij = br * psA[j].br;
                float D_ij = r2 / (4.0f * alpha2_ij);
                float expTerm = exp(-D_ij);
                float denominator2 = r2 + alpha2_ij * expTerm;
                float denominator = sqrt(denominator2);
                float Gpol = (apos.w * psA[j].q) / (denominator * denominator2);
                float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                af.x -= dx;
                af.y -= dy;
                af.z -= dz;
                af.w += dGpol_dalpha2_ij * psA[j].br;
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = af;
            cSim.pBornForce[offset] = af.w;
        }
        else            // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            int j = y + tgx;
            unsigned int i = x + tgx;
            float4 temp = cSim.pPosq[j];
            float temp1 = cSim.pBornRadii[j];
            apos = cSim.pPosq[i];
            float br = cSim.pBornRadii[i];
            sA[threadIdx.x].x = temp.x;
            sA[threadIdx.x].y = temp.y;
            sA[threadIdx.x].z = temp.z;
            sA[threadIdx.x].q = temp.w;
            sA[threadIdx.x].br = temp1;
            sA[threadIdx.x].fx = af.x = 0.0f;
            sA[threadIdx.x].fy = af.y = 0.0f;
            sA[threadIdx.x].fz = af.z = 0.0f;
            sA[threadIdx.x].fb = af.w = 0.0f;
            apos.w *= cSim.preFactor;

            // tj starts at this thread's own slot and advances through sNext,
            // so at each step the threads of a group touch distinct slots.
            for (j = 0; j < GRID; j++)
            {
                float dx = psA[tj].x - apos.x;
                float dy = psA[tj].y - apos.y;
                float dz = psA[tj].z - apos.z;
                float r2 = dx * dx + dy * dy + dz * dz;
                float alpha2_ij = br * psA[tj].br;
                float D_ij = r2 / (4.0f * alpha2_ij);
                float expTerm = exp(-D_ij);
                float denominator2 = r2 + alpha2_ij * expTerm;
                float denominator = sqrt(denominator2);
                float Gpol = (apos.w * psA[tj].q) / (denominator * denominator2);
                float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
                float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                dx *= dGpol_dr;
                dy *= dGpol_dr;
                dz *= dGpol_dr;
                af.x -= dx;
                af.y -= dy;
                af.z -= dz;
                psA[tj].fx += dx;
                psA[tj].fy += dy;
                psA[tj].fz += dz;
                af.w += dGpol_dalpha2_ij * psA[tj].br;
                psA[tj].fb += dGpol_dalpha2_ij * br;
                tj = sNext[tj];
            }

            // Write results: first the x-tile atom held in registers ...
            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = af;
            cSim.pBornForce[offset] = af.w;

            // ... then the y-tile atom accumulated in shared memory.
            af.x = sA[threadIdx.x].fx;
            af.y = sA[threadIdx.x].fy;
            af.z = sA[threadIdx.x].fz;
            af.w = sA[threadIdx.x].fb;
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset] = af;
            cSim.pBornForce[offset] = af.w;
        }
        pos -= cSim.nonbond_workBlock;
    }
}
// Host entry point: launches kCalculateObcGbsaForces1_12_kernel with
// gpu->sim.nonbond_blocks blocks of gpu->sim.nonbond_threads_per_block threads
// and reports launch failures through LAUNCHERROR. The error label now
// matches this routine's name (it previously read "...Force1_12").
void kCalculateObcGbsaForces1_12(gpuContext gpu)
{
    kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateObcGbsaForces1_12");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
#include "cudaKernels.h"
// Per-thread staging record held in the shared array sA. Besides the staged
// atom data and force accumulators it also carries the thread's loop state
// (pos, wx, wy), which the kernel in this file keeps in shared memory.
struct Atom {
    float x, y, z;      // position (pPosq .x/.y/.z)
    float r;            // pObcData .x
    float sr;           // pObcData .y
    float sr2;          // sr * sr
    float fx, fy, fz;   // accumulated force components
    float fb;           // per-atom Born force (pBornForce)
    int pos;            // index of the work unit currently being processed
    int wx;             // x tile coordinate of that work unit
    int wy;             // y tile coordinate of that work unit
};
// Per-block staging area in shared memory; thread t writes sA[t].
__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];

// This block's slice of the work-unit queue, copied from cSim.pWorkUnit at
// kernel start.
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];

// Successor table: the kernel sets sNext[k] = (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];

// File-local copy of the simulation parameters in constant memory; written by
// SetCalculateObcGbsaForces2Sim() and read by the kernel below.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory parameters (cSim) back into gpu->sim.
// The copy status is handed to RTERROR; the message now names the Get
// direction (it previously said "SetSim", copied from the Set routine).
void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Second OBC/GBSA pass. For every work unit assigned to this block it reads
// the per-atom Born force (cSim.pBornForce) and OBC parameters
// (cSim.pObcData: .x used as radius, .y as scaled radius) and accumulates
// Cartesian force contributions into cSim.pForce4b (w is written as 0).
//
// Block b owns the queue slice starting at
// bf2WorkUnitsPerBlock * b + min(b, bf2WorkUnitsPerBlockRemainder). Thread
// group g = threadIdx.x >> GRIDBITS starts at the (g + 1)-th unit from the end
// of the slice and steps back by cSim.bornForce2_workBlock. The running unit
// index and the tile coordinates live in the thread's sA entry (pos, wx, wy).
// Born-sum accumulation is not performed by this kernel.
//
// NOTE(review): threads update sA entries owned by other threads of the same
// GRID-sized group without a barrier; presumably GRID is the warp size so the
// group executes in lockstep -- confirm.
__global__ void kCalculateObcGbsaForces2_kernel()
{
    // Stage this block's slice [first, last) of the work queue once.
    const int first = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
    const int last  = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
    const int count = last - first;
    if (threadIdx.x < count)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[first + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    const unsigned int lane = threadIdx.x & (GRID - 1);    // index within the thread group
    Atom* const self = &sA[threadIdx.x];                   // this thread's staging slot
    Atom* const tile = &sA[threadIdx.x - lane];            // the group's GRID staging slots

    self->pos = count - (threadIdx.x >> GRIDBITS) - 1;
    while (self->pos >= 0)
    {
        // Decode tile coordinates: bits 17+ give x, bits 2..16 give y.
        const unsigned int unit = sWorkUnit[self->pos];
        const unsigned int y = ((unit >> 2) & 0x7fff) << GRIDBITS;
        const unsigned int x = (unit >> 17) << GRIDBITS;

        // Atom i (from the x tile) stays in registers.
        const unsigned int i = x + lane;
        const float4 posI = cSim.pPosq[i];
        const float2 obcI = cSim.pObcData[i];
        const float bornForceI = cSim.pBornForce[i];
        self->wx = x;
        self->wy = y;

        if (x == y)
        {
            // Diagonal tile: both atom sets are the same GRID atoms.
            float3 acc;
            self->fx = acc.x = 0.0f;
            self->fy = acc.y = 0.0f;
            self->fz = acc.z = 0.0f;
            self->x = posI.x;
            self->y = posI.y;
            self->z = posI.z;
            self->r = obcI.x;
            self->sr = obcI.y;
            self->sr2 = obcI.y * obcI.y;
            self->fb = bornForceI;

            // Walk the other GRID - 1 slots, skipping this thread's own atom.
            for (unsigned int j = sNext[lane]; j != lane; j = sNext[j])
            {
                float dx = tile[j].x - posI.x;
                float dy = tile[j].y - posI.y;
                float dz = tile[j].z - posI.z;
                float distSq = dx * dx + dy * dy + dz * dz;
                float dist = sqrt(distSq);

                float reachJ = dist + tile[j].sr;
                float lower = 1.0f / max(obcI.x, fabs(dist - tile[j].sr));
                float upper = 1.0f / reachJ;
                float invDist = 1.0f / dist;
                float lowerSq = lower * lower;
                float upperSq = upper * upper;
                float invDistSq = invDist * invDist;
                float logTerm = log(upper / lower);
                float sqDiff = (lowerSq - upperSq);
                float sqDiffOverR = sqDiff * invDist;
                logTerm *= invDist;

                float term = 0.125f * (1.000f + tile[j].sr2 * invDistSq) * sqDiffOverR + 0.250f * logTerm * invDistSq;
                float dE = bornForceI * term;
                if (obcI.x >= reachJ)
                {
                    dE = 0.0f;
                }

                float d = dx * dE;
                acc.x -= d;
                tile[j].fx += d;
                d = dy * dE;
                acc.y -= d;
                tile[j].fy += d;
                d = dz * dE;
                acc.z -= d;
                tile[j].fz += d;
            }

            // Register part plus what the other threads added to our slot.
            int offset = x + lane + (x >> GRIDBITS) * cSim.stride;
            float4 out;
            out.x = acc.x + self->fx;
            out.y = acc.y + self->fy;
            out.z = acc.z + self->fz;
            out.w = 0.0f;
            cSim.pForce4b[offset] = out;
        }
        else
        {
            // Off-diagonal tile: stage atom j (from the y tile) in shared memory.
            const int jAtom = y + lane;
            const float4 posJ = cSim.pPosq[jAtom];
            const float2 obcJ = cSim.pObcData[jAtom];
            self->fb = cSim.pBornForce[jAtom];
            float3 acc;
            self->fx = acc.x = 0.0f;
            self->fy = acc.y = 0.0f;
            self->fz = acc.z = 0.0f;
            const float srSqI = obcI.y * obcI.y;
            self->x = posJ.x;
            self->y = posJ.y;
            self->z = posJ.z;
            self->r = obcJ.x;
            self->sr = obcJ.y;
            self->sr2 = obcJ.y * obcJ.y;

            // k starts at this thread's own slot and follows sNext, so the
            // threads of a group touch distinct slots at each step.
            int k = lane;
            for (int step = 0; step < GRID; step++)
            {
                float dx = tile[k].x - posI.x;
                float dy = tile[k].y - posI.y;
                float dz = tile[k].z - posI.z;
                float distSq = dx * dx + dy * dy + dz * dz;
                float dist = sqrt(distSq);

                // Term weighted by atom i's Born force.
                float invDistSq = 1.0f / distSq;
                float reachJ = dist + tile[k].sr;
                float invDist = 1.0f / dist;
                float lower = 1.0f / max(obcI.x, fabs(dist - tile[k].sr));
                float upper = 1.0f / reachJ;
                float lowerSq = lower * lower;
                float upperSq = upper * upper;
                float logTerm = log(upper / lower);
                float sqDiff = (lowerSq - upperSq);
                float sqDiffOverR = sqDiff * invDist;
                logTerm *= invDist;

                float term = 0.125f * (1.000f + tile[k].sr2 * invDistSq) * sqDiffOverR + 0.250f * logTerm * invDistSq;
                float dE = bornForceI * term;
                if (obcI.x >= reachJ)
                {
                    dE = 0.0f;
                }

                float d = dx * dE;
                acc.x -= d;
                tile[k].fx += d;
                d = dy * dE;
                acc.y -= d;
                tile[k].fy += d;
                d = dz * dE;
                acc.z -= d;
                tile[k].fz += d;

                // Term weighted by the staged atom's Born force.
                float reachI = dist + obcI.y;
                lower = 1.0f / max(tile[k].r, fabs(dist - obcI.y));
                upper = 1.0f / reachI;
                lowerSq = lower * lower;
                upperSq = upper * upper;
                logTerm = log(upper / lower);
                sqDiff = (lowerSq - upperSq);
                sqDiffOverR = sqDiff * invDist;
                logTerm *= invDist;

                term = 0.125f * (1.000f + srSqI * invDistSq) * sqDiffOverR + 0.250f * logTerm * invDistSq;
                dE = tile[k].fb * term;
                if (tile[k].r >= reachI)
                {
                    dE = 0.0f;
                }

                dx *= dE;
                dy *= dE;
                dz *= dE;
                tile[k].fx += dx;
                tile[k].fy += dy;
                tile[k].fz += dz;
                acc.x -= dx;
                acc.y -= dy;
                acc.z -= dz;

                k = sNext[k];
            }

            // Store atom i's result (registers) ...
            int offset = self->wx + lane + (self->wy >> GRIDBITS) * cSim.stride;
            float4 out;
            out.x = acc.x;
            out.y = acc.y;
            out.z = acc.z;
            out.w = 0.0f;
            cSim.pForce4b[offset] = out;

            // ... then atom j's result (shared memory).
            offset = self->wy + lane + (self->wx >> GRIDBITS) * cSim.stride;
            out.x = self->fx;
            out.y = self->fy;
            out.z = self->fz;
            cSim.pForce4b[offset] = out;
        }

        self->pos -= cSim.bornForce2_workBlock;
    }
}
// Variant of the kernel defined in kCalculateObcGbsaForces2_12.cu.
__global__ extern void kCalculateObcGbsaForces2_12_kernel();

// Host entry point for the second OBC/GBSA pass. Launches
// kCalculateObcGbsaForces2_kernel when gpu->sm_version < SM_12 and
// kCalculateObcGbsaForces2_12_kernel otherwise, both with
// gpu->sim.bornForce2_blocks blocks of gpu->sim.bornForce2_threads_per_block
// threads, then reports launch failures through LAUNCHERROR.
void kCalculateObcGbsaForces2(gpuContext gpu)
{
    if (gpu->sm_version < SM_12)
    {
        kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
    }
    else
    {
        kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
    }

#if 0
    // Disabled debugging aid (formerly an `if (0)` branch): reduce the buffers
    // and dump the loop-1 data after every call.
    {
        static int step = 0;
        //int numPrint = -1;
        step++;
        //WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
        //gpuDumpCoordinates( gpu );
        kReduceBornSumAndForces(gpu);
        gpuDumpObcLoop1(gpu);
    }
#endif

    LAUNCHERROR("kCalculateObcGbsaForces2");
}
platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Per-thread staging record held in the shared array sA: the staged atom's
// position and OBC parameters plus the force accumulated for it.
struct Atom {
    float x, y, z;      // position (pPosq .x/.y/.z)
    float r;            // pObcData .x
    float sr;           // pObcData .y
    float sr2;          // sr * sr
    float fx, fy, fz;   // accumulated force components
    float fb;           // per-atom Born force (pBornForce)
};
// Per-block staging area in shared memory; thread t writes sA[t].
__shared__ Atom sA[GT2XX_BORNFORCE2_THREADS_PER_BLOCK];

// This block's slice of the work-unit queue, copied from cSim.pWorkUnit at
// kernel start.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];

// Successor table: the kernel sets sNext[k] = (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];

// File-local copy of the simulation parameters in constant memory; written by
// SetCalculateObcGbsaForces2_12Sim() and read by the kernel below.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory parameters (cSim) back into gpu->sim.
// The copy status is handed to RTERROR; the message now names the Get
// direction (it previously said "SetSim", copied from the Set routine).
void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Second OBC/GBSA pass, variant launched by kCalculateObcGbsaForces2_12().
// For every work unit assigned to this block it reads the per-atom Born force
// (cSim.pBornForce) and OBC parameters (cSim.pObcData: .x used as radius, .y
// as scaled radius) and accumulates Cartesian force contributions into
// cSim.pForce4b (w is written as 0).
//
// Block b owns the queue slice starting at
// bf2WorkUnitsPerBlock * b + min(b, bf2WorkUnitsPerBlockRemainder). Thread
// group g = threadIdx.x >> GRIDBITS starts at the (g + 1)-th unit from the end
// of the slice and steps back by cSim.bornForce2_workBlock. In the
// off-diagonal branch the two directions of each pair are computed
// interleaved. Born-sum accumulation is not performed by this kernel.
//
// NOTE(review): threads update sA entries owned by other threads of the same
// GRID-sized group without a barrier; presumably GRID is the warp size so the
// group executes in lockstep -- confirm.
__global__ void kCalculateObcGbsaForces2_12_kernel()
{
    // Stage this block's slice [first, last) of the work queue once.
    const int first = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
    const int last  = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
    const int count = last - first;
    if (threadIdx.x < count)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[first + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
    }
    __syncthreads();

    const unsigned int lane = threadIdx.x & (GRID - 1);    // index within the thread group
    Atom* const self = &sA[threadIdx.x];                   // this thread's staging slot
    Atom* const tile = &sA[threadIdx.x - lane];            // the group's GRID staging slots

    for (int unitIdx = count - (threadIdx.x >> GRIDBITS) - 1; unitIdx >= 0; unitIdx -= cSim.bornForce2_workBlock)
    {
        // Decode tile coordinates: bits 17+ give x, bits 2..16 give y.
        const unsigned int unit = sWorkUnit[unitIdx];
        const unsigned int y = ((unit >> 2) & 0x7fff) << GRIDBITS;
        const unsigned int x = (unit >> 17) << GRIDBITS;

        // Atom i (from the x tile) stays in registers.
        const unsigned int i = x + lane;
        const float4 posI = cSim.pPosq[i];
        const float2 obcI = cSim.pObcData[i];
        const float bornForceI = cSim.pBornForce[i];

        if (x == y)
        {
            // Diagonal tile: both atom sets are the same GRID atoms.
            float3 acc;
            self->fx = acc.x = 0.0f;
            self->fy = acc.y = 0.0f;
            self->fz = acc.z = 0.0f;
            self->x = posI.x;
            self->y = posI.y;
            self->z = posI.z;
            self->r = obcI.x;
            self->sr = obcI.y;
            self->sr2 = obcI.y * obcI.y;
            self->fb = bornForceI;

            // Walk the other GRID - 1 slots, skipping this thread's own atom.
            for (unsigned int j = sNext[lane]; j != lane; j = sNext[j])
            {
                float dx = tile[j].x - posI.x;
                float dy = tile[j].y - posI.y;
                float dz = tile[j].z - posI.z;
                float distSq = dx * dx + dy * dy + dz * dz;
                float dist = sqrt(distSq);

                float reachJ = dist + tile[j].sr;
                float lower = 1.0f / max(obcI.x, fabs(dist - tile[j].sr));
                float upper = 1.0f / reachJ;
                float invDist = 1.0f / dist;
                float lowerSq = lower * lower;
                float upperSq = upper * upper;
                float invDistSq = invDist * invDist;
                float logTerm = log(upper / lower);
                float sqDiff = (lowerSq - upperSq);
                float sqDiffOverR = sqDiff * invDist;
                logTerm *= invDist;

                float term = 0.125f * (1.000f + tile[j].sr2 * invDistSq) * sqDiffOverR + 0.250f * logTerm * invDistSq;
                float dE = bornForceI * term;
                if (obcI.x >= reachJ)
                {
                    dE = 0.0f;
                }

                float d = dx * dE;
                acc.x -= d;
                tile[j].fx += d;
                d = dy * dE;
                acc.y -= d;
                tile[j].fy += d;
                d = dz * dE;
                acc.z -= d;
                tile[j].fz += d;
            }

            // Register part plus what the other threads added to our slot.
            int offset = x + lane + (x >> GRIDBITS) * cSim.stride;
            float4 out;
            out.x = acc.x + self->fx;
            out.y = acc.y + self->fy;
            out.z = acc.z + self->fz;
            out.w = 0.0f;
            cSim.pForce4b[offset] = out;
        }
        else
        {
            // Off-diagonal tile: stage atom j (from the y tile) in shared memory.
            const int jAtom = y + lane;
            const float4 posJ = cSim.pPosq[jAtom];
            const float2 obcJ = cSim.pObcData[jAtom];
            self->fb = cSim.pBornForce[jAtom];
            float3 acc;
            self->fx = acc.x = 0.0f;
            self->fy = acc.y = 0.0f;
            self->fz = acc.z = 0.0f;
            const float srSqI = obcI.y * obcI.y;
            self->x = posJ.x;
            self->y = posJ.y;
            self->z = posJ.z;
            self->r = obcJ.x;
            self->sr = obcJ.y;
            self->sr2 = obcJ.y * obcJ.y;

            // k starts at this thread's own slot and follows sNext, so the
            // threads of a group touch distinct slots at each step.
            int k = lane;
            for (int step = 0; step < GRID; step++)
            {
                float dx = tile[k].x - posI.x;
                float dy = tile[k].y - posI.y;
                float dz = tile[k].z - posI.z;
                float distSq = dx * dx + dy * dy + dz * dz;
                float dist = sqrt(distSq);

                // Suffix J: built from the staged atom's scaled radius and
                // weighted by atom i's Born force. Suffix I: built from atom
                // i's scaled radius and weighted by the staged atom's.
                float invDistSq = 1.0f / distSq;
                float reachJ = dist + tile[k].sr;
                float reachI = dist + obcI.y;
                float invDist = 1.0f / dist;
                float lowerJ = 1.0f / max(obcI.x, fabs(dist - tile[k].sr));
                float lowerI = 1.0f / max(tile[k].r, fabs(dist - obcI.y));
                float upperJ = 1.0f / reachJ;
                float upperI = 1.0f / reachI;
                float lowerSqJ = lowerJ * lowerJ;
                float lowerSqI = lowerI * lowerI;
                float upperSqJ = upperJ * upperJ;
                float upperSqI = upperI * upperI;
                float logTermJ = log(upperJ / lowerJ);
                float logTermI = log(upperI / lowerI);
                float sqDiffJ = (lowerSqJ - upperSqJ);
                float sqDiffI = (lowerSqI - upperSqI);
                float sqDiffOverRJ = sqDiffJ * invDist;
                float sqDiffOverRI = sqDiffI * invDist;
                logTermJ *= invDist;
                logTermI *= invDist;

                // Contribution weighted by atom i's Born force.
                float term = 0.125f * (1.000f + tile[k].sr2 * invDistSq) * sqDiffOverRJ + 0.250f * logTermJ * invDistSq;
                float dE = bornForceI * term;
                if (obcI.x >= reachJ)
                {
                    dE = 0.0f;
                }

                float d = dx * dE;
                acc.x -= d;
                tile[k].fx += d;
                d = dy * dE;
                acc.y -= d;
                tile[k].fy += d;
                d = dz * dE;
                acc.z -= d;
                tile[k].fz += d;

                // Contribution weighted by the staged atom's Born force.
                term = 0.125f * (1.000f + srSqI * invDistSq) * sqDiffOverRI + 0.250f * logTermI * invDistSq;
                dE = tile[k].fb * term;
                if (tile[k].r >= reachI)
                {
                    dE = 0.0f;
                }

                dx *= dE;
                dy *= dE;
                dz *= dE;
                tile[k].fx += dx;
                tile[k].fy += dy;
                tile[k].fz += dz;
                acc.x -= dx;
                acc.y -= dy;
                acc.z -= dz;

                k = sNext[k];
            }

            // Store atom i's result (registers) ...
            int offset = x + lane + (y >> GRIDBITS) * cSim.stride;
            float4 out;
            out.x = acc.x;
            out.y = acc.y;
            out.z = acc.z;
            out.w = 0.0f;
            cSim.pForce4b[offset] = out;

            // ... then atom j's result (shared memory).
            offset = y + lane + (x >> GRIDBITS) * cSim.stride;
            out.x = self->fx;
            out.y = self->fy;
            out.z = self->fz;
            cSim.pForce4b[offset] = out;
        }
    }
}
// Host entry point: launches kCalculateObcGbsaForces2_12_kernel with
// gpu->sim.bornForce2_blocks blocks of gpu->sim.bornForce2_threads_per_block
// threads and reports launch failures through LAUNCHERROR.
void kCalculateObcGbsaForces2_12(gpuContext gpu)
{
    const unsigned int numBlocks = gpu->sim.bornForce2_blocks;
    const unsigned int threadsPerBlock = gpu->sim.bornForce2_threads_per_block;
    kCalculateObcGbsaForces2_12_kernel<<<numBlocks, threadsPerBlock>>>();
    LAUNCHERROR("kCalculateObcGbsaForces2_12");
}
platforms/cuda/src/kernels/kForces.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Absolute value for floats. NOTE: the argument is evaluated twice, so do not
// pass expressions with side effects.
#define FABS(a) ((a) > 0.0f ? (a) : -(a))

// File-local copy of the simulation parameters in constant memory; written by
// SetForcesSim() and read by the kernels below.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR.
void SetForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory parameters (cSim) back into gpu->sim.
// The copy status is handed to RTERROR; the message now names the Get
// direction (it previously said "SetSim", copied from the Set routine).
void GetForcesSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Zeroes the first cSim.stride4 * cSim.outputBuffers floats of cSim.pForce4.
// Each thread starts at its global index and advances by the total number of
// launched threads (grid-stride loop), so any launch size covers the range.
__global__ void kClearForces_kernel()
{
    float* const forces = (float*) cSim.pForce4;
    const unsigned int totalThreads = gridDim.x * blockDim.x;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < cSim.stride4 * cSim.outputBuffers;
         idx += totalThreads)
    {
        forces[idx] = 0.0f;
    }
}
// Host entry point: launches kClearForces_kernel with gpu->sim.blocks blocks
// of 384 threads and reports launch failures through LAUNCHERROR.
void kClearForces(gpuContext gpu)
{
    const unsigned int threadsPerBlock = 384;
    kClearForces_kernel<<<gpu->sim.blocks, threadsPerBlock>>>();
    LAUNCHERROR("kClearForces");
}
// Zeroes the first cSim.stride * cSim.nonbondOutputBuffers floats of
// cSim.pBornForce. Each thread starts at its global index and advances by the
// total number of launched threads (grid-stride loop).
__global__ void kClearBornForces_kernel()
{
    float* const bornForces = (float*) cSim.pBornForce;
    const unsigned int totalThreads = gridDim.x * blockDim.x;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < cSim.stride * cSim.nonbondOutputBuffers;
         idx += totalThreads)
    {
        bornForces[idx] = 0.0f;
    }
}
// Host entry point: launches kClearBornForces_kernel with gpu->sim.blocks
// blocks of 384 threads and reports launch failures through LAUNCHERROR.
void kClearBornForces(gpuContext gpu)
{
    const unsigned int threadsPerBlock = 384;
    kClearBornForces_kernel<<<gpu->sim.blocks, threadsPerBlock>>>();
    LAUNCHERROR("kClearBornForces");
}
// Two reductions in one launch, sharing a single running index:
//
//  1. Forces: for every float index below cSim.stride4, sum the
//     cSim.outputBuffers copies of cSim.pForce4 (spaced cSim.stride4 floats
//     apart) and store the total in the first copy.
//  2. Born sums: the index keeps advancing past stride4; (index - stride4) is
//     then used as an atom index while it is below cSim.atoms. The
//     cSim.nonbondOutputBuffers copies of cSim.pBornSum (spaced cSim.stride
//     apart) are summed, stored back, and turned into a Born radius
//     (cSim.pBornRadii) and an OBC chain term (cSim.pObcChain).
//
// The 4/2/1 grouping of the additions is kept as written so the
// floating-point rounding of the totals is unchanged.
__global__ void kReduceBornSumAndForces_kernel()
{
    const unsigned int totalThreads = gridDim.x * blockDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x + threadIdx.x);

    // Phase 1: reduce forces.
    for (; idx < cSim.stride4; idx += totalThreads)
    {
        float* const dest = (float*) cSim.pForce4 + idx;
        float* src = dest;
        float total = 0.0f;
        int remaining = cSim.outputBuffers;

        for (; remaining >= 4; remaining -= 4)
        {
            float a = *src;
            src += cSim.stride4;
            float b = *src;
            src += cSim.stride4;
            float c = *src;
            src += cSim.stride4;
            float d = *src;
            src += cSim.stride4;
            total += a + b + c + d;
        }
        if (remaining >= 2)
        {
            float a = *src;
            src += cSim.stride4;
            float b = *src;
            src += cSim.stride4;
            total += a + b;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            total += *src;
        }

        *dest = total;
    }

    // Phase 2: reduce Born sums and derive the Born radii / OBC chain terms.
    for (; idx - cSim.stride4 < cSim.atoms; idx += totalThreads)
    {
        const unsigned int atomIdx = idx - cSim.stride4;
        float* src = cSim.pBornSum + atomIdx;
        float2 atom = cSim.pObcData[atomIdx];
        float sum = 0.0f;

        // Get summed Born data
        int remaining = cSim.nonbondOutputBuffers;
        for (; remaining >= 4; remaining -= 4)
        {
            float a = *src;
            src += cSim.stride;
            float b = *src;
            src += cSim.stride;
            float c = *src;
            src += cSim.stride;
            float d = *src;
            src += cSim.stride;
            sum += a + b + c + d;
        }
        if (remaining >= 2)
        {
            float a = *src;
            src += cSim.stride;
            float b = *src;
            src += cSim.stride;
            sum += a + b;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            sum += *src;
        }

        // Now calculate Born radius and OBC term.
        cSim.pBornSum[atomIdx] = sum;
        sum *= 0.5f * atom.x;
        float sum2 = sum * sum;
        float sum3 = sum * sum2;
        float tanhSum = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
        float nonOffsetRadii = atom.x + cSim.dielectricOffset;
        float bornRadius = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii);
        float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
        obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;
        cSim.pBornRadii[atomIdx] = bornRadius;
        cSim.pObcChain[atomIdx] = obcChain;
    }
}
// Host entry point: launches kReduceBornSumAndForces_kernel with
// gpu->sim.blocks blocks of gpu->sim.bsf_reduce_threads_per_block threads and
// reports launch failures through LAUNCHERROR.
void kReduceBornSumAndForces(gpuContext gpu)
{
    const unsigned int numBlocks = gpu->sim.blocks;
    const unsigned int threadsPerBlock = gpu->sim.bsf_reduce_threads_per_block;
    kReduceBornSumAndForces_kernel<<<numBlocks, threadsPerBlock>>>();
    LAUNCHERROR("kReduceBornSumAndForces");

#if 0
    // Disabled debugging aid: dump the reduced forces.
    //gpuDumpObcLoop1( gpu );
    /*
    gpu->psForce4->Download();
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %12.6f %12.6f %12.6f\n", i,
            gpu->psForce4->_pSysStream[0][i].x,
            gpu->psForce4->_pSysStream[0][i].y,
            gpu->psForce4->_pSysStream[0][i].z
        );
    } */
#endif
}
// Collapses the per-buffer force contributions into the first buffer.
//
// cSim.pForce4 is addressed as a flat float array. For every float index
// pos < cSim.stride4 the kernel adds up cSim.outputBuffers values spaced
// cSim.stride4 floats apart (starting at pos) and stores the total back at
// index pos. Each thread starts at its global thread index and advances by
// the total number of launched threads.
//
// The values are accumulated in groups of four, then two, then one; the
// grouping is kept as-is because it determines the floating-point rounding.
__global__ void kReduceForces_kernel()
{
    const unsigned int threadCount = gridDim.x * blockDim.x;

    for (unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
         pos < cSim.stride4;
         pos += threadCount)
    {
        float* const pResult = (float*)cSim.pForce4 + pos;
        float*       pBuffer = pResult;
        float        total   = 0.0f;
        int          left    = cSim.outputBuffers;

        // Groups of four buffers.
        for (; left >= 4; left -= 4)
        {
            float a = *pBuffer;
            pBuffer += cSim.stride4;
            float b = *pBuffer;
            pBuffer += cSim.stride4;
            float c = *pBuffer;
            pBuffer += cSim.stride4;
            float d = *pBuffer;
            pBuffer += cSim.stride4;
            total += a + b + c + d;
        }

        // A remaining pair, if any.
        if (left >= 2)
        {
            float a = *pBuffer;
            pBuffer += cSim.stride4;
            float b = *pBuffer;
            pBuffer += cSim.stride4;
            total += a + b;
            left -= 2;
        }

        // A final single buffer, if any.
        if (left > 0)
        {
            total += *pBuffer;
        }

        *pResult = total;
    }
}
// Host-side launcher for kReduceForces_kernel.
//
// Uses gpu->sim.blocks blocks of gpu->sim.bsf_reduce_threads_per_block threads
// and then runs the LAUNCHERROR check tagged with this function's name.
void kReduceForces(gpuContext gpu)
{
    kReduceForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
    LAUNCHERROR("kReduceForces");
}
platforms/cuda/src/kernels/kRandom.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// File-local copy of the simulation parameters, placed in device constant memory.
// SetRandomSim() fills it from gpu->sim and GetRandomSim() copies it back;
// the kernel in this file reads its fields directly.
static
__constant__
cudaGmxSimulation
cSim
;
// Copies the host-side gpu->sim structure into this file's cSim constant
// symbol. The copy status is handed to RTERROR together with a fixed message.
void SetRandomSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's cSim constant symbol back into the host-side gpu->sim
// structure. The copy status is handed to RTERROR; the message names this
// function so a failure here is not mistaken for a failure in a Set routine.
void GetRandomSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetRandomSim copy from cSim failed");
}
// Dynamically sized shared-memory scratch array used by kGenerateRandoms_kernel.
// kGenerateRandoms() launches the kernel with
// random_threads_per_block * 2 * sizeof(float3) bytes, and each thread uses the
// two entries at threadIdx.x and threadIdx.x + blockDim.x.
extern
__shared__
float3
sRand
[];
// Advances one thread's generator state by a single step and returns the raw
// 32-bit value state.x + state.y + state.w.
//   state.x          : multiplied by 69069, plus 1
//   state.y          : xor-shifted left 13, right 17, left 5
//   state.z, state.w : w becomes 2*w + z + carry, z takes the previous w
//   carry            : top bits (>> 30) of (z >> 2) + (w >> 3) + (carry >> 2),
//                      computed from the values before z and w are replaced
static __device__ unsigned int kGenerateRandoms_step(uint4& state, unsigned int& carry)
{
    state.x  = state.x * 69069 + 1;
    state.y ^= state.y << 13;
    state.y ^= state.y >> 17;
    state.y ^= state.y << 5;

    unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
    unsigned int m = state.w + state.w + state.z + carry;
    state.z = state.w;
    state.w = m;
    carry   = k >> 30;

    return state.x + state.y + state.w;
}

// Consumes two generator steps and combines them with the Box-Muller formula
// sqrt(-2 log(u1)) * cos(2 pi u2). The first raw value is clamped to at least 1
// so that log() never receives zero.
static __device__ float kGenerateRandoms_gaussian(uint4& state, unsigned int& carry)
{
    unsigned int raw1   = kGenerateRandoms_step(state, carry);
    float        u1     = (float)max(raw1, 0x00000001) / (float)0xffffffff;
    float        radius = sqrt(-2.0f * log(u1));

    unsigned int raw2   = kGenerateRandoms_step(state, carry);
    float        u2     = (float)raw2 / (float)0xffffffff;

    return radius * cos(2.0f * 3.14159265f * u2);
}

// Fills cSim.pRandom4a / cSim.pRandom2a with scaled random values.
//
// Each thread loads its generator state from cSim.pRandomSeed at its global
// thread index, then walks pos from that index up to
// cSim.totalRandomsTimesTwo in steps of the total thread count. For every pos
// it draws six values into its two sRand entries and writes them out as one
// float4 and one float2. Entries with pos < cSim.totalRandoms are scaled by
// (cSim.Yv, cSim.V); the remaining entries by (cSim.Yx, cSim.X). The updated
// state is stored back to cSim.pRandomSeed at the end; carry starts at 0 on
// every launch and is not stored.
__global__ void kGenerateRandoms_kernel()
{
    const unsigned int first     = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int increment = blockDim.x * gridDim.x;

    // Per-thread generator state.
    uint4        state = cSim.pRandomSeed[first];
    unsigned int carry = 0;

    for (unsigned int pos = first; pos < cSim.totalRandomsTimesTwo; pos += increment)
    {
        // Two float3 triples per output element: the first goes to entry
        // threadIdx.x, the second to entry threadIdx.x + blockDim.x.
        unsigned int slot = threadIdx.x;
        for (int i = 0; i < 2; i++)
        {
            sRand[slot].x = kGenerateRandoms_gaussian(state, carry);
            sRand[slot].y = kGenerateRandoms_gaussian(state, carry);
            sRand[slot].z = kGenerateRandoms_gaussian(state, carry);
            slot += blockDim.x;
        }

        // Select the scale factors for this output element.
        float c1, c2;
        if (pos < cSim.totalRandoms)
        {
            c1 = cSim.Yv;
            c2 = cSim.V;
        }
        else
        {
            c1 = cSim.Yx;
            c2 = cSim.X;
        }

        // First triple scaled by c1, second triple scaled by c2.
        float4 random4;
        random4.x = c1 * sRand[threadIdx.x].x;
        random4.y = c1 * sRand[threadIdx.x].y;
        random4.z = c1 * sRand[threadIdx.x].z;
        random4.w = c2 * sRand[threadIdx.x + blockDim.x].x;
        cSim.pRandom4a[pos] = random4;

        float2 random2;
        random2.x = c2 * sRand[threadIdx.x + blockDim.x].y;
        random2.y = c2 * sRand[threadIdx.x + blockDim.x].z;
        cSim.pRandom2a[pos] = random2;
    }

    // Persist the generator state for the next launch.
    cSim.pRandomSeed[first] = state;
}
// Host-side launcher for kGenerateRandoms_kernel.
//
// Uses gpu->sim.blocks blocks of gpu->sim.random_threads_per_block threads and
// requests two float3 entries of dynamic shared memory per thread (the sRand
// array). The launch is followed by the same LAUNCHERROR check the other
// launchers use, so a failed launch is reported here instead of surfacing at
// some later, unrelated call.
void kGenerateRandoms(gpuContext gpu)
{
    kGenerateRandoms_kernel<<<gpu->sim.blocks,
                              gpu->sim.random_threads_per_block,
                              gpu->sim.random_threads_per_block * 2 * sizeof(float3)>>>();
    LAUNCHERROR("kGenerateRandoms");
}
\ No newline at end of file
platforms/cuda/src/kernels/kUpdateShakeH.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
// Compile-time switch for this file. When DeltaShake is defined:
//  - the kUpdatePart1 kernels store velocity * fix1 in pPosqP instead of
//    adding it to the current position,
//  - the SHAKE kernels skip subtracting / re-adding the old positions,
//  - the kUpdatePart2 kernels compute velocity as xPrime * oneOverFix1 without
//    loading pPosq.
#define DeltaShake
#include "gputypes.h"
// Per-thread scratch record used by the SHAKE kernels (one entry per thread in
// a __shared__ array). Field order is significant for the memory layout and is
// kept as declared.
struct Atom
{
    // Old-position difference vectors between the central atom (atomID.x) and
    // each of the up-to-three partner atoms (atomID.y, .z, .w).
    float3 rij1;
    float3 rij2;
    float3 rij3;

    float M;          // pShakeParameter.y; scales the correction factor
    float d2;         // pShakeParameter.z; presumably the squared target distance - confirm
    float InvMassI;   // pShakeParameter.x; weight applied to the central atom's correction

    // Squared lengths of rij1, rij2 and rij3.
    float rij1sq;
    float rij2sq;
    float rij3sq;
};
// File-local copy of the simulation parameters, placed in device constant memory.
// SetUpdateShakeHSim() fills it from gpu->sim and GetUpdateShakeHSim() copies it
// back; the kernels in this file read its fields directly.
static
__constant__
cudaGmxSimulation
cSim
;
// Copies the host-side gpu->sim structure into this file's cSim constant
// symbol. The copy status is handed to RTERROR together with a fixed message.
void SetUpdateShakeHSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's cSim constant symbol back into the host-side gpu->sim
// structure. The copy status is handed to RTERROR; the message names this
// function so a failure here is not mistaken for a failure in a Set routine.
void GetUpdateShakeHSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetUpdateShakeHSim copy from cSim failed");
}
// First half of the integrator update (variant without centre-of-mass removal).
//
// For every atom index handled by this thread the kernel
//   1. builds Vmh from the stored xVector and three random values,
//   2. draws a new vVector from three further random values and stores it,
//   3. updates the velocity from the old velocity, the force, vVector and Vmh,
//   4. saves the current position to pOldPosq,
//   5. writes the trial position to pPosqP: with DeltaShake defined this is
//      just velocity * fix1, otherwise the position advanced by that amount,
//   6. stores the new velocity.
// velocity.w is used as the inverse mass and is left untouched.
__global__ void kUpdatePart1_kernel()
{
    const unsigned int threadCount = blockDim.x * gridDim.x;
    unsigned int       pos         = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int rpos        = cSim.pRandomPosition[blockIdx.x];
    __syncthreads();

    for (; pos < cSim.atoms; pos += threadCount)
    {
        float4 velocity = cSim.pVelm4[pos];
        float4 xVector  = cSim.pxVector4[pos];
        float4 random4a = cSim.pRandom4a[rpos + pos];
        float2 random2a = cSim.pRandom2a[rpos + pos];
        float4 apos     = cSim.pPosq[pos];
        float4 force    = cSim.pForce4[pos];

        float sqrtInvMass = sqrt(velocity.w);

        float3 Vmh;
        Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
        Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
        Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;

        float4 vVector;
        vVector.x = sqrtInvMass * random4a.w;
        vVector.y = sqrtInvMass * random2a.x;
        vVector.z = sqrtInvMass * random2a.y;
        vVector.w = 0.0f;
        cSim.pvVector4[pos] = vVector;

        velocity.x = velocity.x * cSim.EM + velocity.w * force.x * cSim.TauOneMinusEM + vVector.x - cSim.EM * Vmh.x;
        velocity.y = velocity.y * cSim.EM + velocity.w * force.y * cSim.TauOneMinusEM + vVector.y - cSim.EM * Vmh.y;
        velocity.z = velocity.z * cSim.EM + velocity.w * force.z * cSim.TauOneMinusEM + vVector.z - cSim.EM * Vmh.z;

        // Keep the pre-update position for the SHAKE kernels.
        cSim.pOldPosq[pos] = apos;

#ifndef DeltaShake
        apos.x += velocity.x * cSim.fix1;
        apos.y += velocity.y * cSim.fix1;
        apos.z += velocity.z * cSim.fix1;
#else
        // Displacement only; apos.w is carried through unchanged.
        apos.x = velocity.x * cSim.fix1;
        apos.y = velocity.y * cSim.fix1;
        apos.z = velocity.z * cSim.fix1;
#endif

        cSim.pPosqP[pos] = apos;
        cSim.pVelm4[pos] = velocity;
    }
}
// First half of the integrator update, variant that also subtracts the summed
// per-block linear-momentum terms from every velocity.
//
// Stage 1: every thread accumulates a strided share of the gridDim.x entries
//          of cSim.pLinearMomentum and the block reduces those partial sums
//          in shared memory, leaving the total in sCM[0].
// Stage 2: the same per-atom update as kUpdatePart1_kernel, with sCM[0]
//          subtracted from each velocity component.
//
// Expects blockDim.x float3 entries of dynamic shared memory.
__global__ void kUpdatePart1CM_kernel()
{
    extern __shared__ float3 sCM[];

    const unsigned int threadCount = blockDim.x * gridDim.x;
    unsigned int       pos         = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int rpos        = cSim.pRandomPosition[blockIdx.x];

    // Stage 1a: per-thread partial sum over the previous step's block outputs.
    float3 CM = {0.0f, 0.0f, 0.0f};
    for (unsigned int cpos = threadIdx.x; cpos < gridDim.x; cpos += blockDim.x)
    {
        float4 blockCM = cSim.pLinearMomentum[cpos];
        CM.x += blockCM.x;
        CM.y += blockCM.y;
        CM.z += blockCM.z;
    }
    sCM[threadIdx.x].x = CM.x;
    sCM[threadIdx.x].y = CM.y;
    sCM[threadIdx.x].z = CM.z;
    __syncthreads();

    // Stage 1b: pairwise reduction in shared memory. In each round a thread
    // whose index has the low bits covered by mask cleared adds the entry
    // offset slots above it, provided that entry exists.
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        __syncthreads();
    }

    // Stage 2: per-atom update.
    for (; pos < cSim.atoms; pos += threadCount)
    {
        float4 velocity = cSim.pVelm4[pos];
        float4 xVector  = cSim.pxVector4[pos];
        float4 random4a = cSim.pRandom4a[rpos + pos];
        float2 random2a = cSim.pRandom2a[rpos + pos];
        float4 apos     = cSim.pPosq[pos];
        float4 force    = cSim.pForce4[pos];

        float sqrtInvMass = sqrt(velocity.w);

        float3 Vmh;
        Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
        Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
        Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;

        float4 vVector;
        vVector.x = sqrtInvMass * random4a.w;
        vVector.y = sqrtInvMass * random2a.x;
        vVector.z = sqrtInvMass * random2a.y;
        vVector.w = 0.0f;
        cSim.pvVector4[pos] = vVector;

        velocity.x = velocity.x * cSim.EM + velocity.w * force.x * cSim.TauOneMinusEM + vVector.x - cSim.EM * Vmh.x - sCM[0].x;
        velocity.y = velocity.y * cSim.EM + velocity.w * force.y * cSim.TauOneMinusEM + vVector.y - cSim.EM * Vmh.y - sCM[0].y;
        velocity.z = velocity.z * cSim.EM + velocity.w * force.z * cSim.TauOneMinusEM + vVector.z - cSim.EM * Vmh.z - sCM[0].z;

        // Keep the pre-update position for the SHAKE kernels.
        cSim.pOldPosq[pos] = apos;

#ifndef DeltaShake
        apos.x += velocity.x * cSim.fix1;
        apos.y += velocity.y * cSim.fix1;
        apos.z += velocity.z * cSim.fix1;
#else
        // Displacement only; apos.w is carried through unchanged.
        apos.x = velocity.x * cSim.fix1;
        apos.y = velocity.y * cSim.fix1;
        apos.z = velocity.z * cSim.fix1;
#endif

        cSim.pPosqP[pos] = apos;
        cSim.pVelm4[pos] = velocity;
    }
}
// Host-side launcher for the first half of the integrator update.
//
// When gpu->bRemoveCM is set, the centre-of-mass variant is launched (with one
// float3 of dynamic shared memory per thread) and the flag is cleared again;
// otherwise the plain variant is launched. Either launch is followed by a
// LAUNCHERROR check.
void kUpdatePart1(gpuContext gpu)
{
    if (gpu->bRemoveCM)
    {
        kUpdatePart1CM_kernel<<<gpu->sim.blocks,
                                gpu->sim.update_threads_per_block,
                                gpu->sim.update_threads_per_block * sizeof(float3)>>>();
        LAUNCHERROR("kUpdatePart1CM");
        gpu->bRemoveCM = false;
    }
    else
    {
        kUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kUpdatePart1");
    }
}
// Performs one correction pass for a single constrained pair (central atom i,
// partner atom j) inside kApplyFirstShake_kernel.
//
//   rij, rijSq : old-position difference vector of the pair and its squared length
//   ld         : psA->d2 - rijSq
//   psA        : this thread's scratch record (supplies M, d2, InvMassI)
//   invMassJ   : weight applied to the partner atom's correction
//   xpi, xpj   : trial positions of the two atoms, updated in place
//
// Returns true when a correction was applied, i.e. when the scaled error
// |ld - 2*rrpr - rpsqij| / (d2 * shakeTolerance) was >= 1.
static __device__ bool kApplyFirstShake_correctPair(const float3& rij, float rijSq, float ld,
                                                    const Atom* psA, float invMassJ,
                                                    float4& xpi, float4& xpj)
{
    float3 rpij;
    rpij.x = xpi.x - xpj.x;
    rpij.y = xpi.y - xpj.y;
    rpij.z = xpi.z - xpj.z;

    float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
    float rrpr   = rij.x * rpij.x + rij.y * rpij.y + rij.z * rpij.z;
    float diff   = fabs(ld - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);

    if (diff >= 1.0f)
    {
        float acor = (ld - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + rijSq);

        float3 dr;
        dr.x = rij.x * acor;
        dr.y = rij.y * acor;
        dr.z = rij.z * acor;

        xpi.x += dr.x * psA->InvMassI;
        xpi.y += dr.y * psA->InvMassI;
        xpi.z += dr.z * psA->InvMassI;
        xpj.x -= dr.x * invMassJ;
        xpj.y -= dr.y * invMassJ;
        xpj.z -= dr.z * invMassJ;
        return true;
    }
    return false;
}

// Applies the constraints listed in cSim.pShakeID to the trial positions in
// cSim.pPosqP, using cSim.pOldPosq as the reference geometry.
//
// Each constraint entry names a central atom (atomID.x) and up to three partner
// atoms (atomID.y, and atomID.z / atomID.w when they are not -1). The thread
// iterates at most 15 times, correcting each pair in turn, and stops early once
// no pair needed a correction in a full pass. The corrected trial positions are
// written back to cSim.pPosqP.
//
// Per-thread scratch data lives in a __shared__ Atom array of
// G8X_THREADS_PER_BLOCK entries.
__global__ void kApplyFirstShake_kernel()
{
    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
    Atom* psA = &sA[threadIdx.x];

    const unsigned int threadCount = blockDim.x * gridDim.x;

    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
         pos < cSim.ShakeConstraints;
         pos += threadCount)
    {
        int4   atomID = cSim.pShakeID[pos];
        float4 params = cSim.pShakeParameter[pos];

        // Old (reference) and trial positions; absent partners stay zero.
        float4 apos  = cSim.pOldPosq[atomID.x];
        float4 xpi   = cSim.pPosqP[atomID.x];
        float4 apos1 = cSim.pOldPosq[atomID.y];
        float4 xpj1  = cSim.pPosqP[atomID.y];
        float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 xpj2  = {0.0f, 0.0f, 0.0f, 0.0f};

        psA->InvMassI  = params.x;
        psA->M         = params.y;
        psA->d2        = params.z;
        float invMassJ = params.w;

        if (atomID.z != -1)
        {
            apos2 = cSim.pOldPosq[atomID.z];
            xpj2  = cSim.pPosqP[atomID.z];
        }

        float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 xpj3  = {0.0f, 0.0f, 0.0f, 0.0f};
        if (atomID.w != -1)
        {
            apos3 = cSim.pOldPosq[atomID.w];
            xpj3  = cSim.pPosqP[atomID.w];
        }

#ifndef DeltaShake
        // Work on displacements relative to the old positions.
        xpi.x  -= apos.x;   xpi.y  -= apos.y;   xpi.z  -= apos.z;
        xpj1.x -= apos1.x;  xpj1.y -= apos1.y;  xpj1.z -= apos1.z;
        xpj2.x -= apos2.x;  xpj2.y -= apos2.y;  xpj2.z -= apos2.z;
        xpj3.x -= apos3.x;  xpj3.y -= apos3.y;  xpj3.z -= apos3.z;
#endif

        // Reference difference vectors and their squared lengths.
        psA->rij1.x = apos.x - apos1.x;
        psA->rij1.y = apos.y - apos1.y;
        psA->rij1.z = apos.z - apos1.z;
        psA->rij2.x = apos.x - apos2.x;
        psA->rij2.y = apos.y - apos2.y;
        psA->rij2.z = apos.z - apos2.z;
        psA->rij3.x = apos.x - apos3.x;
        psA->rij3.y = apos.y - apos3.y;
        psA->rij3.z = apos.z - apos3.z;

        psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
        psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
        psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;

        float ld1 = psA->d2 - psA->rij1sq;
        float ld2 = psA->d2 - psA->rij2sq;
        float ld3 = psA->d2 - psA->rij3sq;

        // Iterate until a full pass applies no correction, at most 15 passes.
        bool converged = false;
        for (int iteration = 0; iteration < 15 && !converged; iteration++)
        {
            converged = true;

            if (kApplyFirstShake_correctPair(psA->rij1, psA->rij1sq, ld1, psA, invMassJ, xpi, xpj1))
                converged = false;

            if (atomID.z != -1)
            {
                if (kApplyFirstShake_correctPair(psA->rij2, psA->rij2sq, ld2, psA, invMassJ, xpi, xpj2))
                    converged = false;
            }

            if (atomID.w != -1)
            {
                if (kApplyFirstShake_correctPair(psA->rij3, psA->rij3sq, ld3, psA, invMassJ, xpi, xpj3))
                    converged = false;
            }
        }

#ifndef DeltaShake
        // Back to absolute positions.
        xpi.x  += apos.x;   xpi.y  += apos.y;   xpi.z  += apos.z;
        xpj1.x += apos1.x;  xpj1.y += apos1.y;  xpj1.z += apos1.z;
        xpj2.x += apos2.x;  xpj2.y += apos2.y;  xpj2.z += apos2.z;
        xpj3.x += apos3.x;  xpj3.y += apos3.y;  xpj3.z += apos3.z;
#endif

        cSim.pPosqP[atomID.x] = xpi;
        cSim.pPosqP[atomID.y] = xpj1;
        if (atomID.z != -1)
            cSim.pPosqP[atomID.z] = xpj2;
        if (atomID.w != -1)
            cSim.pPosqP[atomID.w] = xpj3;
    }
}
// Host-side launcher for kApplyFirstShake_kernel.
//
// Does nothing unless gpu->sim.ShakeConstraints is positive; otherwise launches
// gpu->sim.blocks blocks of gpu->sim.shake_threads_per_block threads and runs
// the LAUNCHERROR check.
void kApplyFirstShake(gpuContext gpu)
{
    if (!(gpu->sim.ShakeConstraints > 0))
        return;

    kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
    LAUNCHERROR("kApplyFirstShake");
}
// Second half of the integrator update (variant without centre-of-mass output).
//
// For every atom index handled by this thread the kernel
//   1. recomputes the velocity from the trial position in pPosqP (with
//      DeltaShake defined, pPosqP holds a displacement and is simply scaled by
//      oneOverFix1; otherwise the current position is subtracted first),
//   2. builds Xmh from the stored vVector and three random values,
//   3. draws a new xVector from three further random values,
//   4. adds xVector - Xmh to the trial position and stores it in pPosq,
//   5. stores the velocity and the new xVector.
// Afterwards thread 0 of each block advances the block's entry in
// pRandomPosition by paddedNumberOfAtoms, wrapping by cSim.randoms.
//
// xVector.w is set to 0 explicitly (as kUpdatePart1_kernel does for vVector.w)
// so that no uninitialised value is written to pxVector4.
__global__ void kUpdatePart2_kernel()
{
    unsigned int pos  = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    // Every thread must have read rpos before thread 0 overwrites it below.
    __syncthreads();

    while (pos < cSim.atoms)
    {
        float4 velocity = cSim.pVelm4[pos];
#ifndef DeltaShake
        float4 apos     = cSim.pPosq[pos];
#endif
        float4 xPrime   = cSim.pPosqP[pos];
        float4 vVector  = cSim.pvVector4[pos];
        float4 random4b = cSim.pRandom4b[rpos + pos];
        float2 random2b = cSim.pRandom2b[rpos + pos];

        float sqrtInvMass = sqrt(velocity.w);

#ifdef DeltaShake
        velocity.x = xPrime.x * cSim.oneOverFix1;
        velocity.y = xPrime.y * cSim.oneOverFix1;
        velocity.z = xPrime.z * cSim.oneOverFix1;
#else
        velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
        velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
        velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif

        float3 Xmh;
        Xmh.x = vVector.x * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.x;
        Xmh.y = vVector.y * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.y;
        Xmh.z = vVector.z * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.z;

        float4 xVector;
        xVector.x = sqrtInvMass * random4b.w;
        xVector.y = sqrtInvMass * random2b.x;
        xVector.z = sqrtInvMass * random2b.y;
        xVector.w = 0.0f;   // previously left uninitialised

        xPrime.x += xVector.x - Xmh.x;
        xPrime.y += xVector.y - Xmh.y;
        xPrime.z += xVector.z - Xmh.z;

        cSim.pPosq[pos]     = xPrime;
        cSim.pVelm4[pos]    = velocity;
        cSim.pxVector4[pos] = xVector;

        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }
}
// Second half of the integrator update, variant that also produces the
// per-block linear-momentum term consumed by kUpdatePart1CM_kernel.
//
// The per-atom work is the same as in kUpdatePart2_kernel. In addition each
// thread accumulates (1 / velocity.w) * velocity over its atoms, scales the sum
// by cSim.inverseTotalMass, and the block reduces these values in shared
// memory; thread 0 writes the block total to cSim.pLinearMomentum[blockIdx.x]
// (w component set to 0).
//
// Expects blockDim.x float3 entries of dynamic shared memory.
//
// xVector.w is set to 0 explicitly so that no uninitialised value is written to
// pxVector4, and the float4 used for the final store no longer shadows the
// per-thread float3 accumulator.
__global__ void kUpdatePart2CM_kernel()
{
    extern __shared__ float3 sCM[];

    unsigned int pos  = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    float3 CM         = {0.0f, 0.0f, 0.0f};
    // Every thread must have read rpos before thread 0 overwrites it below.
    __syncthreads();

    while (pos < cSim.atoms)
    {
        float4 velocity = cSim.pVelm4[pos];
#ifndef DeltaShake
        float4 apos     = cSim.pPosq[pos];
#endif
        float4 xPrime   = cSim.pPosqP[pos];
        float4 vVector  = cSim.pvVector4[pos];
        float4 random4b = cSim.pRandom4b[rpos + pos];
        float2 random2b = cSim.pRandom2b[rpos + pos];

        float mass        = 1.0f / velocity.w;
        float sqrtInvMass = sqrt(velocity.w);

#ifdef DeltaShake
        velocity.x = xPrime.x * cSim.oneOverFix1;
        velocity.y = xPrime.y * cSim.oneOverFix1;
        velocity.z = xPrime.z * cSim.oneOverFix1;
#else
        velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
        velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
        velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif

        // Mass-weighted velocity contribution of this atom.
        CM.x += mass * velocity.x;
        CM.y += mass * velocity.y;
        CM.z += mass * velocity.z;

        float3 Xmh;
        Xmh.x = vVector.x * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.x;
        Xmh.y = vVector.y * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.y;
        Xmh.z = vVector.z * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.z;

        float4 xVector;
        xVector.x = sqrtInvMass * random4b.w;
        xVector.y = sqrtInvMass * random2b.x;
        xVector.z = sqrtInvMass * random2b.y;
        xVector.w = 0.0f;   // previously left uninitialised

        xPrime.x += xVector.x - Xmh.x;
        xPrime.y += xVector.y - Xmh.y;
        xPrime.z += xVector.z - Xmh.z;

        cSim.pPosq[pos]     = xPrime;
        cSim.pVelm4[pos]    = velocity;
        cSim.pxVector4[pos] = xVector;

        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }

    // Scale CM
    CM.x *= cSim.inverseTotalMass;
    CM.y *= cSim.inverseTotalMass;
    CM.z *= cSim.inverseTotalMass;
    sCM[threadIdx.x] = CM;
    __syncthreads();

    // Reduce CM for the block: in each round a thread whose index has the low
    // bits covered by mask cleared adds the entry offset slots above it.
    unsigned int offset = 1;
    unsigned int mask   = 1;
    while (offset < blockDim.x)
    {
        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        mask    = 2 * mask + 1;
        offset *= 2;
        __syncthreads();
    }

    // Publish the block total.
    if (threadIdx.x == 0)
    {
        float4 blockCM;
        blockCM.x = sCM[0].x;
        blockCM.y = sCM[0].y;
        blockCM.z = sCM[0].z;
        blockCM.w = 0.0f;
        cSim.pLinearMomentum[blockIdx.x] = blockCM;
    }
}
// Forward declaration of the random-number refill launcher, which is defined
// outside this file. kUpdatePart2() calls it each time its call counter reaches
// gpu->sim.randomIterations.
extern
void
kGenerateRandoms
(
gpuContext
gpu
);
// Host-side launcher for the second half of the integrator update.
//
// When gpu->bCalculateCM is set, the centre-of-mass variant is launched (with
// one float3 of dynamic shared memory per thread), bCalculateCM is cleared and
// bRemoveCM is set so that the next kUpdatePart1() uses its CM variant.
// Otherwise the plain variant is launched.
//
// A function-local static counter tracks the number of calls; when it reaches
// gpu->sim.randomIterations the random buffers are regenerated through
// kGenerateRandoms() and the counter restarts at zero.
void kUpdatePart2(gpuContext gpu)
{
    if (gpu->bCalculateCM)
    {
        kUpdatePart2CM_kernel<<<gpu->sim.blocks,
                                gpu->sim.update_threads_per_block,
                                gpu->sim.update_threads_per_block * sizeof(float3)>>>();
        LAUNCHERROR("kUpdatePart2CM");
        gpu->bCalculateCM = false;
        gpu->bRemoveCM    = true;
    }
    else
    {
        kUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kUpdatePart2");
    }

    // Refill the random buffers once enough steps have consumed them.
    static int iteration = 0;
    if (++iteration == gpu->sim.randomIterations)
    {
        kGenerateRandoms(gpu);
        iteration = 0;
    }
}
// Second SHAKE pass over the constraint clusters listed in cSim.pShakeID.
//
// Each cluster entry names a central atom (.x) and up to three partners
// (.y, .z, .w); a partner index of -1 marks an unused slot. The matching
// cSim.pShakeParameter entry supplies, in order, InvMassI, M, d2 and the
// partners' inverse mass.
//
// For every pair the kernel works with displacements relative to cSim.pOldPosq.
// With rij the old separation and rpij the current displacement difference,
// the residual  d2 - |rij + rpij|^2  is driven towards zero: while
// |residual| / (d2 * cSim.shakeTolerance) >= 1 the two atoms are moved along
// rij, scaled by their inverse masses. At most 15 sweeps over the cluster are
// performed. The results (old position + corrected displacement) are written
// back to cSim.pPosq.
//
// When DeltaShake is not defined, the values read from cSim.pPosq are first
// converted to displacements by subtracting the old positions; when it is
// defined they are used as read (presumably they already are displacements --
// confirm against the integrator kernels).
__global__ void kApplySecondShake_kernel()
{
    // One scratch record per thread, kept in shared memory.
    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
    Atom* psA = &sA[threadIdx.x];

    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
         pos < cSim.ShakeConstraints;
         pos += blockDim.x * gridDim.x)
    {
        const int4   atomID = cSim.pShakeID[pos];
        const float4 params = cSim.pShakeParameter[pos];
        const bool   hasJ2  = (atomID.z != -1);
        const bool   hasJ3  = (atomID.w != -1);

        // Old positions and working values for the central atom and first partner.
        const float4 oldI  = cSim.pOldPosq[atomID.x];
        float4       xpi   = cSim.pPosq[atomID.x];
        const float4 oldJ1 = cSim.pOldPosq[atomID.y];
        float4       xpj1  = cSim.pPosq[atomID.y];

        // Optional partners stay zeroed when their slot is unused.
        float4 oldJ2 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        float4 xpj2  = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        if (hasJ2)
        {
            oldJ2 = cSim.pOldPosq[atomID.z];
            xpj2  = cSim.pPosq[atomID.z];
        }
        float4 oldJ3 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        float4 xpj3  = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        if (hasJ3)
        {
            oldJ3 = cSim.pOldPosq[atomID.w];
            xpj3  = cSim.pPosq[atomID.w];
        }

        // Unpack the per-cluster parameters.
        psA->InvMassI        = params.x;
        psA->M               = params.y;
        psA->d2              = params.z;
        const float invMassJ = params.w;

        // xyz parts of the old positions.
        const float3 xi  = make_float3(oldI.x,  oldI.y,  oldI.z);
        const float3 xj1 = make_float3(oldJ1.x, oldJ1.y, oldJ1.z);
        const float3 xj2 = make_float3(oldJ2.x, oldJ2.y, oldJ2.z);
        const float3 xj3 = make_float3(oldJ3.x, oldJ3.y, oldJ3.z);

#ifndef DeltaShake
        // Convert the working values into displacements from the old positions.
        xpi.x  -= xi.x;   xpi.y  -= xi.y;   xpi.z  -= xi.z;
        xpj1.x -= xj1.x;  xpj1.y -= xj1.y;  xpj1.z -= xj1.z;
        xpj2.x -= xj2.x;  xpj2.y -= xj2.y;  xpj2.z -= xj2.z;
        xpj3.x -= xj3.x;  xpj3.y -= xj3.y;  xpj3.z -= xj3.z;
#endif

        // Old separation vectors from the central atom to each partner.
        psA->rij1.x = xi.x - xj1.x;
        psA->rij1.y = xi.y - xj1.y;
        psA->rij1.z = xi.z - xj1.z;
        psA->rij2.x = xi.x - xj2.x;
        psA->rij2.y = xi.y - xj2.y;
        psA->rij2.z = xi.z - xj2.z;
        psA->rij3.x = xi.x - xj3.x;
        psA->rij3.y = xi.y - xj3.y;
        psA->rij3.z = xi.z - xj3.z;

        // ...and their squared lengths.
        psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
        psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
        psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;

        // d2 minus the old squared separation, per pair.
        const float gap1 = psA->d2 - psA->rij1sq;
        const float gap2 = psA->d2 - psA->rij2sq;
        const float gap3 = psA->d2 - psA->rij3sq;

        bool converged = false;
        for (int sweep = 0; sweep < 15 && !converged; sweep++)
        {
            // Assume success; any pair that needs a correction clears the flag.
            converged = true;

            // ---- central atom / partner 1 (always present) ----
            {
                const float3 rpij   = make_float3(xpi.x - xpj1.x, xpi.y - xpj1.y, xpi.z - xpj1.z);
                const float  rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
                const float  rrpr   = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
                const float  resid  = gap1 - 2.0f * rrpr - rpsqij;
                float diff = fabs(resid) / (psA->d2 * cSim.shakeTolerance);
                if (diff >= 1.0f)
                {
                    const float  acor = resid * psA->M / (rrpr + psA->rij1sq);
                    const float3 dr   = make_float3(psA->rij1.x * acor, psA->rij1.y * acor, psA->rij1.z * acor);
                    xpi.x  += dr.x * psA->InvMassI;
                    xpi.y  += dr.y * psA->InvMassI;
                    xpi.z  += dr.z * psA->InvMassI;
                    xpj1.x -= dr.x * invMassJ;
                    xpj1.y -= dr.y * invMassJ;
                    xpj1.z -= dr.z * invMassJ;
                    converged = false;
                }
            }

            // ---- central atom / partner 2 ----
            if (hasJ2)
            {
                const float3 rpij   = make_float3(xpi.x - xpj2.x, xpi.y - xpj2.y, xpi.z - xpj2.z);
                const float  rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
                const float  rrpr   = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
                const float  resid  = gap2 - 2.0f * rrpr - rpsqij;
                float diff = fabs(resid) / (psA->d2 * cSim.shakeTolerance);
                if (diff >= 1.0f)
                {
                    const float  acor = resid * psA->M / (rrpr + psA->rij2sq);
                    const float3 dr   = make_float3(psA->rij2.x * acor, psA->rij2.y * acor, psA->rij2.z * acor);
                    xpi.x  += dr.x * psA->InvMassI;
                    xpi.y  += dr.y * psA->InvMassI;
                    xpi.z  += dr.z * psA->InvMassI;
                    xpj2.x -= dr.x * invMassJ;
                    xpj2.y -= dr.y * invMassJ;
                    xpj2.z -= dr.z * invMassJ;
                    converged = false;
                }
            }

            // ---- central atom / partner 3 ----
            if (hasJ3)
            {
                const float3 rpij   = make_float3(xpi.x - xpj3.x, xpi.y - xpj3.y, xpi.z - xpj3.z);
                const float  rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
                const float  rrpr   = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
                const float  resid  = gap3 - 2.0f * rrpr - rpsqij;
                float diff = fabs(resid) / (psA->d2 * cSim.shakeTolerance);
                if (diff >= 1.0f)
                {
                    const float  acor = resid * psA->M / (rrpr + psA->rij3sq);
                    const float3 dr   = make_float3(psA->rij3.x * acor, psA->rij3.y * acor, psA->rij3.z * acor);
                    xpi.x  += dr.x * psA->InvMassI;
                    xpi.y  += dr.y * psA->InvMassI;
                    xpi.z  += dr.z * psA->InvMassI;
                    xpj3.x -= dr.x * invMassJ;
                    xpj3.y -= dr.y * invMassJ;
                    xpj3.z -= dr.z * invMassJ;
                    converged = false;
                }
            }
        }

        // Turn the corrected displacements back into positions (.w is left as read).
        xpi.x  += xi.x;   xpi.y  += xi.y;   xpi.z  += xi.z;
        xpj1.x += xj1.x;  xpj1.y += xj1.y;  xpj1.z += xj1.z;
        xpj2.x += xj2.x;  xpj2.y += xj2.y;  xpj2.z += xj2.z;
        xpj3.x += xj3.x;  xpj3.y += xj3.y;  xpj3.z += xj3.z;

        // Store results; unused partner slots are never written.
        cSim.pPosq[atomID.x] = xpi;
        cSim.pPosq[atomID.y] = xpj1;
        if (hasJ2)
            cSim.pPosq[atomID.z] = xpj2;
        if (hasJ3)
            cSim.pPosq[atomID.w] = xpj3;
    }
}
// For every atom index listed in cSim.pNonShakeID, adds the xyz components of
// the atom's entry in cSim.pOldPosq to its entry in cSim.pPosq. The .w
// component of the pPosq entry is written back unchanged.
__global__ void kApplyNoShake_kernel()
{
    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
         pos < cSim.NonShakeConstraints;
         pos += blockDim.x * gridDim.x)
    {
        const int    atom    = cSim.pNonShakeID[pos];
        const float4 oldPos  = cSim.pOldPosq[atom];
        float4       updated = cSim.pPosq[atom];

        updated.x += oldPos.x;
        updated.y += oldPos.y;
        updated.z += oldPos.z;

        cSim.pPosq[atom] = updated;
    }
}
// Empty stub: the body contains no statements, so calling it has no effect.
// The only reference visible nearby is a commented-out call at the top of
// kApplySecondShake. Presumably a placeholder for a host-side (CPU) version
// of the second SHAKE pass -- TODO confirm before relying on it.
void
kCPUShake2
(
gpuContext
gpu
)
{
}
// Host-side launcher for the second SHAKE pass.
//
// Launches kApplySecondShake_kernel when the simulation has at least one SHAKE
// constraint. When DeltaShake is defined at compile time it additionally
// launches kApplyNoShake_kernel for the atoms listed as non-SHAKE, provided
// there is at least one. Each launch is followed by LAUNCHERROR.
void kApplySecondShake(gpuContext gpu)
{
    const dim3 gridDims(gpu->sim.blocks);

    if (gpu->sim.ShakeConstraints > 0)
    {
        const dim3 shakeBlockDims(gpu->sim.shake_threads_per_block);
        kApplySecondShake_kernel<<<gridDims, shakeBlockDims>>>();
        LAUNCHERROR("kApplySecondShake");
    }

#ifdef DeltaShake
    // Atoms that take no part in SHAKE are handled by a separate kernel.
    if (gpu->sim.NonShakeConstraints > 0)
    {
        const dim3 nonShakeBlockDims(gpu->sim.nonshake_threads_per_block);
        kApplyNoShake_kernel<<<gridDims, nonShakeBlockDims>>>();
        LAUNCHERROR("kApplyNoShake");
    }
#endif
}
platforms/cuda/src/kernels/kVerletUpdate.cu
0 → 100755
View file @
38f6c8f8
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using
namespace
std
;
#include "gputypes.h"
// Compile-time switch. With it defined, the kernels in this file take the
// #else side of their "#ifndef DeltaShake" sections: part 1 stores
// velocity * deltaT in pPosqP instead of an updated position, and part 2 adds
// the value read from pPosq back in before storing the result.
#define DeltaShake
// Constant-memory copy of the simulation parameters used by the kernels in
// this file. SetVerletUpdateSim writes it from gpu->sim and GetVerletUpdateSim
// reads it back.
static
__constant__
cudaGmxSimulation
cSim
;
// Uploads gpu->sim into this file's constant-memory symbol cSim.
// The copy result is handed to RTERROR together with a descriptive message.
void SetVerletUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory symbol cSim back into gpu->sim.
// The copy result is handed to RTERROR together with a descriptive message.
//
// The message names the "GetSim" direction so a failure here is not mistaken
// for one in SetVerletUpdateSim (the text previously said "SetSim").
void GetVerletUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// First half of the Verlet step, without centre-of-mass correction.
//
// For each atom: saves the current pPosq entry to pOldPosq, advances the
// velocity by deltaT * velocity.w * force (velocity.w is used as the inverse
// mass), and stores a provisional value in pPosqP:
//   - DeltaShake undefined: old position + velocity * deltaT
//   - DeltaShake defined:   velocity * deltaT only
// The .w component of the pPosq entry is carried into pPosqP unchanged. The
// updated velocity is written back to pVelm4.
__global__ void kVerletUpdatePart1_kernel()
{
    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;

    // Barrier kept from the original; this kernel uses no shared memory.
    __syncthreads();

    for (; pos < cSim.atoms; pos += blockDim.x * gridDim.x)
    {
        float4       posq = cSim.pPosq[pos];
        float4       vel  = cSim.pVelm4[pos];
        const float4 frc  = cSim.pForce4[pos];
        const float  dtOverMass = cSim.deltaT * vel.w;

        cSim.pOldPosq[pos] = posq;

        vel.x += dtOverMass * frc.x;
        vel.y += dtOverMass * frc.y;
        vel.z += dtOverMass * frc.z;

#ifndef DeltaShake
        posq.x += vel.x * cSim.deltaT;
        posq.y += vel.y * cSim.deltaT;
        posq.z += vel.z * cSim.deltaT;
#else
        posq.x = vel.x * cSim.deltaT;
        posq.y = vel.y * cSim.deltaT;
        posq.z = vel.z * cSim.deltaT;
#endif

        cSim.pPosqP[pos] = posq;
        cSim.pVelm4[pos] = vel;
    }
}
// First half of the Verlet step with centre-of-mass velocity removal.
//
// Stage 1: every block sums all gridDim.x entries of cSim.pLinearMomentum (the
//          per-block values written by kVerletUpdatePart2CM_kernel) using a
//          shared-memory reduction, leaving the total in sCM[0].
// Stage 2: same update as kVerletUpdatePart1_kernel, except that sCM[0] is
//          subtracted from each atom's velocity increment.
//
// Requires blockDim.x * sizeof(float3) bytes of dynamic shared memory.
__global__ void kVerletUpdatePart1CM_kernel()
{
    extern __shared__ float3 sCM[];

    // Each thread accumulates a strided subset of the per-block entries.
    float3 partial = make_float3(0.0f, 0.0f, 0.0f);
    for (unsigned int entry = threadIdx.x; entry < gridDim.x; entry += blockDim.x)
    {
        const float4 blockCM = cSim.pLinearMomentum[entry];
        partial.x += blockCM.x;
        partial.y += blockCM.y;
        partial.z += blockCM.z;
    }
    sCM[threadIdx.x].x = partial.x;
    sCM[threadIdx.x].y = partial.y;
    sCM[threadIdx.x].z = partial.z;
    __syncthreads();

    // Pairwise reduction in shared memory; the stride doubles every round and
    // the mask selects the threads whose low bits are all zero for that round.
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        const bool active = ((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x);
        if (active)
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        __syncthreads();
    }

    // Integrate, subtracting the reduced total from each velocity increment.
    for (unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
         pos < cSim.atoms;
         pos += blockDim.x * gridDim.x)
    {
        float4       posq = cSim.pPosq[pos];
        float4       vel  = cSim.pVelm4[pos];
        const float4 frc  = cSim.pForce4[pos];
        const float  dtOverMass = cSim.deltaT * vel.w;

        cSim.pOldPosq[pos] = posq;

        vel.x += dtOverMass * frc.x - sCM[0].x;
        vel.y += dtOverMass * frc.y - sCM[0].y;
        vel.z += dtOverMass * frc.z - sCM[0].z;

#ifndef DeltaShake
        posq.x += vel.x * cSim.deltaT;
        posq.y += vel.y * cSim.deltaT;
        posq.z += vel.z * cSim.deltaT;
#else
        posq.x = vel.x * cSim.deltaT;
        posq.y = vel.y * cSim.deltaT;
        posq.z = vel.z * cSim.deltaT;
#endif

        cSim.pPosqP[pos] = posq;
        cSim.pVelm4[pos] = vel;
    }
}
// Host-side launcher for the first half of the Verlet step.
//
// If gpu->bRemoveCM is set, runs the centre-of-mass-correcting kernel (with
// one float3 of dynamic shared memory per thread) and clears the flag;
// otherwise runs the plain kernel. Each launch is followed by LAUNCHERROR.
void kVerletUpdatePart1(gpuContext gpu)
{
    const dim3 gridDims(gpu->sim.blocks);
    const dim3 blockDims(gpu->sim.update_threads_per_block);

    if (!gpu->bRemoveCM)
    {
        kVerletUpdatePart1_kernel<<<gridDims, blockDims>>>();
        LAUNCHERROR("kVerletUpdatePart1");
        return;
    }

    const size_t sharedBytes = gpu->sim.update_threads_per_block * sizeof(float3);
    kVerletUpdatePart1CM_kernel<<<gridDims, blockDims, sharedBytes>>>();
    LAUNCHERROR("kVerletUpdatePart1CM");
    gpu->bRemoveCM = false;
}
// Second half of the Verlet step, without centre-of-mass bookkeeping.
//
// For each atom, recomputes the velocity from the value in pPosqP and stores
// the final position in pPosq:
//   - DeltaShake undefined: velocity = (pPosqP - pPosq) * oneOverDeltaT and
//     pPosqP is stored as the new position.
//   - DeltaShake defined:   velocity = pPosqP * oneOverDeltaT and
//     pPosqP + pPosq (xyz only) is stored as the new position.
// The velocity's .w component is written back as read.
__global__ void kVerletUpdatePart2_kernel()
{
    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;

    // Barrier kept from the original; this kernel uses no shared memory.
    __syncthreads();

    for (; pos < cSim.atoms; pos += blockDim.x * gridDim.x)
    {
        float4       vel    = cSim.pVelm4[pos];
        const float4 posq   = cSim.pPosq[pos];
        float4       xPrime = cSim.pPosqP[pos];

#ifndef DeltaShake
        vel.x = cSim.oneOverDeltaT * (xPrime.x - posq.x);
        vel.y = cSim.oneOverDeltaT * (xPrime.y - posq.y);
        vel.z = cSim.oneOverDeltaT * (xPrime.z - posq.z);
#else
        vel.x = cSim.oneOverDeltaT * (xPrime.x);
        vel.y = cSim.oneOverDeltaT * (xPrime.y);
        vel.z = cSim.oneOverDeltaT * (xPrime.z);
        xPrime.x += posq.x;
        xPrime.y += posq.y;
        xPrime.z += posq.z;
#endif

        cSim.pPosq[pos]  = xPrime;
        cSim.pVelm4[pos] = vel;
    }
}
// Second half of the Verlet step that also accumulates linear momentum.
//
// Performs the same per-atom update as kVerletUpdatePart2_kernel. In addition
// each thread sums (1 / velocity.w) * velocity over its atoms, scales the sum
// by cSim.inverseTotalMass, and the block reduces these sums in shared memory.
// Thread 0 stores the block total (with .w = 0) in
// cSim.pLinearMomentum[blockIdx.x].
//
// Requires blockDim.x * sizeof(float3) bytes of dynamic shared memory.
__global__ void kVerletUpdatePart2CM_kernel()
{
    extern __shared__ float3 sCM[];

    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    float3 momentum = make_float3(0.0f, 0.0f, 0.0f);

    // Barrier kept from the original; shared memory is not touched before it.
    __syncthreads();

    for (; pos < cSim.atoms; pos += blockDim.x * gridDim.x)
    {
        float4       vel    = cSim.pVelm4[pos];
        const float4 posq   = cSim.pPosq[pos];
        float4       xPrime = cSim.pPosqP[pos];
        const float  mass   = 1.0f / vel.w;   // velocity.w is used as the inverse mass

#ifndef DeltaShake
        vel.x = cSim.oneOverDeltaT * (xPrime.x - posq.x);
        vel.y = cSim.oneOverDeltaT * (xPrime.y - posq.y);
        vel.z = cSim.oneOverDeltaT * (xPrime.z - posq.z);
#else
        vel.x = cSim.oneOverDeltaT * (xPrime.x);
        vel.y = cSim.oneOverDeltaT * (xPrime.y);
        vel.z = cSim.oneOverDeltaT * (xPrime.z);
        xPrime.x += posq.x;
        xPrime.y += posq.y;
        xPrime.z += posq.z;
#endif

        momentum.x += mass * vel.x;
        momentum.y += mass * vel.y;
        momentum.z += mass * vel.z;

        cSim.pPosq[pos]  = xPrime;
        cSim.pVelm4[pos] = vel;
    }

    // Scale this thread's sum before it enters the block reduction.
    momentum.x *= cSim.inverseTotalMass;
    momentum.y *= cSim.inverseTotalMass;
    momentum.z *= cSim.inverseTotalMass;
    sCM[threadIdx.x] = momentum;
    __syncthreads();

    // Pairwise reduction in shared memory; the stride doubles every round and
    // the mask selects the threads whose low bits are all zero for that round.
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        const bool active = ((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x);
        if (active)
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        __syncthreads();
    }

    // One result per block, padded to a float4.
    if (threadIdx.x == 0)
    {
        float4 blockTotal;
        blockTotal.x = sCM[0].x;
        blockTotal.y = sCM[0].y;
        blockTotal.z = sCM[0].z;
        blockTotal.w = 0.0f;
        cSim.pLinearMomentum[blockIdx.x] = blockTotal;
    }
}
// Host-side launcher for the second half of the Verlet step.
//
// If gpu->bCalculateCM is set, runs the momentum-accumulating kernel (with one
// float3 of dynamic shared memory per thread), clears bCalculateCM and sets
// bRemoveCM so that the next kVerletUpdatePart1 call applies the correction.
// Otherwise runs the plain kernel. Each launch is followed by LAUNCHERROR.
void kVerletUpdatePart2(gpuContext gpu)
{
    const dim3 gridDims(gpu->sim.blocks);
    const dim3 blockDims(gpu->sim.update_threads_per_block);

    if (!gpu->bCalculateCM)
    {
        kVerletUpdatePart2_kernel<<<gridDims, blockDims>>>();
        LAUNCHERROR("kVerletUpdatePart2");
        return;
    }

    const size_t sharedBytes = gpu->sim.update_threads_per_block * sizeof(float3);
    kVerletUpdatePart2CM_kernel<<<gridDims, blockDims, sharedBytes>>>();
    LAUNCHERROR("kVerletUpdatePart2CM");
    gpu->bCalculateCM = false;
    gpu->bRemoveCM    = true;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment