Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
cb130f92
"platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp" did not exist on "85da5e0f9018719e95350c76b483a2160c84d9d3"
Commit
cb130f92
authored
Sep 25, 2008
by
Mark Friedrichs
Browse files
Mods
parent
cc8b4de0
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2220 additions
and
373 deletions
+2220
-373
platforms/brook/src/BrookBonded.cpp
platforms/brook/src/BrookBonded.cpp
+273
-0
platforms/brook/src/BrookBonded.h
platforms/brook/src/BrookBonded.h
+52
-0
platforms/brook/src/BrookCalcGBSAOBCForceFieldKernel.cpp
platforms/brook/src/BrookCalcGBSAOBCForceFieldKernel.cpp
+2
-3
platforms/brook/src/BrookCalcGBSAOBCForceFieldKernel.h
platforms/brook/src/BrookCalcGBSAOBCForceFieldKernel.h
+1
-1
platforms/brook/src/BrookCalcStandardMMForceFieldKernel.cpp
platforms/brook/src/BrookCalcStandardMMForceFieldKernel.cpp
+3
-4
platforms/brook/src/BrookCalcStandardMMForceFieldKernel.h
platforms/brook/src/BrookCalcStandardMMForceFieldKernel.h
+1
-1
platforms/brook/src/BrookFloatStreamInternal.h
platforms/brook/src/BrookFloatStreamInternal.h
+1
-1
platforms/brook/src/BrookPlatform.cpp
platforms/brook/src/BrookPlatform.cpp
+11
-1
platforms/brook/src/BrookRandomNumberGenerator.cpp
platforms/brook/src/BrookRandomNumberGenerator.cpp
+2
-6
platforms/brook/src/BrookShakeAlgorithm.cpp
platforms/brook/src/BrookShakeAlgorithm.cpp
+0
-4
platforms/brook/src/BrookStochasticDynamics.cpp
platforms/brook/src/BrookStochasticDynamics.cpp
+4
-4
platforms/brook/src/BrookStreamInternal.h
platforms/brook/src/BrookStreamInternal.h
+0
-1
platforms/brook/src/gpu/Release/gmxgpu.lib
platforms/brook/src/gpu/Release/gmxgpu.lib
+0
-0
platforms/brook/src/gpu/gmxgpu.lib
platforms/brook/src/gpu/gmxgpu.lib
+0
-0
platforms/brook/src/gpu/gmxgpu_d.lib
platforms/brook/src/gpu/gmxgpu_d.lib
+0
-0
platforms/brook/src/gpu/invmap.cpp
platforms/brook/src/gpu/invmap.cpp
+0
-237
platforms/brook/src/gpu/kObcBaseD2.br
platforms/brook/src/gpu/kObcBaseD2.br
+1855
-0
platforms/brook/src/gpu/kangles.br
platforms/brook/src/gpu/kangles.br
+0
-76
platforms/brook/src/gpu/kcommon.h
platforms/brook/src/gpu/kcommon.h
+15
-0
platforms/brook/src/gpu/kcommonOrig.br
platforms/brook/src/gpu/kcommonOrig.br
+0
-34
No files found.
platforms/brook/src/BrookBonded.cpp
View file @
cb130f92
...
...
@@ -1536,3 +1536,276 @@ std::string BrookBonded::getContentsString( int level ) const {
return
message
.
str
();
}
/*
* Helper functions for building inverse maps for
* torsions, impropers and angles.
*
* For each atom, calculates the positions at which it's
* forces are to be picked up from and stores the position
* in the appropriate index.
*
* Input: number of dihedrals, the atom indices, and a flag indicating
* whether we're doing i(0), j(1), k(2) or l(3)
* Output: an array of counts per atom
* arrays of inversemaps
* nimaps - the number of invmaps actually used.
*
* @param posflag 0-niatoms-1
* @param niatoms 3 for angles, 4 for torsions, impropers
* @param nints number of interactions
* @param natoms number of atoms
* @param *atoms gromacs interaction list
* @param nmaps maximum number of inverse maps
* @param counts[] output counts of how many places each atom occurs
* @param *invmaps[] output array of nmaps inverse maps
* @param *nimaps, output max number of inverse maps actually used
*
* @return DefaultReturnValue, unless error in which case exits w/ OpenMM exception
*
**/
int
BrookBonded
::
gpuCalcInvMap
(
int
posflag
,
int
niatoms
,
int
nints
,
int
natoms
,
int
*
atoms
,
int
nmaps
,
int
counts
[],
float4
*
invmaps
[],
int
*
nimaps
){
// ---------------------------------------------------------------------------------------
int
i
,
j
;
int
atom
;
int
mapnum
,
mapcomp
;
static
const
std
::
string
methodName
=
"BrookBonded::gpuCalcInvMap"
;
static
const
unsigned
int
MAX_LINE_CHARS
=
256
;
//char value[MAX_LINE_CHARS];
static
const
char
*
Set
=
"Set"
;
static
const
char
*
NotSet
=
"Not set"
;
static
const
int
PrintOn
=
0
;
// ---------------------------------------------------------------------------------------
memset
(
counts
,
0
,
sizeof
(
int
)
*
natoms
);
for
(
i
=
0
;
i
<
nmaps
;
i
++
){
for
(
j
=
0
;
j
<
natoms
;
j
++
){
invmaps
[
i
][
j
]
=
float4
(
-
1.0
,
-
1.0
,
-
1.0
,
-
1.0
);
}
}
//This will hold the number of imaps actually used
*
nimaps
=
-
1
;
//Now note down the positions where each atom occurs
if
(
PrintOn
&&
getLog
()
){
(
void
)
fprintf
(
getLog
(),
"%s: pos=%d ni=%d nints=%d natoms=%d nmaps=<%d>
\n
"
,
methodName
.
c_str
(),
posflag
,
niatoms
,
nints
,
natoms
,
nmaps
);
(
void
)
fflush
(
getLog
()
);
}
int
atomRange
[
2
]
=
{
90000000
,
-
90000000
};
int
mapnumRange
[
2
]
=
{
90000000
,
-
90000000
};
for
(
i
=
0
;
i
<
nints
;
i
++
){
//This is our atom
atom
=
atoms
[
(
niatoms
+
1
)
*
i
+
posflag
+
1
];
//Special for merged bondeds
if
(
atom
==
-
1
){
continue
;
}
if
(
atom
<
atomRange
[
0
]
){
atomRange
[
0
]
=
atom
;
}
if
(
atom
>
atomRange
[
1
]
){
atomRange
[
1
]
=
atom
;
}
//Check to make sure we're inside the limits
if
(
counts
[
atom
]
>
nmaps
*
4
){
if
(
PrintOn
&&
getLog
()
){
(
void
)
fprintf
(
getLog
(),
"%s Atom %d has too many proper dihedrals(%d, max %d)
\n
"
,
methodName
.
c_str
(),
atom
,
counts
[
atom
],
nmaps
*
4
);
(
void
)
fflush
(
getLog
()
);
}
std
::
stringstream
message
;
message
<<
methodName
<<
" Atom "
<<
atom
<<
" has too many proper dihedrals; valid range:("
<<
counts
[
atom
]
<<
", "
<<
nmaps
*
4
<<
")"
;
throw
OpenMMException
(
message
.
str
()
);
}
//Which invmap will this go into
mapnum
=
counts
[
atom
]
/
4
;
if
(
mapnum
>
*
nimaps
)
*
nimaps
=
mapnum
;
//Which component will it be
mapcomp
=
counts
[
atom
]
%
4
;
//Set it
//This is silly, but otherwise I have to declare it as float*
//and things get even more confusing. :)
switch
(
mapcomp
){
case
0
:
invmaps
[
mapnum
][
atom
].
x
=
(
float
)
i
;
break
;
case
1
:
invmaps
[
mapnum
][
atom
].
y
=
(
float
)
i
;
break
;
case
2
:
invmaps
[
mapnum
][
atom
].
z
=
(
float
)
i
;
break
;
case
3
:
invmaps
[
mapnum
][
atom
].
w
=
(
float
)
i
;
break
;
default:
if
(
PrintOn
&&
getLog
()
){
(
void
)
fprintf
(
getLog
(),
"mapcomp %d invalid -- impossible!
\n
"
,
mapcomp
);
(
void
)
fflush
(
getLog
()
);
}
std
::
stringstream
message
;
message
<<
methodName
<<
" mapcomp "
<<
mapcomp
<<
" invalid -- actually impossible!"
;
throw
OpenMMException
(
message
.
str
()
);
break
;
}
counts
[
atom
]
++
;
if
(
mapnum
<
mapnumRange
[
0
]
){
mapnumRange
[
0
]
=
mapnum
;
}
if
(
mapnum
>
mapnumRange
[
1
]
){
mapnumRange
[
1
]
=
mapnum
;
}
//fprintf( gpu->log, "%d atom=%d mapcomp=%d counts[]=%d mapnum=%d\n", i, atom, mapcomp, counts[atom], mapnum );
}
(
*
nimaps
)
++
;
if
(
PrintOn
&&
getLog
()
){
(
void
)
fprintf
(
getLog
(),
"%s mnmaps=%d Ranges: atom [%d %d] mapnum [%d %d]
\n
"
,
methodName
.
c_str
(),
*
nimaps
,
atomRange
[
0
],
atomRange
[
1
],
mapnumRange
[
0
],
mapnumRange
[
1
]
);
(
void
)
fflush
(
getLog
()
);
}
return
DefaultReturnValue
;
}
void
BrookBonded
::
gpuPrintInvMaps
(
int
nmaps
,
int
natoms
,
int
counts
[],
float4
*
invmap
[],
FILE
*
logFile
){
int
i
;
int
j
;
for
(
i
=
0
;
i
<
natoms
;
i
++
){
fprintf
(
logFile
,
"%d %d "
,
i
,
counts
[
i
]
);
for
(
j
=
0
;
j
<
nmaps
;
j
++
){
fprintf
(
logFile
,
"%6.0f %6.0f %6.0f %6.0f"
,
invmap
[
j
][
i
].
x
,
invmap
[
j
][
i
].
y
,
invmap
[
j
][
i
].
z
,
invmap
[
j
][
i
].
w
);
}
fprintf
(
logFile
,
"
\n
"
);
}
}
/* We are still plagued by kernel call overheads. This is for a big fat
* merged inverse gather kernel:
* Since we have 32 bit floats, we have 23 bits of mantissa or the largest
* integer we can represent is 2^23. So it should be quite safe to add
* 100000 * n to the index where n is the stream in which we should do the
* lookup. This assumes that nints < 100000, preferably nints << 100000
* which should always be true
* */
int
BrookBonded
::
gpuCalcInvMap_merged
(
int
nints
,
//number of interactions
int
natoms
,
//number of atoms
int
*
atoms
,
//ijkl,ijkl,ijkl...
int
nmaps
,
//maximum number of inverse maps
int
counts
[],
//output counts of how many places each atom occurs
float4
*
invmaps
[],
//output array of nmaps inverse maps
int
*
nimaps
//output max number of inverse maps actually used
){
int
i
,
j
;
int
atom
;
int
mapnum
,
mapcomp
;
int
pos
;
for
(
i
=
0
;
i
<
natoms
;
i
++
)
counts
[
i
]
=
0
;
for
(
i
=
0
;
i
<
nmaps
;
i
++
){
for
(
j
=
0
;
j
<
natoms
;
j
++
){
invmaps
[
i
][
j
]
=
float4
(
-
1.0
,
-
1.0
,
-
1.0
,
-
1.0
);
}
}
//This will hold the number of imaps actually used
*
nimaps
=
-
1
;
//For each atom
for
(
i
=
0
;
i
<
nints
;
i
++
){
for
(
j
=
0
;
j
<
4
;
j
++
){
atom
=
atoms
[
i
*
4
+
j
];
if
(
atom
==
-
1
){
//Nothing to be done for this atom, go to next
continue
;
}
//Which map
mapnum
=
counts
[
atom
]
/
4
;
//Make sure we have space
if
(
mapnum
>=
nmaps
){
printf
(
"Atom %d has too many bondeds(%d, max %d)
\n
"
,
atom
,
counts
[
atom
],
nmaps
*
4
);
return
0
;
}
if
(
mapnum
>
*
nimaps
){
*
nimaps
=
mapnum
;
}
//Which component
mapcomp
=
counts
[
atom
]
%
4
;
//Encode target stream and position
pos
=
100000
*
j
+
i
;
switch
(
mapcomp
){
case
0
:
invmaps
[
mapnum
][
atom
].
x
=
(
float
)
pos
;
break
;
case
1
:
invmaps
[
mapnum
][
atom
].
y
=
(
float
)
pos
;
break
;
case
2
:
invmaps
[
mapnum
][
atom
].
z
=
(
float
)
pos
;
break
;
case
3
:
invmaps
[
mapnum
][
atom
].
w
=
(
float
)
pos
;
break
;
}
counts
[
atom
]
++
;
}
}
(
*
nimaps
)
++
;
return
1
;
}
/* Repacks the invmap streams for more efficient access in the
* merged inverse gather kernel
*
* buf should be nimaps * natoms large.
* */
int
BrookBonded
::
gpuRepackInvMap_merged
(
int
natoms
,
int
nmaps
,
int
*
counts
,
float4
*
invmaps
[],
float4
*
buf
){
int
i
,
j
;
int
nmaps_i
;
for
(
i
=
0
;
i
<
natoms
;
i
++
){
for
(
j
=
0
;
j
<
nmaps
;
j
++
){
buf
[
i
+
j
*
natoms
]
=
float4
(
-
1.0
f
,
-
1.0
f
,
-
1.0
f
,
-
1.0
f
);
}
}
for
(
i
=
0
;
i
<
natoms
;
i
++
){
nmaps_i
=
counts
[
i
]
/
4
;
if
(
counts
[
i
]
%
4
)
nmaps_i
++
;
for
(
j
=
0
;
j
<
nmaps_i
;
j
++
){
buf
[
i
+
j
*
natoms
]
=
invmaps
[
j
][
i
];
}
}
return
1
;
}
platforms/brook/src/BrookBonded.h
View file @
cb130f92
...
...
@@ -421,6 +421,58 @@ class BrookBonded : public BrookCommon {
int
validateInverseMapStreamCount
(
int
index
,
int
count
)
const
;
/*
* Helper functions for building inverse maps for
* torsions, impropers and angles.
*
* For each atom, calculates the positions at which it's
* forces are to be picked up from and stores the position
* in the appropriate index.
*
* Input: number of dihedrals, the atom indices, and a flag indicating
* whether we're doing i(0), j(1), k(2) or l(3)
* Output: an array of counts per atom
* arrays of inversemaps
* nimaps - the number of invmaps actually used.
*
* @param posflag 0-niatoms-1
* @param niatoms 3 for angles, 4 for torsions, impropers
* @param nints number of interactions
* @param natoms number of atoms
* @param *atoms gromacs interaction list
* @param nmaps maximum number of inverse maps
* @param counts[] output counts of how many places each atom occurs
* @param *invmaps[] output array of nmaps inverse maps
* @param *nimaps, output max number of inverse maps actually used
*
* @return DefaultReturnValue, unless error in which case exits w/ OpenMM exception
*
**/
int
gpuCalcInvMap
(
int
posflag
,
int
niatoms
,
int
nints
,
int
natoms
,
int
*
atoms
,
int
nmaps
,
int
counts
[],
float4
*
invmaps
[],
int
*
nimaps
);
void
gpuPrintInvMaps
(
int
nmaps
,
int
natoms
,
int
counts
[],
float4
*
invmap
[],
FILE
*
logFile
);
/* We are still plagued by kernel call overheads. This is for a big fat
* merged inverse gather kernel:
* Since we have 32 bit floats, we have 23 bits of mantissa or the largest
* integer we can represent is 2^23. So it should be quite safe to add
* 100000 * n to the index where n is the stream in which we should do the
* lookup. This assumes that nints < 100000, preferably nints << 100000
* which should always be true
* */
int
gpuCalcInvMap_merged
(
int
nints
,
int
natoms
,
int
*
atoms
,
int
nmaps
,
int
counts
[],
float4
*
invmaps
[],
int
*
nimaps
);
/* Repacks the invmap streams for more efficient access in the
* merged inverse gather kernel
*
* buf should be nimaps * natoms large.
* */
int
gpuRepackInvMap_merged
(
int
natoms
,
int
nmaps
,
int
*
counts
,
float4
*
invmaps
[],
float4
*
buf
);
};
}
// namespace OpenMM
...
...
platforms/brook/src/BrookCalcGBSAOBCForceFieldKernel.cpp
View file @
cb130f92
...
...
@@ -36,9 +36,8 @@
#include "BrookStreamImpl.h"
#include "BrookCalcGBSAOBCForceFieldKernel.h"
#include "force.h"
#include "kgbsa.h"
#include "kforce.h"
#include "gpu/kgbsa.h"
#include "gpu/kforce.h"
#include "math.h"
using
namespace
OpenMM
;
...
...
platforms/brook/src/BrookCalcGBSAOBCForceFieldKernel.h
View file @
cb130f92
...
...
@@ -33,7 +33,7 @@
* -------------------------------------------------------------------------- */
#include "kernels.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "
../../reference/src/
SimTKUtilities/SimTKOpenMMRealType.h"
#include "BrookGbsa.h"
namespace
OpenMM
{
...
...
platforms/brook/src/BrookCalcStandardMMForceFieldKernel.cpp
View file @
cb130f92
...
...
@@ -36,10 +36,9 @@
#include "BrookStreamImpl.h"
#include "BrookCalcStandardMMForceFieldKernel.h"
#include "kforce.h"
#include "kinvmap_gather.h"
#include "
gpu/
kforce.h"
#include "
gpu/
kinvmap_gather.h"
#include "ReferencePlatform.h"
#include "ReferenceFloatStreamImpl.h"
#include "VerletIntegrator.h"
#include "StandardMMForceField.h"
...
...
@@ -254,7 +253,7 @@ void BrookCalcStandardMMForceFieldKernel::initialize(
const
vector
<
int
>
periodicTorsionInd
=
*
periodicTorsionI_it
;
const
vector
<
double
>
periodicTorsionPrm
=
*
periodicTorsionP_it
;
_refForceField
->
setPeriodicTorsionParameters
(
ii
,
periodicTorsionInd
[
0
],
periodicTorsionInd
[
1
],
periodicTorsionInd
[
2
],
periodicTorsionInd
[
3
],
periodicTorsionPrm
[
2
],
periodicTorsionPrm
[
1
],
periodicTorsionPrm
[
0
]
);
(
int
)
(
periodicTorsionPrm
[
2
]
+
0.001
)
,
periodicTorsionPrm
[
1
],
periodicTorsionPrm
[
0
]
);
/*
printf( "PeriodicTor: [%d %d %d %d] [%.5e %.5e %.5e]\n", periodicTorsionInd[0], periodicTorsionInd[1], periodicTorsionInd[2], periodicTorsionInd[3],
periodicTorsionPrm[2], periodicTorsionPrm[1], periodicTorsionPrm[0] ); fflush( stdout );
...
...
platforms/brook/src/BrookCalcStandardMMForceFieldKernel.h
View file @
cb130f92
...
...
@@ -33,7 +33,7 @@
* -------------------------------------------------------------------------- */
#include "kernels.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "
../../reference/src/
SimTKUtilities/SimTKOpenMMRealType.h"
#include "BrookBonded.h"
#include "BrookNonBonded.h"
#include "StandardMMForceField.h"
...
...
platforms/brook/src/BrookFloatStreamInternal.h
View file @
cb130f92
...
...
@@ -34,7 +34,7 @@
#include "BrookPlatform.h"
#include "BrookStreamInternal.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "
../../reference/src/
SimTKUtilities/SimTKOpenMMRealType.h"
namespace
OpenMM
{
...
...
platforms/brook/src/BrookPlatform.cpp
View file @
cb130f92
...
...
@@ -33,7 +33,7 @@
#include "BrookKernelFactory.h"
#include "OpenMMException.h"
#include "kernels.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "
../../reference/src/
SimTKUtilities/SimTKOpenMMRealType.h"
#include <brook/brook.hpp>
#include <stdlib.h>
#include <sstream>
...
...
@@ -83,10 +83,20 @@ BrookPlatform::BrookPlatform( ){
// get Brook runtime
#ifdef WIN32
char
*
runtime
;
size_t
numberOfEnv
;
_dupenv_s
(
&
runtime
,
&
numberOfEnv
,
"brt_runtime"
);
#else
char
*
runtime
=
getenv
(
"brt_runtime"
);
#endif
_initializeKernelFactory
(
);
_setBrookRuntime
(
runtime
);
#ifdef WIN32
free
(
runtime
);
#endif
}
/**
...
...
platforms/brook/src/BrookRandomNumberGenerator.cpp
View file @
cb130f92
...
...
@@ -32,11 +32,7 @@
#include <sstream>
#include "BrookRandomNumberGenerator.h"
#include "OpenMMException.h"
#include "kupdatesd.h"
// use random number generator
#include "SimTKOpenMMUtilities.h"
#include "gpu/kupdatesd.h"
using
namespace
OpenMM
;
using
namespace
std
;
...
...
@@ -244,7 +240,7 @@ inline int MaxInt( unsigned int x, unsigned int y ){ return x > y ? x : y; }
*/
void
BrookRandomNumberGenerator
::
_generateRandomsKiss
(
float
*
randomV1
,
float
*
randomV2
,
float
*
randomV3
,
unsigned
int
state
[
4
]
){
unsigned
int
state
[
4
]
){
// ---------------------------------------------------------------------------------------
...
...
platforms/brook/src/BrookShakeAlgorithm.cpp
View file @
cb130f92
...
...
@@ -35,10 +35,6 @@
#include "OpenMMException.h"
#include "BrookStreamImpl.h"
// use random number generator
#include "SimTKOpenMMUtilities.h"
using
namespace
OpenMM
;
using
namespace
std
;
...
...
platforms/brook/src/BrookStochasticDynamics.cpp
View file @
cb130f92
...
...
@@ -34,13 +34,13 @@
#include "BrookPlatform.h"
#include "OpenMMException.h"
#include "BrookStreamImpl.h"
#include "kshakeh.h"
#include "kupdatesd.h"
#include "kcommon.h"
#include "
gpu/
kshakeh.h"
#include "
gpu/
kupdatesd.h"
#include "
gpu/
kcommon.h"
// use random number generator
#include "SimTKOpenMMUtilities.h"
#include "
../../reference/src/SimTKUtilities/
SimTKOpenMMUtilities.h"
using
namespace
OpenMM
;
using
namespace
std
;
...
...
platforms/brook/src/BrookStreamInternal.h
View file @
cb130f92
...
...
@@ -33,7 +33,6 @@
* -------------------------------------------------------------------------- */
#include <brook/brook.hpp>
#include "SimTKUtilities/SimTKOpenMMRealType.h"
namespace
OpenMM
{
...
...
platforms/brook/src/gpu/Release/gmxgpu.lib
deleted
100755 → 0
View file @
cc8b4de0
File deleted
platforms/brook/src/gpu/gmxgpu.lib
deleted
100755 → 0
View file @
cc8b4de0
File deleted
platforms/brook/src/gpu/gmxgpu_d.lib
deleted
100755 → 0
View file @
cc8b4de0
File deleted
platforms/brook/src/gpu/invmap.cpp
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
#include <stdio.h>
#include <brook/brook.hpp>
// #include "typedefs.h"
#include "invmap.h"
/*
* Helper functions for building inverse maps for
* torsions, impropers and angles.
*
* */
/*
* For each atom, calculates the positions at which it's
* forces are to be picked up from and stores the position
* in the appropriate index.
*
* Input: number of dihedrals, the atom indices, and a flag indicating
* whether we're doing i(0), j(1), k(2) or l(3)
* Output: an array of counts per atom
* arrays of inversemaps
* nimaps - the number of invmaps actually used.
*
* */
int
gpuCalcInvMap
(
int
posflag
,
//0-niatoms-1
int
niatoms
,
//3 for angles, 4 for torsions, impropers
int
nints
,
//number of interactions
int
natoms
,
//number of atoms
int
*
atoms
,
//gromacs interaction list
int
nmaps
,
//maximum number of inverse maps
int
counts
[],
//output counts of how many places each atom occurs
float4
*
invmaps
[],
//output array of nmaps inverse maps
int
*
nimaps
//output max number of inverse maps actually used
)
{
int
i
,
j
;
int
atom
;
int
mapnum
,
mapcomp
;
for
(
i
=
0
;
i
<
natoms
;
i
++
)
counts
[
i
]
=
0
;
for
(
i
=
0
;
i
<
nmaps
;
i
++
)
{
for
(
j
=
0
;
j
<
natoms
;
j
++
)
{
invmaps
[
i
][
j
]
=
float4
(
-
1.0
,
-
1.0
,
-
1.0
,
-
1.0
);
}
}
//This will hold the number of imaps actually used
*
nimaps
=
-
1
;
printf
(
"gpuCalcInvMap: posflag=%d niatoms=%d nints=%d natoms=%d nmaps=%d
\n
"
,
posflag
,
niatoms
,
nints
,
natoms
,
nmaps
);
//Now note down the positions where each atom occurs
for
(
i
=
0
;
i
<
nints
;
i
++
)
{
//This is our atom
atom
=
atoms
[
(
niatoms
+
1
)
*
i
+
posflag
+
1
];
//Special for merged bondeds
if
(
atom
==
-
1
)
{
continue
;
}
//Check to make sure we're inside the limits
if
(
counts
[
atom
]
>
nmaps
*
4
)
{
printf
(
"Atom %d has too many proper dihedrals(%d, max %d)
\n
"
,
atom
,
counts
[
atom
],
nmaps
*
4
);
return
0
;
}
//Which invmap will this go into
mapnum
=
counts
[
atom
]
/
4
;
if
(
mapnum
>
*
nimaps
)
*
nimaps
=
mapnum
;
//Which component will it be
mapcomp
=
counts
[
atom
]
%
4
;
//Set it
//This is silly, but otherwise I have to declare it as float*
//and things get even more confusing. :)
switch
(
mapcomp
)
{
case
0
:
invmaps
[
mapnum
][
atom
].
x
=
(
float
)
i
;
break
;
case
1
:
invmaps
[
mapnum
][
atom
].
y
=
(
float
)
i
;
break
;
case
2
:
invmaps
[
mapnum
][
atom
].
z
=
(
float
)
i
;
break
;
case
3
:
invmaps
[
mapnum
][
atom
].
w
=
(
float
)
i
;
break
;
}
counts
[
atom
]
++
;
printf
(
"Atom %d count=%d max %d mapcomp=%d val=%d mapnum=%d
\n
"
,
atom
,
counts
[
atom
],
nmaps
*
4
,
mapcomp
,
i
,
mapnum
);
}
(
*
nimaps
)
++
;
return
1
;
}
void
gpuPrintInvMaps
(
int
nmaps
,
int
natoms
,
int
counts
[],
float4
*
invmap
[],
FILE
*
logFile
)
{
int
i
;
int
j
;
for
(
i
=
0
;
i
<
natoms
;
i
++
)
{
fprintf
(
logFile
,
"%d %d "
,
i
,
counts
[
i
]
);
for
(
j
=
0
;
j
<
nmaps
;
j
++
)
{
fprintf
(
logFile
,
"%6.0f %6.0f %6.0f %6.0f"
,
invmap
[
j
][
i
].
x
,
invmap
[
j
][
i
].
y
,
invmap
[
j
][
i
].
z
,
invmap
[
j
][
i
].
w
);
}
fprintf
(
logFile
,
"
\n
"
);
}
}
/* We are still plagued by kernel call overheads. This is for a big fat
* merged inverse gather kernel:
* Since we have 32 bit floats, we have 23 bits of mantissa or the largest
* integer we can represent is 2^23. So it should be quite safe to add
* 100000 * n to the index where n is the stream in which we should do the
* lookup. This assumes that nints < 100000, preferably nints << 100000
* which should always be true
* */
int
gpuCalcInvMap_merged
(
int
nints
,
//number of interactions
int
natoms
,
//number of atoms
int
*
atoms
,
//ijkl,ijkl,ijkl...
int
nmaps
,
//maximum number of inverse maps
int
counts
[],
//output counts of how many places each atom occurs
float4
*
invmaps
[],
//output array of nmaps inverse maps
int
*
nimaps
//output max number of inverse maps actually used
)
{
int
i
,
j
;
int
atom
;
int
mapnum
,
mapcomp
;
int
pos
;
for
(
i
=
0
;
i
<
natoms
;
i
++
)
counts
[
i
]
=
0
;
for
(
i
=
0
;
i
<
nmaps
;
i
++
)
{
for
(
j
=
0
;
j
<
natoms
;
j
++
)
{
invmaps
[
i
][
j
]
=
float4
(
-
1.0
,
-
1.0
,
-
1.0
,
-
1.0
);
}
}
//This will hold the number of imaps actually used
*
nimaps
=
-
1
;
//For each atom
for
(
i
=
0
;
i
<
nints
;
i
++
)
{
for
(
j
=
0
;
j
<
4
;
j
++
)
{
atom
=
atoms
[
i
*
4
+
j
];
if
(
atom
==
-
1
)
{
//Nothing to be done for this atom, go to next
continue
;
}
//Which map
mapnum
=
counts
[
atom
]
/
4
;
//Make sure we have space
if
(
mapnum
>=
nmaps
)
{
printf
(
"Atom %d has too many bondeds(%d, max %d)
\n
"
,
atom
,
counts
[
atom
],
nmaps
*
4
);
return
0
;
}
if
(
mapnum
>
*
nimaps
)
{
*
nimaps
=
mapnum
;
}
//Which component
mapcomp
=
counts
[
atom
]
%
4
;
//Encode target stream and position
pos
=
100000
*
j
+
i
;
switch
(
mapcomp
)
{
case
0
:
invmaps
[
mapnum
][
atom
].
x
=
(
float
)
pos
;
break
;
case
1
:
invmaps
[
mapnum
][
atom
].
y
=
(
float
)
pos
;
break
;
case
2
:
invmaps
[
mapnum
][
atom
].
z
=
(
float
)
pos
;
break
;
case
3
:
invmaps
[
mapnum
][
atom
].
w
=
(
float
)
pos
;
break
;
}
counts
[
atom
]
++
;
}
}
(
*
nimaps
)
++
;
return
1
;
}
/* Repacks the invmap streams for more efficient access in the
* merged inverse gather kernel
*
* buf should be nimaps * natoms large.
* */
int
gpuRepackInvMap_merged
(
int
natoms
,
int
nmaps
,
int
*
counts
,
float4
*
invmaps
[],
float4
*
buf
)
{
int
i
,
j
;
int
nmaps_i
;
for
(
i
=
0
;
i
<
natoms
;
i
++
)
{
for
(
j
=
0
;
j
<
nmaps
;
j
++
)
{
buf
[
i
+
j
*
natoms
]
=
float4
(
-
1.0
f
,
-
1.0
f
,
-
1.0
f
,
-
1.0
f
);
}
}
for
(
i
=
0
;
i
<
natoms
;
i
++
)
{
nmaps_i
=
counts
[
i
]
/
4
;
if
(
counts
[
i
]
%
4
)
nmaps_i
++
;
for
(
j
=
0
;
j
<
nmaps_i
;
j
++
)
{
buf
[
i
+
j
*
natoms
]
=
invmaps
[
j
][
i
];
}
}
return
1
;
}
platforms/brook/src/gpu/kObcBaseD2.br
0 → 100644
View file @
cb130f92
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void loop2Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2;
float4 u_ij, u_ij2;
float4 t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
float4 diff_u2_l2;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work
mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
diff_u2_l2 = u_ij2 - l_ij2;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse;
de = mask*bornForce*t3*rInverse;
// de = float4( 1.0f, 1.0f, 1.0f, 1.0f );
// de = scaledRadiusJ2;
// de = mask*t3;
// de = mask*rPlusScaledJ;
// de = mask*r2;
// de = mask;
// de = mask*rInverse;
// de = mask*bornForce;
// Born radius term
// bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4;
bornSum += t3;
bornSum *= mask;
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void loop2InternalR2( float4 r2, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2Inverse;
float4 l_ij, l_ij2;
float4 u_ij, u_ij2;
float4 t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
float4 diff_u2_l2;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work
mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
diff_u2_l2 = u_ij2 - l_ij2;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse;
de = mask*bornForce*t3*rInverse;
// de = float4( 1.0f, 1.0f, 1.0f, 1.0f );
// de = scaledRadiusJ2;
// de = mask*t3;
// de = mask*rPlusScaledJ;
// de = mask*r2;
// de = mask;
// de = mask*rInverse;
// de = mask*bornForce;
// Born radius term
// bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4;
bornSum += t3;
bornSum *= mask;
}
/* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */
kernel void loop2InternalNoSum( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2;
float4 u_ij, u_ij2;
float4 t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
// mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
de = mask*bornForce*t3*rInverse;
// de = rInverse;
// de = float4( 1.0f, 1.0f, 1.0f, 1.0f );
// de = bornForce;
}
// This kernel includes terms dL/dr & dU/dr from paper
// these values are zero, but included here to confirm that is the case
kernel void loop2InternalX( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2, l_ij3;
float4 u_ij, u_ij2, u_ij3;
float4 t2,t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
float4 diff_u2_l2;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask, t2Mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work
mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
diff_u2_l2 = u_ij2 - l_ij2;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated (t2) w/ dL/dr & dU/dr are zero; included here to test that true
// loop2Internal() should be used
t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse;
l_ij3 = l_ij2*l_ij;
// t1 in Tinker code
t2 = 0.5f*l_ij2 + 0.25f*scaledRadiusJ2*l_ij3/r - 0.25f*(l_ij/r + l_ij3*r);
t2Mask = iAtomicRadii4 < rMinusScaledJ ? t2 : zero4;
t3 += t2Mask;
// t2 in Tinker code
u_ij3 = u_ij2*u_ij;
t2 = -0.5f*u_ij2 - 0.25f*scaledRadiusJ2*u_ij3/r + 0.25f*(u_ij/r + u_ij3*r);
t3 += t2;
de = mask*bornForce*t3*rInverse;
// Born radius term
// bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4;
bornSum += t3;
bornSum *= mask;
}
/* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */
kernel void loop2InternalNoSumX( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2, l_ij3;
float4 u_ij, u_ij2, u_ij3;
float4 t2, t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask, t2Mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
// mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
l_ij3 = l_ij2*l_ij;
// t1 in Tinker code
t2 = 0.5f*l_ij2 + 0.25f*scaledRadiusJ2*l_ij3/r - 0.25f*(l_ij/r + l_ij3*r);
t2Mask = iAtomicRadii4 < rMinusScaledJ ? t2 : zero4;
t3 += t2Mask;
// t2 in Tinker code
u_ij3 = u_ij2*u_ij;
t2 = -0.5f*u_ij2 - 0.25f*scaledRadiusJ2*u_ij3/r + 0.25f*(u_ij/r + u_ij3*r);
t3 += t2;
de = mask*bornForce*t3*rInverse;
}
/* ---------------------------------------------------------------------------------------
Calculate Loop 2 OBC forces (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms based on unroll
duplicationFactor: ?
strheight: atom stream height
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
posq: atom positions and charge
atomicRadii: atomic radii
scaledAtomicRadii: scaled atomic radii
bornForceFactor: (Born radii)**2*bornForce*Obcforce
bornForce1: i-unroll first force component, including dBorn_r/dr in .w
bornForce2: i-unroll second force component, including dBorn_r/dr in .w
bornForce3: i-unroll third force component, including dBorn_r/dr in .w
bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w
#endif
--------------------------------------------------------------------------------------- */
kernel void kObcLoop2( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth, float3 posq[][],
float2 scaledAtomicRadii[][],
float bornForceFactor[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// atomic radii: radius - dielectric offset
float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii;
float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii;
float4 iAtomicRadii, jAtomicRadii;
// scaled atomic radii: scaleFactor*( radius - dielectric offset )
float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii;
float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii;
float4 iScaledAtomicRadii, jScaledAtomicRadii;
// (Born radii**2)*BornForce*ObcChainForce
float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor;
float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor;
//float4 iBornForceFactor;
//float4 jBornForceFactor;
// i,j coordinates
float3 i1pos, i2pos, i3pos, i4pos;
float3 j1pos, j2pos, j3pos, j4pos;
// delta coordinates
float3 d1, d2, d3, d4;
// indices
float2 iAtom;
float forceIndex;
//This is forceIndex mod numberOfAtoms, the true i index
float4 de, bornSum;
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float tmp;
// ---------------------------------------------------------------------------------------
// constants
const float I_Unroll = 4.0f;
// ---------------------------------------------------------------------------------------
// set linear index
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather coordinates
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth );
// ---------------------------------------------------------------------------------------
// etch i1 position, Born prefactor, atomic radius
i1pos = posq[ iAtom ];
i1BornForceFactor = bornForceFactor[ iAtom ];
i1AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce1.xyz = float3( i1BornForceFactor, i1ScaledAtomicRadii, i1AtomicRadii );
// etch i2 position, Born prefactor, atomic radius
iAtom.x += 1;
i2pos = posq[ iAtom ];
i2BornForceFactor = bornForceFactor[ iAtom ];
i2AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce2.xyz = float3( i2BornForceFactor, i2ScaledAtomicRadii, i2AtomicRadii );
// etch i3 position, Born prefactor, atomic radius
iAtom.x += 1;
i3pos = posq[ iAtom ];
i3BornForceFactor = bornForceFactor[ iAtom ];
i3AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce3.xyz = float3( i3BornForceFactor, i3ScaledAtomicRadii, i3AtomicRadii );
// etch i4 position, Born prefactor, atomic radius
iAtom.x += 1;
i4pos = posq[ iAtom ];
i4BornForceFactor = bornForceFactor[ iAtom ];
i4AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce4.xyz = float3( i4BornForceFactor, i4ScaledAtomicRadii, i4AtomicRadii );
// create float4 for main vars
//iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor );
iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii );
iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii );
// inner loop indices setup
//changed the following instruction for rounding issues on some ASICs
//whichRep = floor( forceIndex / roundedUpAtoms );
tmp = fmod(forceIndex, roundedUpAtoms);
whichRep = round((forceIndex - tmp)/roundedUpAtoms);
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){
// ---------------------------------------------------------------------------------------
// gather required values
j1BornForceFactor = bornForceFactor[ jAtom ];
j1AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j1pos = posq[ jAtom ];
jAtom.x += 1.0f;
j2BornForceFactor = bornForceFactor[ jAtom ];
j2AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j2pos = posq[ jAtom ];
jAtom.x += 1.0f;
j3BornForceFactor = bornForceFactor[ jAtom ];
j3AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j3pos = posq[ jAtom ];
jAtom.x += 1.0f;
j4BornForceFactor = bornForceFactor[ jAtom ];
j4AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j4pos = posq[ jAtom ];
jAtom.x += 1.0f;
//jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor );
jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii );
jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii );
// ---------------------------------------------------------------------------------------
// First set of 4 -- i == 1
d1 = j1pos - i1pos;
d2 = j2pos - i1pos;
d3 = j3pos - i1pos;
d4 = j4pos - i1pos;
// i = 1
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum );
bornForce1.xyz += de.x*d1;
bornForce1.xyz += de.y*d2;
bornForce1.xyz += de.z*d3;
bornForce1.xyz += de.w*d4;
bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 2
d1 = j1pos - i2pos;
d2 = j2pos - i2pos;
d3 = j3pos - i2pos;
d4 = j4pos - i2pos;
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum );
bornForce2.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce2.xyz += de.z*d3;
bornForce2.xyz += de.w*d4;
bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 3
d1 = j1pos - i3pos;
d2 = j2pos - i3pos;
d3 = j3pos - i3pos;
d4 = j4pos - i3pos;
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum );
bornForce3.xyz += de.x*d1;
bornForce3.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce3.xyz += de.w*d4;
bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 4
d1 = j1pos - i4pos;
d2 = j2pos - i4pos;
d3 = j3pos - i4pos;
d4 = j4pos - i4pos;
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum );
bornForce4.xyz += de.x*d1;
bornForce4.xyz += de.y*d2;
bornForce4.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// j = 1
d1 = j1pos - i1pos;
d2 = j1pos - i2pos;
d3 = j1pos - i3pos;
d4 = j1pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 2
d1 = j2pos - i1pos;
d2 = j2pos - i2pos;
d3 = j2pos - i3pos;
d4 = j2pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 3
d1 = j3pos - i1pos;
d2 = j3pos - i2pos;
d3 = j3pos - i3pos;
d4 = j3pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 4
d1 = j4pos - i1pos;
d2 = j4pos - i2pos;
d3 = j4pos - i3pos;
d4 = j4pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// increment linear index by 4
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
/* ---------------------------------------------------------------------------------------
Calculate Born radii and Obc chain derivative term
numberOfAtoms: no. of atoms
streamWidth: atom stream width
forces input forces
atomicRadii atomic radii
bornRadii output Born radii
obcChain output Obc chain derivative
--------------------------------------------------------------------------------------- */
kernel void kObcBornRadii( float numberOfAtoms, float streamWidth, float4 forces<>,
float atomicRadii<>, out float bornRadii<>, out float obcChain<> ){
// ---------------------------------------------------------------------------------------
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float2 iAtom;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum = forces.w;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
}
kernel float4 scalar_force_CDLJMerge( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) {
float4 invr, invrsig2, invrsig6;
float4 f;
invr = rsqrt( r2 );
invrsig2 = invr * sig;
invrsig2 = invrsig2 * invrsig2;
invrsig6 = invrsig2 * invrsig2 * invrsig2;
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
f += epsfac*qq*invr;
f *= invr*invr*2.39f;
return f;
}
/* ---------------------------------------------------------------------------------------
Calculate Loop 2 OBC forces (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms based on unroll
duplicationFactor: ?
strheight: atom stream height
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
posq: atom positions and charge
atomicRadii: atomic radii
scaledAtomicRadii: scaled atomic radii
bornForceFactor: (Born radii)**2*bornForce*Obcforce
bornForce1: i-unroll first force component, including dBorn_r/dr in .w
bornForce2: i-unroll second force component, including dBorn_r/dr in .w
bornForce3: i-unroll third force component, including dBorn_r/dr in .w
bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w
#endif
--------------------------------------------------------------------------------------- */
kernel void kObcLoop2Cdlj4( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth,
float jstreamWidth, float epsfac, float4 posq[][],
float atomicRadii[][], float scaledAtomicRadii[][],
float bornForceFactor[][], float4 isigeps[][],
float4 sigma[][], float4 epsilon[][], float excl[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// atomic radii: radius - dielectric offset
float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii;
float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii;
float4 iAtomicRadii, jAtomicRadii;
// scaled atomic radii: scaleFactor*( radius - dielectric offset )
float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii;
float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii;
float4 iScaledAtomicRadii, jScaledAtomicRadii;
// (Born radii**2)*BornForce*ObcChainForce
float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor;
float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor;
//float4 iBornForceFactor;
// i,j coordinates
float3 i1pos, i2pos, i3pos, i4pos;
float3 j1pos, j2pos, j3pos, j4pos;
// delta coordinates
float3 d1, d2, d3, d4;
// indices
float2 iAtom;
float forceIndex;
float i1q, i2q, i3q, i4q;
float i1sig, i2sig, i3sig, i4sig;
float i1eps, i2eps, i3eps, i4eps;
float j1q, j2q, j3q, j4q;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2, r2Masked;
float3 pad;
float4 exclmask;
float2 iatom;
float2 jstrindex;
float linind;
float4 jsig, jeps;
float4 jQ,qq;
//This is forceIndex mod numberOfAtoms, the true i index
float4 de, fs, bornSum;
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
// ---------------------------------------------------------------------------------------
// constants
const float I_Unroll = 4.0f;
const float4 exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
// ---------------------------------------------------------------------------------------
// set linear index
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather coordinates
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth );
// ---------------------------------------------------------------------------------------
// fetch i1 position, Born prefactor, atomic radius
i1pos = posq[ iAtom ].xyz;
i1q = posq[ iAtom ].w;
i1BornForceFactor = bornForceFactor[ iAtom ];
i1AtomicRadii = atomicRadii[ iAtom ];
i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i1sig = jstrindex.x;
i1eps = jstrindex.y;
bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i2 position, Born prefactor, atomic radius
iAtom.x += 1;
i2pos = posq[ iAtom ].xyz;
i2q = posq[ iAtom ].w;
i2BornForceFactor = bornForceFactor[ iAtom ];
i2AtomicRadii = atomicRadii[ iAtom ];
i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i2sig = jstrindex.x;
i2eps = jstrindex.y;
bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i3 position, Born prefactor, atomic radius
iAtom.x += 1;
i3pos = posq[ iAtom ].xyz;
i3q = posq[ iAtom ].w;
i3BornForceFactor = bornForceFactor[ iAtom ];
i3AtomicRadii = atomicRadii[ iAtom ];
i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i3sig = jstrindex.x;
i3eps = jstrindex.y;
bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i4 position, Born prefactor, atomic radius
iAtom.x += 1;
i4pos = posq[ iAtom ].xyz;
i4q = posq[ iAtom ].w;
i4BornForceFactor = bornForceFactor[ iAtom ];
i4AtomicRadii = atomicRadii[ iAtom ];
i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i4sig = jstrindex.x;
i4eps = jstrindex.y;
bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// create float4 for main vars
//iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor );
iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii );
iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii );
// inner loop indices setup
whichRep = floor( forceIndex / roundedUpAtoms );
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
exclind.x = iAtomLinearIndex;
exclind.y = jStart*streamWidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){
// ---------------------------------------------------------------------------------------
// gather required values
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jAtom.x + jAtom.y*streamWidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstreamWidth))/jstreamWidth);
jstrindex.x = linind - jstrindex.y*jstreamWidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
j1BornForceFactor = bornForceFactor[ jAtom ];
j1AtomicRadii = atomicRadii[ jAtom ];
j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j1pos = posq[ jAtom ].xyz;
j1q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j2BornForceFactor = bornForceFactor[ jAtom ];
j2AtomicRadii = atomicRadii[ jAtom ];
j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j2pos = posq[ jAtom ].xyz;
j2q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j3BornForceFactor = bornForceFactor[ jAtom ];
j3AtomicRadii = atomicRadii[ jAtom ];
j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j3pos = posq[ jAtom ].xyz;
j3q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j4BornForceFactor = bornForceFactor[ jAtom ];
j4AtomicRadii = atomicRadii[ jAtom ];
j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j4pos = posq[ jAtom ].xyz;
j4q = posq[ jAtom ].w;
jAtom.x += 1.0f;
//jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor );
jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii );
jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii );
jQ = float4( j1q, j2q, j3q, j4q );
// ---------------------------------------------------------------------------------------
// First set of 4 -- i == 1
sig = i1sig + jsig;
eps = i1eps * jeps;
qq = i1q*jQ;
d1 = j1pos - i1pos;
d2 = j2pos - i1pos;
d3 = j3pos - i1pos;
d4 = j4pos - i1pos;
// i = 1
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce1.xyz += de.x*d1;
bornForce1.xyz += de.y*d2;
bornForce1.xyz += de.z*d3;
bornForce1.xyz += de.w*d4;
bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 2
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = i2sig + jsig;
eps = i2eps * jeps;
qq = i2q*jQ;
d1 = j1pos - i2pos;
d2 = j2pos - i2pos;
d3 = j3pos - i2pos;
d4 = j4pos - i2pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce2.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce2.xyz += de.z*d3;
bornForce2.xyz += de.w*d4;
bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 3
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = i3sig + jsig;
eps = i3eps * jeps;
qq = i3q*jQ;
d1 = j1pos - i3pos;
d2 = j2pos - i3pos;
d3 = j3pos - i3pos;
d4 = j4pos - i3pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce3.xyz += de.x*d1;
bornForce3.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce3.xyz += de.w*d4;
bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 4
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = i4sig + jsig;
eps = i4eps * jeps;
qq = i4q*jQ;
d1 = j1pos - i4pos;
d2 = j2pos - i4pos;
d3 = j3pos - i4pos;
d4 = j4pos - i4pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce4.xyz += de.x*d1;
bornForce4.xyz += de.y*d2;
bornForce4.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// j = 1
d1 = j1pos - i1pos;
d2 = j1pos - i2pos;
d3 = j1pos - i3pos;
d4 = j1pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 2
d1 = j2pos - i1pos;
d2 = j2pos - i2pos;
d3 = j2pos - i3pos;
d4 = j2pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 3
d1 = j3pos - i1pos;
d2 = j3pos - i2pos;
d3 = j3pos - i3pos;
d4 = j3pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 4
d1 = j4pos - i1pos;
d2 = j4pos - i2pos;
d3 = j4pos - i3pos;
d4 = j4pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
//exclind.x -= 3.0f;
exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
exclind.y = floor( exclind.y + 1.25f );
// increment linear index by 4
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
/* ---------------------------------------------------------------------------------------
Calculate Loop 2 OBC forces (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms based on unroll
duplicationFactor: ?
strheight: atom stream height
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
posq: atom positions and charge
atomicRadii: atomic radii
scaledAtomicRadii: scaled atomic radii
bornForceFactor: (Born radii)**2*bornForce*Obcforce
bornForce1: i-unroll first force component, including dBorn_r/dr in .w
bornForce2: i-unroll second force component, including dBorn_r/dr in .w
bornForce3: i-unroll third force component, including dBorn_r/dr in .w
bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w
#endif
--------------------------------------------------------------------------------------- */
kernel void kObcLoop2Cdlj4X( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth,
float jstreamWidth, float epsfac, float4 posq[][],
float atomicRadii[][], float scaledAtomicRadii[][],
float bornForceFactor[][], float4 isigeps[][],
float4 sigma[][], float4 epsilon[][], float excl[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// atomic radii: radius - dielectric offset
float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii;
float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii;
float4 iAtomicRadii, jAtomicRadii;
// scaled atomic radii: scaleFactor*( radius - dielectric offset )
float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii;
float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii;
float4 iScaledAtomicRadii, jScaledAtomicRadii;
// (Born radii**2)*BornForce*ObcChainForce
float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor;
float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor;
//float4 iBornForceFactor;
//float4 jBornForceFactor;
// i,j coordinates
float3 i1pos, i2pos, i3pos, i4pos;
float3 j1pos, j2pos, j3pos, j4pos;
// delta coordinates
float3 d1, d2, d3, d4;
// indices
float2 iAtom;
float forceIndex;
float i1q, i2q, i3q, i4q;
float i1sig, i2sig, i3sig, i4sig;
float i1eps, i2eps, i3eps, i4eps;
float j1q, j2q, j3q, j4q;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2, r2Masked;
float3 pad;
float4 exclmask;
float2 iatom;
float2 jstrindex;
float linind;
float4 jsig, jeps;
float4 jQ,qq;
//This is forceIndex mod numberOfAtoms, the true i index
float4 de, fs, bornSum;
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float sum;
// ---------------------------------------------------------------------------------------
// constants
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 frac4 = float4( 0.2f, 0.2f, 0.2f, 0.2f );
const float I_Unroll = 4.0f;
const float4 exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
// ---------------------------------------------------------------------------------------
// set linear index
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather coordinates
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth );
// ---------------------------------------------------------------------------------------
// fetch i1 position, Born prefactor, atomic radius
i1pos = posq[ iAtom ].xyz;
i1q = posq[ iAtom ].w;
i1BornForceFactor = bornForceFactor[ iAtom ];
i1AtomicRadii = atomicRadii[ iAtom ];
i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i1sig = jstrindex.x;
i1eps = jstrindex.y;
bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i2 position, Born prefactor, atomic radius
iAtom.x += 1;
i2pos = posq[ iAtom ].xyz;
i2q = posq[ iAtom ].w;
i2BornForceFactor = bornForceFactor[ iAtom ];
i2AtomicRadii = atomicRadii[ iAtom ];
i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i2sig = jstrindex.x;
i2eps = jstrindex.y;
bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i3 position, Born prefactor, atomic radius
iAtom.x += 1;
i3pos = posq[ iAtom ].xyz;
i3q = posq[ iAtom ].w;
i3BornForceFactor = bornForceFactor[ iAtom ];
i3AtomicRadii = atomicRadii[ iAtom ];
i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i3sig = jstrindex.x;
i3eps = jstrindex.y;
bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i4 position, Born prefactor, atomic radius
iAtom.x += 1;
i4pos = posq[ iAtom ].xyz;
i4q = posq[ iAtom ].w;
i4BornForceFactor = bornForceFactor[ iAtom ];
i4AtomicRadii = atomicRadii[ iAtom ];
i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i4sig = jstrindex.x;
i4eps = jstrindex.y;
bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// create float4 for main vars
//iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor );
iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii );
iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii );
// inner loop indices setup
whichRep = floor( forceIndex / roundedUpAtoms );
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
exclind.x = iAtomLinearIndex;
exclind.y = jStart*streamWidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){
// ---------------------------------------------------------------------------------------
// gather required values
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jAtom.x + jAtom.y*streamWidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstreamWidth))/jstreamWidth);
jstrindex.x = linind - jstrindex.y*jstreamWidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
j1BornForceFactor = bornForceFactor[ jAtom ];
j1AtomicRadii = atomicRadii[ jAtom ];
j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j1pos = posq[ jAtom ].xyz;
j1q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j2BornForceFactor = bornForceFactor[ jAtom ];
j2AtomicRadii = atomicRadii[ jAtom ];
j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j2pos = posq[ jAtom ].xyz;
j2q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j3BornForceFactor = bornForceFactor[ jAtom ];
j3AtomicRadii = atomicRadii[ jAtom ];
j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j3pos = posq[ jAtom ].xyz;
j3q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j4BornForceFactor = bornForceFactor[ jAtom ];
j4AtomicRadii = atomicRadii[ jAtom ];
j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j4pos = posq[ jAtom ].xyz;
j4q = posq[ jAtom ].w;
jAtom.x += 1.0f;
//jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor );
jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii );
jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii );
jQ = float4( j1q, j2q, j3q, j4q );
// ---------------------------------------------------------------------------------------
// First set of 4 -- i == 1
/*
sig = i1sig + jsig;
eps = i1eps * jeps;
qq = i1q*jQ;
d1 = j1pos - i1pos;
d2 = j2pos - i1pos;
d3 = j3pos - i1pos;
d4 = j4pos - i1pos;
// i = 1
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce1.xyz += de.x*d1;
bornForce1.xyz += de.y*d2;
bornForce1.xyz += de.z*d3;
bornForce1.xyz += de.w*d4;
bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x ) + floor( exclmask.y ) + floor( exclmask.z ) + floor( exclmask.w );
if( sum > 0.25f ){
//if( exclind.y < 4.9f && exclind.y > 3.1f ){
//if( exclind.y < 0.9f ){
//if( exclind.y < 1.9f && exclind.y > 0.1f ){
//if( exclind.y < 2.9f && exclind.y > 1.1f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce1.x += 1.0f;
bornForce1.y += exclusions;
bornForce1.z += sum;
bornForce1.w += exclmask.w;
//bornForce1 = exclmask;
}
// ---------------------------------------------------------------------------------------
// i = 2
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce2.x += 1.0f;
bornForce2.y += exclusions;
bornForce2.z += sum;
bornForce2.w += exclind.y;
}
/*
sig = i2sig + jsig;
eps = i2eps * jeps;
qq = i2q*jQ;
d1 = j1pos - i2pos;
d2 = j2pos - i2pos;
d3 = j3pos - i2pos;
d4 = j4pos - i2pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce2.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce2.xyz += de.z*d3;
bornForce2.xyz += de.w*d4;
bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
// ---------------------------------------------------------------------------------------
// i = 3
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce3.x += 1.0f;
bornForce3.y += exclusions;
bornForce3.z += sum;
bornForce3.w += exclind.y;
}
/*
sig = i3sig + jsig;
eps = i3eps * jeps;
qq = i3q*jQ;
d1 = j1pos - i3pos;
d2 = j2pos - i3pos;
d3 = j3pos - i3pos;
d4 = j4pos - i3pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce3.xyz += de.x*d1;
bornForce3.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce3.xyz += de.w*d4;
bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
// ---------------------------------------------------------------------------------------
// i = 4
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce4.x += 1.0f;
bornForce4.y += exclusions;
bornForce4.z += sum;
bornForce4.w += exclind.y;
}
/*
sig = i4sig + jsig;
eps = i4eps * jeps;
qq = i4q*jQ;
d1 = j1pos - i4pos;
d2 = j2pos - i4pos;
d3 = j3pos - i4pos;
d4 = j4pos - i4pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce4.xyz += de.x*d1;
bornForce4.xyz += de.y*d2;
bornForce4.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
// ---------------------------------------------------------------------------------------
/*
// j = 1
d1 = j1pos - i1pos;
d2 = j1pos - i2pos;
d3 = j1pos - i3pos;
d4 = j1pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 2
d1 = j2pos - i1pos;
d2 = j2pos - i2pos;
d3 = j2pos - i3pos;
d4 = j2pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 3
d1 = j3pos - i1pos;
d2 = j3pos - i2pos;
d3 = j3pos - i3pos;
d4 = j3pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 4
d1 = j4pos - i1pos;
d2 = j4pos - i2pos;
d3 = j4pos - i3pos;
d4 = j4pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
*/
// ---------------------------------------------------------------------------------------
//exclind.x -= 3.0f;
exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
exclind.y = floor( exclind.y + 1.25f );
// increment linear index by 4
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
platforms/brook/src/gpu/kangles.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Harmonic angles kernel
//Input is a stream of triplets i, j, k
//parms is float2( theta0, kA )
//Output is three streams of forces fi, fj, fk
//Again, this is kept simple for now, can be optimized
//later as necessary
kernel void kangles_harmonic(
float xstrwidth,
float3 atoms<>,
float2 parms<>,
float4 posq[][],
out float3 fi<>,
out float3 fj<>,
out float3 fk<>
) {
float theta;
float dx, dx2, fs;
float st, dvdt, cik, sth, nrkj2, nrij2, cii, ckk, costheta, sintheta;
float rij2, rkj2;
float2 idx;
float2 ai, aj, ak;
float3 xi, xj, xk, rij, rkj;
ai.y = floor( atoms.x / xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
aj.y = floor( atoms.y / xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
ak.y = floor( atoms.z / xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
rij = posq[ ai ].xyz - posq[ aj ].xyz; //3
rkj = posq[ ak ].xyz - posq[ aj ].xyz; //3
rij2 = dot( rij, rij ); //5
rkj2 = dot( rkj, rkj ); //5
costheta = dot( rij, rkj ) / sqrt( rij2 * rkj2 ); //8
costheta = clamp( costheta, -1.0, 1.0 );
theta = acos( costheta ); //1 flop, ouch
sintheta = sqrt( 1 - costheta * costheta ); //3
dx = theta - parms.x; //1
dx2 = dx * dx; //1
/*scalar force = dv/dtheta*/
fs = -parms.y * dx; //1
st = fs / sintheta; //1
st = clamp( st, -1000000.0, 1000000.0 ); //Does this work on the gpu for st=inf?
sth = st * costheta; //1
nrkj2 = dot( rkj, rkj ); //5
nrij2 = dot( rij, rij ); //5
cik = st * rsqrt( nrkj2 * nrij2 ); //3
cii = sth / nrij2; //1
ckk = sth / nrkj2; //1
fi = -( cik * rkj - cii * rij ); //7
fk = -( cik * rij - ckk * rkj ); //7
fj = -fi - fk; //3
//Total flops: 64
}
platforms/brook/src/gpu/kcommon.h
0 → 100644
View file @
cb130f92
#ifndef __KCOMMON_H__
#define __KCOMMON_H__
void
kgetxyz
(
::
brook
::
stream
instr
,
::
brook
::
stream
outstr
);
void
kzerof3
(
::
brook
::
stream
outstr
);
void
kzerof4
(
::
brook
::
stream
outstr
);
void
kzerof4
(
::
brook
::
stream
outstr
);
void
ksetf4
(
const
float4
val
,
::
brook
::
stream
outstr
);
void
kadd3
(
::
brook
::
stream
instr
,
::
brook
::
stream
outstr
);
void
ksetStr3
(
::
brook
::
stream
instr
,
::
brook
::
stream
outstr
);
#endif // __KCOMMON_H__
platforms/brook/src/gpu/kcommonOrig.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Kernel to set the xyz components of a float4 stream
//Used for changing the coordinates without changing
//the charges in strPosQ
kernel void ksetxyz( float3 instr<>, float4 before<>, out float4 after<> ) {
after.xyz = instr;
after.w = before.w;
}
//Inverse of above
kernel void kgetxyz( float4 instr<>, out float3 outstr<> ) {
outstr = instr.xyz;
}
//Zeroes out a stream
kernel void kzerof3( out float3 outstr<> ) {
outstr = float3( 0.0, 0.0, 0.0 );
}
//Zeros out a stream
kernel void kzerof4( out float4 outstr<> ) {
outstr = float4( 0.0, 0.0, 0.0, 0.0 );
}
kernel void ksetf4( float4 val, out float4 outstr<> ) {
outstr = val;
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment