Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
a9054686
Commit
a9054686
authored
Oct 06, 2010
by
Mark Friedrichs
Browse files
Mods for direct PME
parent
01260070
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
833 additions
and
920 deletions
+833
-920
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+18
-0
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+30
-0
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+12
-0
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+31
-2
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+3
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
...cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+5
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
...a/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+5
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
...eba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+12
-12
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
...src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+46
-26
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
.../src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+217
-375
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
...ms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+58
-54
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
...rms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+175
-336
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
.../src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+50
-10
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
...a/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+126
-101
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
.../platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
+6
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
...platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+38
-0
plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
...amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+0
-1
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
View file @
a9054686
...
...
@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
log
=
NULL
;
contextImpl
=
NULL
;
gpuInitialized
=
false
;
applyCutoff
=
0
;
multipoleForceCount
=
0
;
}
AmoebaCudaData
::~
AmoebaCudaData
()
{
...
...
@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) {
return
;
}
/**
 * Advance the stored multipole force count by one.
 */
void AmoebaCudaData::incrementMultipoleForceCount() {
    multipoleForceCount += 1;
}
/**
 * Report the current multipole force count.
 *
 * @return value of multipoleForceCount
 */
int AmoebaCudaData::getMultipoleForceCount() const {
    const int currentCount = multipoleForceCount;
    return currentCount;
}
void
AmoebaCudaData
::
setApplyCutoff
(
int
inputApplyCutoff
)
{
applyCutoff
=
inputApplyCutoff
;
}
int
AmoebaCudaData
::
getApplyCutoff
(
void
)
const
{
return
applyCutoff
;
}
}
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
View file @
a9054686
...
...
@@ -139,11 +139,41 @@ public:
*/
void
setContextImpl
(
void
*
contextImpl
);
/**
* Get multipole force count
*
* @return multipole force count
*/
int
getMultipoleForceCount
(
void
)
const
;
/**
* Increment multipole force count by one
*/
void
incrementMultipoleForceCount
(
void
);
/**
* Get apply-cutoff flag
*
* @return apply-cutoff flag
*/
int
getApplyCutoff
(
)
const
;
/**
* Set apply-cutoff flag
*
* @param applyCutoff new value of the apply-cutoff flag
*/
void
setApplyCutoff
(
int
applyCutoff
);
private:
CudaPlatform
::
PlatformData
&
cudaPlatformData
;
amoebaGpuContext
amoebaGpu
;
bool
hasAmoebaBonds
,
hasAmoebaGeneralizedKirkwood
,
hasAmoebaMultipole
;
int
multipoleForceCount
;
int
applyCutoff
;
KernelImpl
*
localForceKernel
;
unsigned
int
kernelCount
;
void
*
contextImpl
;
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
a9054686
...
...
@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
static
void
computeAmoebaMultipoleForce
(
AmoebaCudaData
&
data
)
{
amoebaGpuContext
gpu
=
data
.
getAmoebaGpu
();
if
(
data
.
getMultipoleForceCount
()
==
0
){
gpuCopyInteractingWorkUnit
(
gpu
);
}
if
(
data
.
getApplyCutoff
()
&&
(
data
.
getMultipoleForceCount
()
%
100
)
==
0
){
gpuReorderAtoms
(
gpu
->
gpuContext
);
}
data
.
incrementMultipoleForceCount
();
data
.
initializeGpu
();
if
(
0
&&
data
.
getLog
()
){
...
...
@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
zsize
=
pmeGridDimension
[
2
];
}
gpuSetAmoebaPMEParameters
(
data
.
getAmoebaGpu
(),
(
float
)
alpha
,
xsize
,
ysize
,
zsize
);
data
.
setApplyCutoff
(
1
);
amoebaGpuContext
amoebaGpu
=
data
.
getAmoebaGpu
();
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
gpu
->
sim
.
nonbondedCutoffSqr
=
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
();
gpu
->
sim
.
nonbondedMethod
=
PARTICLE_MESH_EWALD
;
}
data
.
getAmoebaGpu
()
->
gpuContext
->
forces
.
push_back
(
new
ForceInfo
(
force
));
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
View file @
a9054686
...
...
@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(
void
)
fprintf
(
log
,
" sqrtPi %15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
sqrtPi
);
(
void
)
fprintf
(
log
,
" alpha Ewald %15.7e
\n
"
,
gpu
->
sim
.
alphaEwald
);
(
void
)
fprintf
(
log
,
" PME grid dimensions %6d %6d %6d
\n
"
,
gpu
->
sim
.
pmeGridSize
.
x
,
gpu
->
sim
.
pmeGridSize
.
y
,
gpu
->
sim
.
pmeGridSize
.
z
);
(
void
)
fprintf
(
log
,
"
cutoffDistance2
%15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
cutoffDistance2
);
(
void
)
fprintf
(
log
,
"
nonbondedCutoffSqr
%15.7e
\n
"
,
gpu
->
sim
.
nonbondedCutoffSqr
);
(
void
)
fprintf
(
log
,
" electric %15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
electric
);
(
void
)
fprintf
(
log
,
" box %15.7e %15.7e %15.7e
\n
"
,
gpu
->
sim
.
periodicBoxSizeX
,
gpu
->
sim
.
periodicBoxSizeY
,
gpu
->
sim
.
periodicBoxSizeZ
);
(
void
)
fprintf
(
log
,
" gkc %15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
gkc
);
...
...
@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
AMOEBA_NO_CUTOFF
,
AMOEBA_PARTICLE_MESH_EWALD
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
amoebaGpu
->
amoebaSim
.
cutoffDistance2
=
cutoffDistance
*
cutoffDistance
;
amoebaGpu
->
amoebaSim
.
sqrtPi
=
std
::
sqrt
(
3.14159265358
f
);
amoebaGpu
->
amoebaSim
.
electric
=
electricConstant
;
amoebaGpu
->
gpuContext
->
sim
.
alphaEwald
=
alphaEwald
;
...
...
@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
}
}
/**---------------------------------------------------------------------------------------

   Overwrite the entries of the CUDA context's interacting-work-unit and work-unit
   streams with the entries of the Amoeba work-unit stream, then upload both
   modified streams.

   Flagged by its own log message as temporary ("to be removed").

   @param amoebaGpu        amoebaGpuContext reference

   --------------------------------------------------------------------------------------- */

void gpuCopyInteractingWorkUnit( amoebaGpuContext amoebaGpu ){

  // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // refresh the system-side copies of all three streams before editing them
    // (presumably a device-to-host transfer -- confirm against the stream class)

    gpu->psInteractingWorkUnit->Download();
    gpu->psWorkUnit->Download();
    amoebaGpu->psWorkUnit->Download();

    // the log file is optional; passing a null stream to fprintf is undefined behavior

    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
    }

    // NOTE(review): the loop bound is the interacting-work-unit stream length; this assumes
    // gpu->psWorkUnit and amoebaGpu->psWorkUnit hold at least that many entries -- confirm

    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
        gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
        gpu->psWorkUnit->_pSysStream[0][ii]            = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
    }

    // push the modified streams back

    gpu->psInteractingWorkUnit->Upload();
    gpu->psWorkUnit->Upload();

  // ---------------------------------------------------------------------------------------
}
#undef AMOEBA_DEBUG
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
View file @
a9054686
...
...
@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation {
unsigned
int
numberOfAtoms
;
// number of atoms
unsigned
int
paddedNumberOfAtoms
;
// padded number of atoms
float
cutoffDistance2
;
// cutoff distance squared for PME
//
float cutoffDistance2; // cutoff distance squared for PME
float
sqrtPi
;
// sqrt(PI)
float
scalingDistanceCutoff
;
// scaling cutoff
float2
*
pDampingFactorAndThole
;
// Thole & damping factors
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
View file @
a9054686
...
...
@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu);
extern
"C"
void
gpuSetAmoebaBondOffsets
(
amoebaGpuContext
gpu
);
extern
"C"
void
gpuCopyInteractingWorkUnit
(
amoebaGpuContext
gpu
);
/*
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
View file @
a9054686
...
...
@@ -44,6 +44,11 @@ struct FixedFieldParticle {
float
gkField
[
3
];
#endif
#ifdef INCLUDE_FIXED_FIELD_BUFFERS
float
tempBuffer
[
3
];
float
tempBufferP
[
3
];
#endif
};
__device__
static
void
loadFixedFieldShared
(
struct
FixedFieldParticle
*
sA
,
unsigned
int
atomI
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
View file @
a9054686
...
...
@@ -24,6 +24,11 @@ struct MutualInducedParticle {
float
fieldS
[
3
];
float
fieldPolarS
[
3
];
#endif
#ifdef INCLUDE_MI_FIELD_BUFFERS
float
tempBuffer
[
3
];
float
tempBufferP
[
3
];
#endif
};
__device__
static
void
loadMutualInducedShared
(
MutualInducedParticle
*
sA
,
unsigned
int
atomI
)
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
View file @
a9054686
...
...
@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
multipole
[
8
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
2
];
multipole
[
9
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
5
];
float
*
phi
=
&
cAmoebaSim
.
pPhi
[
20
*
i
];
cAmoebaSim
.
pTorque
[
3
*
i
]
=
-
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phi
[
2
]
-
multipole
[
2
]
*
zscale
*
phi
[
3
]
cAmoebaSim
.
pTorque
[
3
*
i
]
=
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phi
[
2
]
-
multipole
[
2
]
*
zscale
*
phi
[
3
]
+
2.0
f
*
(
multipole
[
6
]
-
multipole
[
5
])
*
zscale
*
zscale
*
phi
[
9
]
+
multipole
[
8
]
*
yscale
*
yscale
*
phi
[
7
]
+
multipole
[
9
]
*
xscale
*
yscale
*
phi
[
5
]
-
multipole
[
7
]
*
yscale
*
zscale
*
phi
[
8
]
-
multipole
[
9
]
*
xscale
*
zscale
*
phi
[
6
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
-
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phi
[
3
]
-
multipole
[
3
]
*
xscale
*
phi
[
1
]
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phi
[
3
]
-
multipole
[
3
]
*
xscale
*
phi
[
1
]
+
2.0
f
*
(
multipole
[
4
]
-
multipole
[
6
])
*
zscale
*
zscale
*
phi
[
8
]
+
multipole
[
7
]
*
zscale
*
zscale
*
phi
[
9
]
+
multipole
[
8
]
*
xscale
*
zscale
*
phi
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
phi
[
4
]
-
multipole
[
9
]
*
yscale
*
yscale
*
phi
[
7
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
-
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phi
[
1
]
-
multipole
[
1
]
*
yscale
*
phi
[
2
]
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phi
[
1
]
-
multipole
[
1
]
*
yscale
*
phi
[
2
]
+
2.0
f
*
(
multipole
[
5
]
-
multipole
[
4
])
*
yscale
*
yscale
*
phi
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
phi
[
4
]
+
multipole
[
9
]
*
yscale
*
zscale
*
phi
[
8
]
-
multipole
[
7
]
*
xscale
*
yscale
*
phi
[
5
]
-
multipole
[
8
]
*
zscale
*
zscale
*
phi
[
9
]);
...
...
@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
f
.
y
*=
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
y
*
cSim
.
invPeriodicBoxSizeY
;
f
.
z
*=
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
z
*
cSim
.
invPeriodicBoxSizeZ
;
float4
force
=
cSim
.
pForce4
[
i
];
force
.
x
+
=
f
.
x
;
force
.
y
+
=
f
.
y
;
force
.
z
+
=
f
.
z
;
force
.
x
-
=
f
.
x
;
force
.
y
-
=
f
.
y
;
force
.
z
-
=
f
.
z
;
cSim
.
pForce4
[
i
]
=
force
;
...
...
@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
multipole
[
8
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
2
];
multipole
[
9
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
5
];
float
*
phidp
=
&
cAmoebaSim
.
pPhidp
[
20
*
i
];
cAmoebaSim
.
pTorque
[
3
*
i
]
=
-
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phidp
[
2
]
-
multipole
[
2
]
*
zscale
*
phidp
[
3
]
cAmoebaSim
.
pTorque
[
3
*
i
]
=
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phidp
[
2
]
-
multipole
[
2
]
*
zscale
*
phidp
[
3
]
+
2.0
f
*
(
multipole
[
6
]
-
multipole
[
5
])
*
zscale
*
zscale
*
phidp
[
9
]
+
multipole
[
8
]
*
yscale
*
yscale
*
phidp
[
7
]
+
multipole
[
9
]
*
xscale
*
yscale
*
phidp
[
5
]
-
multipole
[
7
]
*
yscale
*
zscale
*
phidp
[
8
]
-
multipole
[
9
]
*
xscale
*
zscale
*
phidp
[
6
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
-
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phidp
[
3
]
-
multipole
[
3
]
*
xscale
*
phidp
[
1
]
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phidp
[
3
]
-
multipole
[
3
]
*
xscale
*
phidp
[
1
]
+
2.0
f
*
(
multipole
[
4
]
-
multipole
[
6
])
*
zscale
*
zscale
*
phidp
[
8
]
+
multipole
[
7
]
*
zscale
*
zscale
*
phidp
[
9
]
+
multipole
[
8
]
*
xscale
*
zscale
*
phidp
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
phidp
[
4
]
-
multipole
[
9
]
*
yscale
*
yscale
*
phidp
[
7
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
-
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phidp
[
1
]
-
multipole
[
1
]
*
yscale
*
phidp
[
2
]
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phidp
[
1
]
-
multipole
[
1
]
*
yscale
*
phidp
[
2
]
+
2.0
f
*
(
multipole
[
5
]
-
multipole
[
4
])
*
yscale
*
yscale
*
phidp
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
phidp
[
4
]
+
multipole
[
9
]
*
yscale
*
zscale
*
phidp
[
8
]
-
multipole
[
7
]
*
xscale
*
yscale
*
phidp
[
5
]
-
multipole
[
8
]
*
zscale
*
zscale
*
phidp
[
9
]);
...
...
@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
f
.
y
*=
0.5
f
*
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
y
*
cSim
.
invPeriodicBoxSizeY
;
f
.
z
*=
0.5
f
*
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
z
*
cSim
.
invPeriodicBoxSizeZ
;
float4
force
=
cSim
.
pForce4
[
i
];
force
.
x
+
=
f
.
x
;
force
.
y
+
=
f
.
y
;
force
.
z
+
=
f
.
z
;
force
.
x
-
=
f
.
x
;
force
.
y
-
=
f
.
y
;
force
.
z
-
=
f
.
z
;
cSim
.
pForce4
[
i
]
=
force
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
cAmoebaSim
.
electric
*
energy
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
View file @
a9054686
...
...
@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle {
float
torque
[
3
];
float
padding
;
float
tempForce
[
3
];
float
tempTorque
[
3
];
};
/**
 * Add atomJ's temporary force and torque buffers into atomI's, component by component.
 *
 * @param atomI  particle whose tempForce/tempTorque entries are incremented
 * @param atomJ  particle whose tempForce/tempTorque entries are added
 */
__device__ void sumTempBuffer( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ ){

    // forces first, then torques -- same order as the unrolled form

    for( int component = 0; component < 3; component++ ){
        atomI.tempForce[component] += atomJ.tempForce[component];
    }

    for( int component = 0; component < 3; component++ ){
        atomI.tempTorque[component] += atomJ.tempTorque[component];
    }
}
/*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
...
...
@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
float
uiy
=
0.5
f
*
(
atomI
.
inducedDipole
[
1
]
+
atomI
.
inducedDipoleP
[
1
]);
float
uiz
=
0.5
f
*
(
atomI
.
inducedDipole
[
2
]
+
atomI
.
inducedDipoleP
[
2
]);
atomI
.
torque
[
0
]
-
=
term
*
(
atomI
.
labFrameDipole
[
1
]
*
uiz
-
atomI
.
labFrameDipole
[
2
]
*
uiy
);
atomI
.
torque
[
1
]
-
=
term
*
(
atomI
.
labFrameDipole
[
2
]
*
uix
-
atomI
.
labFrameDipole
[
0
]
*
uiz
);
atomI
.
torque
[
2
]
-
=
term
*
(
atomI
.
labFrameDipole
[
0
]
*
uiy
-
atomI
.
labFrameDipole
[
1
]
*
uix
);
atomI
.
torque
[
0
]
+
=
term
*
(
atomI
.
labFrameDipole
[
1
]
*
uiz
-
atomI
.
labFrameDipole
[
2
]
*
uiy
);
atomI
.
torque
[
1
]
+
=
term
*
(
atomI
.
labFrameDipole
[
2
]
*
uix
-
atomI
.
labFrameDipole
[
0
]
*
uiz
);
atomI
.
torque
[
2
]
+
=
term
*
(
atomI
.
labFrameDipole
[
0
]
*
uiy
-
atomI
.
labFrameDipole
[
1
]
*
uix
);
}
__device__
void
calculatePmeDirectElectrostaticPairIxn_kernel
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
,
...
...
@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
gfr
[
8
],
gfri
[
7
];
float
gti
[
7
],
gtri
[
7
];
float
conversionFactor
=
(
cAmoebaSim
.
electric
/
cAmoebaSim
.
dielec
);
float
conversionFactor
=
(
-
cAmoebaSim
.
electric
/
cAmoebaSim
.
dielec
);
// set the permanent multipole and induced dipole values;
...
...
@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
zr
-=
floor
(
zr
*
cSim
.
invPeriodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
float
r2
=
xr
*
xr
+
yr
*
yr
+
zr
*
zr
;
if
(
r2
<=
c
AmoebaSim
.
cutoffDistance2
){
if
(
r2
<=
c
Sim
.
nonbondedCutoffSqr
){
float
r
=
sqrt
(
r2
);
float
ck
=
atomJ
.
q
;
...
...
@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
e
=
e
-
(
1.0
f
-
scalingFactors
[
MScaleIndex
])
*
erl
;
ei
=
ei
-
erli
;
*
energy
=
conversionFactor
*
(
e
+
ei
);
*
energy
=
-
conversionFactor
*
(
e
+
ei
);
// increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell;
...
...
@@ -1161,15 +1173,27 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
)
),
maxThreads
);
}
kClearFields_3
(
amoebaGpu
,
2
);
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u
\n
"
,
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u
\n
"
,
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
),
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
nonbond_threads_per_block
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
]
,
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
else
{
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u
\n
"
,
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
),
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
// gpu->sim.pInteractingWorkUnit,
// amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished PmeDirectElectrostatic kernel execution
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
int
maxPrint
=
1400
;
int
maxPrint
=
5
;
float
conversion
=
1.0
f
/
41.84
f
;
float
forceSum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
...
...
@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
1
){
if
(
0
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"DebugElec
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
View file @
a9054686
...
...
@@ -65,6 +65,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
unsigned
int
x
;
unsigned
int
y
;
bool
bExclusionFlag
;
int
dScaleMask
;
int2
pScaleMask
;
int2
mScaleMask
;
// Extract cell coordinates
...
...
@@ -99,49 +102,16 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
loadPmeDirectElectrostaticShared
(
&
(
sA
[
threadIdx
.
x
]),
atomI
);
if
(
!
bExclusionFlag
)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
force
[
3
];
float
torque
[
2
][
3
];
float
energy
;
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
j
],
scalingFactors
,
force
,
torque
,
&
energy
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
j
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
localParticle
.
force
[
0
]
+=
mask
?
force
[
0
]
:
0
.
0
f
;
localParticle
.
force
[
1
]
+=
mask
?
force
[
1
]
:
0
.
0
f
;
localParticle
.
force
[
2
]
+=
mask
?
force
[
2
]
:
0
.
0
f
;
localParticle
.
torque
[
0
]
+=
mask
?
torque
[
0
][
0
]
:
0
.
0
f
;
localParticle
.
torque
[
1
]
+=
mask
?
torque
[
0
][
1
]
:
0
.
0
f
;
localParticle
.
torque
[
2
]
+=
mask
?
torque
[
0
][
2
]
:
0
.
0
f
;
totalEnergy
+=
mask
?
0
.
5
*
energy
:
0
.
0
f
;
}
}
else
// bExclusion
if
(
bExclusionFlag
)
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
scalingFactors
[
DScaleIndex
]
=
scalingFactors
[
PScaleIndex
]
=
scalingFactors
[
MScaleIndex
]
=
1
.
0
f
;
}
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
...
...
@@ -153,9 +123,12 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
// set scale factors
if
(
bExclusionFlag
)
{
getMaskedDScaleFactor
(
j
,
dScaleMask
,
scalingFactors
+
DScaleIndex
);
getMaskedPScaleFactor
(
j
,
pScaleMask
,
scalingFactors
+
PScaleIndex
);
getMaskedMScaleFactor
(
j
,
mScaleMask
,
scalingFactors
+
MScaleIndex
);
}
// force
...
...
@@ -229,8 +202,7 @@ if( atomI == targetAtom ){
}
#endif
}
}
}
// end of j-loop
// include self energy and self torque
...
...
@@ -281,11 +253,15 @@ if( atomI == targetAtom ){
outputTorque
[
offset
+
2
]
=
localParticle
.
torque
[
2
];
#endif
}
else
{
if
(
lasty
!=
y
)
{
}
else
{
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
{
if
(
lasty
!=
y
)
{
// load shared data
loadPmeDirectElectrostaticShared
(
&
(
sA
[
threadIdx
.
x
]),
(
y
+
tgx
)
);
...
...
@@ -300,133 +276,43 @@ if( atomI == targetAtom ){
sA
[
threadIdx
.
x
].
torque
[
1
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
torque
[
2
]
=
0
.
0
f
;
if
(
!
bExclusionFlag
)
{
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
force
[
3
];
float
torque
[
2
][
3
];
unsigned
int
atomJ
=
y
+
tj
;
float
energy
;
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
tj
],
scalingFactors
,
force
,
torque
,
&
energy
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
(
atomJ
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
// add force and torque to atom I due atom J
localParticle
.
force
[
0
]
+=
mask
?
force
[
0
]
:
0
.
0
f
;
localParticle
.
force
[
1
]
+=
mask
?
force
[
1
]
:
0
.
0
f
;
localParticle
.
force
[
2
]
+=
mask
?
force
[
2
]
:
0
.
0
f
;
localParticle
.
torque
[
0
]
+=
mask
?
torque
[
0
][
0
]
:
0
.
0
f
;
localParticle
.
torque
[
1
]
+=
mask
?
torque
[
0
][
1
]
:
0
.
0
f
;
localParticle
.
torque
[
2
]
+=
mask
?
torque
[
0
][
2
]
:
0
.
0
f
;
totalEnergy
+=
mask
?
energy
:
0
.
0
f
;
// add force and torque to atom J due atom I
psA
[
tj
].
force
[
0
]
-=
mask
?
force
[
0
]
:
0
.
0
f
;
psA
[
tj
].
force
[
1
]
-=
mask
?
force
[
1
]
:
0
.
0
f
;
psA
[
tj
].
force
[
2
]
-=
mask
?
force
[
2
]
:
0
.
0
f
;
psA
[
tj
].
torque
[
0
]
+=
mask
?
torque
[
1
][
0
]
:
0
.
0
f
;
psA
[
tj
].
torque
[
1
]
+=
mask
?
torque
[
1
][
1
]
:
0
.
0
f
;
psA
[
tj
].
torque
[
2
]
+=
mask
?
torque
[
1
][
2
]
:
0
.
0
f
;
#ifdef AMOEBA_DEBUG
/*
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if
(
atomI
==
targetAtom
||
atomJ
==
targetAtom
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
atomJ
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
float
forceSign
=
(
atomI
==
targetAtom
)
?
1
.
0
f
:
-
1
.
0
f
;
float
blockId
=
2
.
0
f
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
atomJ
;
debugArray
[
index
].
z
=
(
float
)
y
;
debugArray
[
index
].
w
=
blockId
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceSign
*
force
[
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceSign
*
force
[
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceSign
*
force
[
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
blockId
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexI
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexI
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexI
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
mask
?
energy
:
0
.
0
f
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexJ
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexJ
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexJ
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
(
float
)
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullIndex
].
w
;
}
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
else
// bExclusion
if
(
bExclusionFlag
)
{
// Read fixed atom data into registers and GRF
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
scalingFactors
[
DScaleIndex
]
=
scalingFactors
[
PScaleIndex
]
=
scalingFactors
[
MScaleIndex
]
=
1
.
0
f
;
}
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
unsigned
int
atomJ
=
y
+
jIdx
;
float
force
[
3
];
float
torque
[
2
][
3
];
unsigned
int
atomJ
=
y
+
tj
;
// set scale factors
getMaskedDScaleFactor
(
tj
,
dScaleMask
,
scalingFactors
+
DScaleIndex
);
getMaskedPScaleFactor
(
tj
,
pScaleMask
,
scalingFactors
+
PScaleIndex
);
getMaskedMScaleFactor
(
tj
,
mScaleMask
,
scalingFactors
+
MScaleIndex
);
if
(
bExclusionFlag
)
{
getMaskedDScaleFactor
(
jIdx
,
dScaleMask
,
scalingFactors
+
DScaleIndex
);
getMaskedPScaleFactor
(
jIdx
,
pScaleMask
,
scalingFactors
+
PScaleIndex
);
getMaskedMScaleFactor
(
jIdx
,
mScaleMask
,
scalingFactors
+
MScaleIndex
);
}
// force
float
energy
;
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
t
j
],
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
j
Idx
],
scalingFactors
,
force
,
torque
,
&
energy
#ifdef AMOEBA_DEBUG
,
pullBack
,
pullBack
#endif
);
...
...
@@ -448,101 +334,58 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// add force and torque to atom J due atom I
psA
[
tj
].
force
[
0
]
-=
mask
?
force
[
0
]
:
0
.
0
f
;
psA
[
tj
].
force
[
1
]
-=
mask
?
force
[
1
]
:
0
.
0
f
;
psA
[
tj
].
force
[
2
]
-=
mask
?
force
[
2
]
:
0
.
0
f
;
if
(
flags
==
0xFFFFFFFF
){
psA
[
tj
].
t
or
qu
e
[
0
]
+
=
mask
?
t
or
que
[
1
][
0
]
:
0
.
0
f
;
psA
[
tj
].
t
or
qu
e
[
1
]
+
=
mask
?
t
or
qu
e
[
1
]
[
1
]
:
0
.
0
f
;
psA
[
tj
].
t
or
qu
e
[
2
]
+
=
mask
?
t
or
que
[
1
][
2
]
:
0
.
0
f
;
psA
[
jIdx
].
f
or
c
e
[
0
]
-
=
mask
?
f
or
ce
[
0
]
:
0
.
0
f
;
psA
[
jIdx
].
f
or
c
e
[
1
]
-
=
mask
?
f
or
c
e
[
1
]
:
0
.
0
f
;
psA
[
jIdx
].
f
or
c
e
[
2
]
-
=
mask
?
f
or
ce
[
2
]
:
0
.
0
f
;
psA
[
jIdx
].
torque
[
0
]
+=
mask
?
torque
[
1
][
0
]
:
0
.
0
f
;
psA
[
jIdx
].
torque
[
1
]
+=
mask
?
torque
[
1
][
1
]
:
0
.
0
f
;
psA
[
jIdx
].
torque
[
2
]
+=
mask
?
torque
[
1
][
2
]
:
0
.
0
f
;
#ifdef AMOEBA_DEBUG
/*
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if
(
atomI
==
targetAtom
||
atomJ
==
targetAtom
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
atomJ
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
float
forceSign
=
(
atomI
==
targetAtom
)
?
1
.
0
f
:
-
1
.
0
f
;
float
blockId
=
3
.
0
f
;
}
else
{
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
atomJ
;
debugArray
[
index
].
z
=
(
float
)
y
;
debugArray
[
index
].
w
=
blockId
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceSign
*
force
[
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceSign
*
force
[
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceSign
*
force
[
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
blockId
;
psA
[
threadIdx
.
x
].
tempForce
[
0
]
=
mask
?
0
.
0
f
:
force
[
0
];
psA
[
threadIdx
.
x
].
tempForce
[
1
]
=
mask
?
0
.
0
f
:
force
[
1
];
psA
[
threadIdx
.
x
].
tempForce
[
2
]
=
mask
?
0
.
0
f
:
force
[
2
];
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexI
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexI
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexI
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
energy
;
psA
[
threadIdx
.
x
].
tempTorque
[
0
]
=
mask
?
0
.
0
f
:
torque
[
1
][
0
];
psA
[
threadIdx
.
x
].
tempTorque
[
1
]
=
mask
?
0
.
0
f
:
torque
[
1
][
1
];
psA
[
threadIdx
.
x
].
tempTorque
[
2
]
=
mask
?
0
.
0
f
:
torque
[
1
][
2
];
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexJ
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexJ
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexJ
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
(
float
)
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullIndex
].
w
;
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
1
]
);
}
if
(
tgx
%
4
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
2
]
);
}
if
(
tgx
%
8
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
4
]
);
}
if
(
tgx
%
16
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
8
]
);
}
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[DScaleIndex];
debugArray[index].y = dScaleVal;
debugArray[index].z = scalingFactors[PScaleIndex];
debugArray[index].w = pScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[MScaleIndex];
debugArray[index].y = mScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomI];
debugArray[index].y = labFrameDipole[3*atomI+1];
debugArray[index].z = labFrameDipole[3*atomI+2];
debugArray[index].w = 25.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomJ];
debugArray[index].y = labFrameDipole[3*atomJ+1];
debugArray[index].z = labFrameDipole[3*atomJ+2];
debugArray[index].w = 26.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = jDipole[0];
debugArray[index].y = jDipole[1];
debugArray[index].z = jDipole[2];
debugArray[index].w = 27.0f;
#endif
}
#endif
if
(
tgx
==
0
)
{
psA
[
jIdx
].
force
[
0
]
-=
psA
[
threadIdx
.
x
].
tempForce
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempForce
[
0
];
psA
[
jIdx
].
force
[
1
]
-=
psA
[
threadIdx
.
x
].
tempForce
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempForce
[
1
];
psA
[
jIdx
].
force
[
2
]
-=
psA
[
threadIdx
.
x
].
tempForce
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempForce
[
2
];
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
psA
[
jIdx
].
torque
[
0
]
+=
psA
[
threadIdx
.
x
].
tempTorque
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempTorque
[
0
];
psA
[
jIdx
].
torque
[
1
]
+=
psA
[
threadIdx
.
x
].
tempTorque
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempTorque
[
1
];
psA
[
jIdx
].
torque
[
2
]
+=
psA
[
threadIdx
.
x
].
tempTorque
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempTorque
[
2
];
}
}
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
// end of j-loop
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float
of
;
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cAmoebaSim
.
paddedNumberOfAtoms
);
...
...
@@ -596,7 +439,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
of
+=
sA
[
threadIdx
.
x
].
torque
[
2
];
outputTorque
[
offset
+
2
]
=
of
;
#else
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cAmoebaSim
.
paddedNumberOfAtoms
);
outputForce
[
offset
]
=
localParticle
.
force
[
0
];
...
...
@@ -617,13 +460,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
outputTorque
[
offset
+
1
]
=
sA
[
threadIdx
.
x
].
torque
[
1
];
outputTorque
[
offset
+
2
]
=
sA
[
threadIdx
.
x
].
torque
[
2
];
#endif
#endif
lasty
=
y
;
}
}
// end of pInteractionFlag block
}
pos
++
;
}
//printf( "Hello thread: %d %d %d %d\n", blockIdx.x * blockDim.x + threadIdx.x, blockIdx.x, blockDim.x, threadIdx.x );
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
totalEnergy
;
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
View file @
a9054686
...
...
@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
...
...
@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
// Reduce field
const
float
term
=
(
4.0
f
/
3.0
f
)
*
(
cSim
.
alphaEwald
*
cSim
.
alphaEwald
*
cSim
.
alphaEwald
)
/
cAmoebaSim
.
sqrtPi
;
//const float term = 0.0f;
while
(
pos
<
fieldComponents
)
{
...
...
@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
#undef GK
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
#include "kCalculateAmoebaCudaFixedFieldParticle.h"
#undef INCLUDE_FIXED_FIELD_BUFFERS
// Accumulate atomJ's temporary field buffers into atomI's.
// Each of the three components of atomJ.tempBuffer and atomJ.tempBufferP is
// added onto the matching component of atomI; atomJ itself is not written.
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){

    // tempBuffer components first, then tempBufferP, in the same order as
    // the original hand-unrolled statements
    for( int component = 0; component < 3; component++ ){
        atomI.tempBuffer[component] += atomJ.tempBuffer[component];
    }

    for( int component = 0; component < 3; component++ ){
        atomI.tempBufferP[component] += atomJ.tempBufferP[component];
    }
}
__device__
void
calculateFixedFieldRealSpacePairIxn_kernel
(
FixedFieldParticle
&
atomI
,
FixedFieldParticle
&
atomJ
,
float
dscale
,
float
pscale
,
float
fields
[
4
][
3
]
#ifdef AMOEBA_DEBUG
...
...
@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
yr
-=
floor
(
yr
*
cSim
.
invPeriodicBoxSizeY
+
0.5
f
)
*
cSim
.
periodicBoxSizeY
;
zr
-=
floor
(
zr
*
cSim
.
invPeriodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
float
r2
=
xr
*
xr
+
yr
*
yr
+
zr
*
zr
;
float
r2
=
xr
*
xr
+
yr
*
yr
+
zr
*
zr
;
float
r
=
sqrtf
(
r2
);
// calculate the error function damping terms
...
...
@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// increment the field at each site due to this interaction
if
(
r2
<=
c
AmoebaSim
.
cutoffDistance2
){
if
(
r2
<=
c
Sim
.
nonbondedCutoffSqr
){
fields
[
0
][
0
]
=
fim
[
0
]
-
fid
[
0
];
fields
[
0
][
1
]
=
fim
[
1
]
-
fid
[
1
];
...
...
@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
fields
[
2
][
2
]
=
0.0
f
;
fields
[
3
][
2
]
=
0.0
f
;
}
#ifdef AMOEBA_DEBUG
pullBack
[
0
].
x
=
xr
;
pullBack
[
0
].
y
=
yr
;
...
...
@@ -399,6 +411,7 @@ static int isNanOrInfinity( double number ){
static
void
cudaComputeAmoebaPmeDirectFixedEField
(
amoebaGpuContext
amoebaGpu
)
{
static
unsigned
int
threadsPerBlock
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
...
...
@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
// print intermediate results for the targetAtom
unsigned
int
targetAtom
=
0
;
int
maxPrint
=
3002
;
amoebaGpu
->
psE_Field
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Recip EFields In
\n
"
);
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// E_Field
int
isNan
=
isNanOrInfinity
(
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
]
);
isNan
+=
isNanOrInfinity
(
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
1
]
);
isNan
+=
isNanOrInfinity
(
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E[%16.9e %16.9e %16.9e] %s
\n
"
,
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
2
],
(
isNan
?
"XXX"
:
""
)
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Recip EFields End
\n
"
);
unsigned
int
targetAtom
=
354
;
#endif
kClearFields_3
(
amoebaGpu
,
2
);
// on first pass, set threads/block
if
(
threadsPerBlock
==
0
){
unsigned
int
maxThreads
;
if
(
gpu
->
sm_version
>=
SM_20
)
maxThreads
=
384
;
else
if
(
gpu
->
sm_version
>=
SM_12
)
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondT
hreadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
amoebaGpu
->
nonbondT
hreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
]
,
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
t
hreadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
t
hreadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
#endif
}
else
{
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondThreadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
//amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR
(
"kCalculateAmoebaPmeDirectFixedE_Field_kernel"
);
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReducePmeDirectE_Fields
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u
\n
"
,
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d
\n
"
,
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondT
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
amoebaGpu
->
nonbondBlocks
,
t
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
bOutputBufferPerWarp
);
(
void
)
fflush
(
amoebaGpu
->
log
);
...
...
@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
*/
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E-field (includes self term)"
);
int
maxPrint
=
3002
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
...
...
@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
debugArray
->
Download
();
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
amoebaGpu
->
gpuContext
->
psPosq4
->
Download
();
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
if
(
fabs
(
debugArray
->
_pSysStream
[
0
][
jj
+
paddedNumberOfAtoms
].
x
)
>
0.0
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d PmeFixedEField
\n
"
,
jj
);
for
(
int
kk
=
0
;
kk
<
10
;
kk
++
){
for
(
int
kk
=
0
;
kk
<
6
;
kk
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e %16.9e]
\n
"
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
x
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
y
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
z
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
w
);
debugIndex
+=
paddedNumberOfAtoms
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p
\n
"
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
x
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
y
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
z
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
x
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
x
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
y
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
y
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
z
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
z
,
(
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
x
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
x
)
/
5.50
f
,
(
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
y
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
y
)
/
5.50
f
,
(
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
z
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
z
)
/
5.50
f
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
}
...
...
@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaEField"
,
fileId
,
outputVector
);
}
delete
debugArray
;
}
#endif
if
(
0
){
if
(
1
){
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
0
);
VectorOfDoubleVectors
outputVector
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
View file @
a9054686
...
...
@@ -44,10 +44,8 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
){
#ifdef AMOEBA_DEBUG
int
p
ullIndex
Max
=
1
2
;
int
maxP
ullIndex
=
1
;
float4
pullBack
[
12
];
float
dScaleVal
;
float
pScaleVal
;
#endif
extern
__shared__
FixedFieldParticle
sA
[];
...
...
@@ -65,6 +63,10 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned
int
x
;
unsigned
int
y
;
bool
bExclusionFlag
;
float
dScaleValue
;
float
pScaleValue
;
int
dScaleMask
;
int2
pScaleMask
;
// extract cell coordinates
...
...
@@ -97,75 +99,31 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
loadFixedFieldShared
(
&
(
sA
[
threadIdx
.
x
]),
atomI
);
if
(
!
bExclusionFlag
)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
ijField
[
4
][
3
];
// load coords, charge, ...
#ifdef AMOEBA_DEBUG
dScaleVal
=
1
.
0
f
;
pScaleVal
=
1
.
0
f
;
#endif
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
j
],
1
.
0
f
,
1
.
0
f
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
match
=
(
atomI
==
(
y
+
j
))
?
1
:
0
;
match
=
1
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
0
];
fieldSum
[
1
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
1
];
fieldSum
[
2
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
2
];
fieldPolarSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
2
][
0
];
fieldPolarSum
[
1
]
+=
match
?
0
.
0
f
:
ijField
[
2
][
1
];
fieldPolarSum
[
2
]
+=
match
?
0
.
0
f
:
ijField
[
2
][
2
];
}
}
else
// bExclusion
{
if
(
bExclusionFlag
){
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
dScaleValue
=
pScaleValue
=
1
.
0
f
;
}
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
// load coords, charge, ...
float
ijField
[
4
][
3
];
float
dScaleValue
;
float
pScaleValue
;
if
(
bExclusionFlag
){
getMaskedDScaleFactor
(
j
,
dScaleMask
,
&
dScaleValue
);
getMaskedPScaleFactor
(
j
,
pScaleMask
,
&
pScaleValue
);
}
#ifdef AMOEBA_DEBUG
dScaleVal
=
dScaleValue
;
pScaleVal
=
pScaleValue
;
#endif
float
ijField
[
4
][
3
];
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
j
],
dScaleValue
,
pScaleValue
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
...
...
@@ -193,13 +151,6 @@ if( atomI == targetAtom ){
debugArray
[
index
].
z
=
dScaleValue
;
debugArray
[
index
].
w
=
pScaleValue
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
unsigned
int
off
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cAmoebaSim
.
paddedNumberOfAtoms
);
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
tgx
;
debugArray
[
index
].
z
=
-
2
;
debugArray
[
index
].
w
=
(
float
)
off
;
float
flag
=
7
.
0
f
;
for
(
int
ii
=
0
;
ii
<
4
;
ii
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
...
...
@@ -208,8 +159,7 @@ if( atomI == targetAtom ){
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
ijField
[
indices
[
ii
]][
2
];
debugArray
[
index
].
w
=
flag
;
}
for
(
int
pullIndex
=
0
;
pullIndex
<
pullIndexMax
;
pullIndex
++
){
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
...
...
@@ -218,28 +168,9 @@ if( atomI == targetAtom ){
}
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
}
}
// Write results
...
...
@@ -253,12 +184,14 @@ if( atomI == targetAtom ){
load3dArray
(
offset
,
fieldPolarSum
,
outputEFieldPolar
);
#endif
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
}
else
{
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
// flags = 0xFFFFFFFF;
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
{
if
(
lasty
!=
y
)
{
// load coordinates, charge, ...
...
...
@@ -270,26 +203,32 @@ if( atomI == targetAtom ){
zeroFixedFieldParticleSharedField
(
&
(
sA
[
threadIdx
.
x
])
);
if
(
!
bExclusionFlag
)
{
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
(
bExclusionFlag
)
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
dScaleValue
=
pScaleValue
=
1
.
0
f
;
}
float
ijField
[
4
][
3
];
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
){
// load coords, charge, ...
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
if
(
bExclusionFlag
){
getMaskedDScaleFactor
(
jIdx
,
dScaleMask
,
&
dScaleValue
);
getMaskedPScaleFactor
(
jIdx
,
pScaleMask
,
&
pScaleValue
);
}
#ifdef AMOEBA_DEBUG
dScaleVal
=
1
.
0
f
;
pScaleVal
=
1
.
0
f
;
#endif
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
tj
],
1
.
0
f
,
1
.
0
f
,
ijField
float
ijField
[
4
][
3
];
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
jIdx
],
dScaleValue
,
pScaleValue
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
outOfBounds
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
t
j
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
1
:
0
;
unsigned
int
outOfBounds
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
j
Idx
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
1
:
0
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -301,205 +240,104 @@ pScaleVal = 1.0f;
fieldPolarSum
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
1
];
fieldPolarSum
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
2
];
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA
[
tj
].
eField
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
tj
].
eField
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
tj
].
eField
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
psA
[
tj
].
eFieldP
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
tj
].
eFieldP
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
#ifdef AMOEBA_DEBUG
if
(
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
2
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
2
:
0
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
unsigned
int
pullBackIndex
=
0
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
pullBackIndex
++
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
if
(
flags
==
0xFFFFFFFF
){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
float
flag
=
8
.
0
f
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
psA
[
jIdx
].
eField
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
jIdx
].
eField
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
jIdx
].
eField
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
psA
[
jIdx
].
eFieldP
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
jIdx
].
eFieldP
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
jIdx
].
eFieldP
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
#if 0
}
else
{
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleValue + 10.0f;
#endif
}
#endif
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
1
]
);
}
if
(
tgx
%
4
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
2
]
);
}
if
(
tgx
%
8
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
4
]
);
}
if
(
tgx
%
16
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
8
]
);
}
else
// bExclusion
{
// Read fixed atom data into registers and GRF
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
if
(
tgx
==
0
)
{
// load coords, charge, ...
float
ijField
[
4
][
3
];
float
dScaleValue
;
float
pScaleValue
;
getMaskedDScaleFactor
(
tj
,
dScaleMask
,
&
dScaleValue
);
getMaskedPScaleFactor
(
tj
,
pScaleMask
,
&
pScaleValue
);
#ifdef AMOEBA_DEBUG
dScaleVal
=
dScaleValue
;
pScaleVal
=
pScaleValue
;
#endif
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
tj
],
dScaleValue
,
pScaleValue
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
outOfBounds
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
tj
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
1
:
0
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
0
][
0
];
fieldSum
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
0
][
1
];
fieldSum
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
0
][
2
];
fieldPolarSum
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
0
];
fieldPolarSum
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
1
];
fieldPolarSum
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
2
];
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA
[
tj
].
eField
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
tj
].
eField
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
tj
].
eField
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
psA
[
tj
].
eFieldP
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
tj
].
eFieldP
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
psA
[
jIdx
].
eField
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
0
];
psA
[
jIdx
].
eField
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
1
];
psA
[
jIdx
].
eField
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
2
];
psA
[
jIdx
].
eFieldP
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
0
];
psA
[
jIdx
].
eFieldP
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
1
];
psA
[
jIdx
].
eFieldP
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
2
];
}
}
#ifdef AMOEBA_DEBUG
if
(
(
atomI
==
targetAtom
||
(
y
+
t
j
)
==
targetAtom
)
){
if
(
(
atomI
==
targetAtom
||
(
y
+
j
Idx
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
t
j
)
:
atomI
;
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
j
Idx
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
2
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
2
:
0
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
unsigned
int
pullBackIndex
=
0
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
pullBackIndex
++
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
debugArray
[
index
].
y
=
(
float
)
(
y
+
jIdx
);
debugArray
[
index
].
z
=
dScaleValue
;
debugArray
[
index
].
w
=
pScaleValue
;
float
flag
=
9
.
0
f
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
+
1
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
+
1
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
+
1
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
+
1
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullIndex
].
w
;
}
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
// Write results
}
// j-loop block
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cAmoebaSim
.
paddedNumberOfAtoms
);
...
...
@@ -520,8 +358,9 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
eFieldP
,
outputEFieldPolar
);
#endif
}
// end of pInteractionFlag block
lasty
=
y
;
}
}
// x == y block
pos
++
;
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
View file @
a9054686
...
...
@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
// Accumulate atomJ's temporary buffers into atomI's.
// Adds components 0..2 of atomJ.tempBuffer to atomI.tempBuffer, then
// components 0..2 of atomJ.tempBufferP to atomI.tempBufferP.
// atomJ is only read; atomI is updated in place.
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){

    // tempBuffer first, then tempBufferP, in ascending component order
    for( int component = 0; component < 3; component++ ){
        atomI.tempBuffer[component] += atomJ.tempBuffer[component];
    }

    for( int component = 0; component < 3; component++ ){
        atomI.tempBufferP[component] += atomJ.tempBufferP[component];
    }
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
...
...
@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// increment the field at each site due to this interaction
if
(
r2
<=
c
AmoebaSim
.
cutoffDistance2
){
if
(
r2
<=
c
Sim
.
nonbondedCutoffSqr
){
fields
[
0
][
0
]
=
fimd
[
0
]
-
fid
[
0
];
fields
[
1
][
0
]
=
fkmd
[
0
]
-
fkd
[
0
];
...
...
@@ -370,6 +384,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
CUDAStream
<
float
>*
outputArray
,
CUDAStream
<
float
>*
outputPolarArray
)
{
static
unsigned
int
threadsPerBlock
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
...
...
@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kClearFields_3
(
amoebaGpu
,
2
);
// on first pass, set threads/block
if
(
threadsPerBlock
==
0
){
unsigned
int
maxThreads
;
if
(
gpu
->
sm_version
>=
SM_20
)
maxThreads
=
384
;
else
if
(
gpu
->
sm_version
>=
SM_12
)
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondThreadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
//gpu->sim.pInteractingWorkUnit,
//amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"N2 no warp
\n
"
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u
\n
"
,
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondT
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
amoebaGpu
->
nonbondT
hreadsPerBlock
,
amoebaGpu
->
nonbondBlocks
,
t
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
t
hreadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
kCalculateAmoebaPmeMutualInducedFieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondThreadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
kCalculateAmoebaPmeMutualInducedFieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
if
(
1
){
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
iteration
);
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psPosq4
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeMI"
,
fileId
,
outputVector
);
}
}
#endif
iteration
++
;
...
...
@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu
->
mutualInducedDone
=
done
;
amoebaGpu
->
mutualInducedConverged
=
(
!
done
||
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
)
?
0
:
1
;
if
(
0
){
if
(
1
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
View file @
a9054686
...
...
@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
);
debugArray
[
index
].
z
=
c
AmoebaSim
.
cutoffDistance2
;
debugArray
[
index
].
z
=
c
Sim
.
nonbondedCutoffSqr
;
debugArray
[
index
].
w
=
6
.
0
f
;
...
...
@@ -209,10 +209,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#endif
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
}
else
{
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
{
if
(
lasty
!=
y
)
{
unsigned
int
atomJ
=
y
+
tgx
;
...
...
@@ -229,17 +232,18 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
float
ijField
[
4
][
3
];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
t
j
],
uscale
,
ijField
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
j
Idx
],
uscale
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
t
j
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
j
Idx
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
// add to field at atomI the field due to atomJ's dipole
...
...
@@ -255,26 +259,64 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// add to field at atomJ the field due to atomI's dipole
psA
[
tj
].
field
[
0
]
+=
mask
?
ijField
[
1
][
0
]
:
0
.
0
f
;
psA
[
tj
].
field
[
1
]
+=
mask
?
ijField
[
1
][
1
]
:
0
.
0
f
;
psA
[
tj
].
field
[
2
]
+=
mask
?
ijField
[
1
][
2
]
:
0
.
0
f
;
if
(
flags
==
0xFFFFFFFF
){
psA
[
jIdx
].
field
[
0
]
+=
mask
?
ijField
[
1
][
0
]
:
0
.
0
f
;
psA
[
jIdx
].
field
[
1
]
+=
mask
?
ijField
[
1
][
1
]
:
0
.
0
f
;
psA
[
jIdx
].
field
[
2
]
+=
mask
?
ijField
[
1
][
2
]
:
0
.
0
f
;
// add to polar field at atomJ the field due to atomI's dipole
psA
[
tj
].
fieldPolar
[
0
]
+=
mask
?
ijField
[
3
][
0
]
:
0
.
0
f
;
psA
[
tj
].
fieldPolar
[
1
]
+=
mask
?
ijField
[
3
][
1
]
:
0
.
0
f
;
psA
[
tj
].
fieldPolar
[
2
]
+=
mask
?
ijField
[
3
][
2
]
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
0
]
+=
mask
?
ijField
[
3
][
0
]
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
1
]
+=
mask
?
ijField
[
3
][
1
]
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
2
]
+=
mask
?
ijField
[
3
][
2
]
:
0
.
0
f
;
}
else
{
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
=
mask
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
=
mask
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
=
mask
?
0
.
0
f
:
ijField
[
1
][
2
];
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
=
mask
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
=
mask
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
=
mask
?
0
.
0
f
:
ijField
[
3
][
2
];
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
1
]
);
}
if
(
tgx
%
4
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
2
]
);
}
if
(
tgx
%
8
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
4
]
);
}
if
(
tgx
%
16
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
8
]
);
}
if
(
tgx
==
0
)
{
psA
[
jIdx
].
field
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
0
];
psA
[
jIdx
].
field
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
1
];
psA
[
jIdx
].
field
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
2
];
psA
[
jIdx
].
fieldPolar
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
0
];
psA
[
jIdx
].
fieldPolar
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
1
];
psA
[
jIdx
].
fieldPolar
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
2
];
}
}
#ifdef AMOEBA_DEBUG
if
(
atomI
==
targetAtom
||
(
y
+
t
j
)
==
targetAtom
){
unsigned
int
index
=
atomI
==
targetAtom
?
(
y
+
t
j
)
:
atomI
;
if
(
atomI
==
targetAtom
||
(
y
+
j
Idx
)
==
targetAtom
){
unsigned
int
index
=
atomI
==
targetAtom
?
(
y
+
j
Idx
)
:
atomI
;
unsigned
int
pullBackIndex
=
0
;
unsigned
int
indexI
=
0
;
unsigned
int
indexJ
=
indexI
?
0
:
2
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
t
j
);
debugArray
[
index
].
z
=
c
AmoebaSim
.
cutoffDistance2
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
Idx
);
debugArray
[
index
].
z
=
c
Sim
.
nonbondedCutoffSqr
;
debugArray
[
index
].
w
=
7
.
0
f
;
...
...
@@ -315,31 +357,12 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
debugArray
[
index
].
y
=
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
// end of j-loop
// Write results
...
...
@@ -364,8 +387,10 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
#endif
lasty
=
y
;
}
}
// end of pInteractionFlag block
}
// end of x == y block
pos
++
;
}
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
View file @
a9054686
...
...
@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s: numBlocks=%d numThreads=%d %d
\n
"
,
methodName
,
numBlocks
,
numThreads
,
amoebaGpu
->
maxMapTorqueDifferencePow2
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psForce
->
Download
();
psCudaForce4
->
Download
();
amoebaGpu
->
torqueMapForce
->
Download
();
amoebaGpu
->
psTorque
->
Download
();
int
maxPrint
=
10
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Post torqueMap
\n
"
);
...
...
@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"fT[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
torqueMapForce
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
torqueMapForce
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
torqueMapForce
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"T[%16.9e %16.9e %16.9e]
\n
"
,
amoebaGpu
->
psTorque
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psTorque
->
_pSysStream
[
0
][
indexOffset
+
1
],
...
...
@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
amoebaGpu
->
maxMapTorqueDifference
,
amoebaGpu
->
torqueMapForce
->
_pDevStream
[
0
],
psCudaForce4
->
_pDevStream
[
0
]
);
LAUNCHERROR
(
"amoebaMapTorqueReduce_kernel
2
"
);
LAUNCHERROR
(
"amoebaMapTorqueReduce_kernel
3
"
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
View file @
a9054686
...
...
@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
}
#undef USE_PERIODIC
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kFindInteractingBlocks.h"
#undef USE_PERIODIC
#undef METHOD_NAME
void
kCalculateAmoebaMultipoleForces
(
amoebaGpuContext
amoebaGpu
,
bool
hasAmoebaGeneralizedKirkwood
)
{
std
::
string
methodName
=
"kCalculateAmoebaMultipoleForces"
;
...
...
@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaFixedEField
(
amoebaGpu
);
cudaComputeAmoebaMutualInducedField
(
amoebaGpu
);
}
else
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kFindInteractionsWithinBlocksPeriodic"
);
if
(
0
){
gpu
->
psInteractionCount
->
Download
();
gpu
->
psInteractingWorkUnit
->
Download
();
gpu
->
psInteractionFlag
->
Download
();
amoebaGpu
->
psWorkUnit
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Ixn count=%u
\n
"
,
gpu
->
psInteractionCount
->
_pSysStream
[
0
][
0
]
);
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
psInteractingWorkUnit
->
_length
;
ii
++
){
unsigned
int
x
=
gpu
->
psInteractingWorkUnit
->
_pSysStream
[
0
][
ii
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
unsigned
int
exclusions
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Cell %8u %8u [%5u %5u %1u] "
,
ii
,
gpu
->
psInteractingWorkUnit
->
_pSysStream
[
0
][
ii
],
x
,
y
,
exclusions
);
x
=
amoebaGpu
->
psWorkUnit
->
_pSysStream
[
0
][
ii
];
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
exclusions
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
" %8u [%5u %5u %1u] %10u
\n
"
,
amoebaGpu
->
psWorkUnit
->
_pSysStream
[
0
][
ii
],
x
,
y
,
exclusions
,
gpu
->
psInteractionFlag
->
_pSysStream
[
0
][
ii
]
);
}
}
else
{
}
cudaComputeAmoebaPmeFixedEField
(
amoebaGpu
);
cudaComputeAmoebaPmeMutualInducedField
(
amoebaGpu
);
}
...
...
plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
View file @
a9054686
...
...
@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
MapStringDouble
tinkerEnergies
;
MapStringVectorOfVectors
supplementary
;
MapStringIntI
isPresent
=
forceMap
.
find
(
AMOEBA_GK_FORCE
);
bool
gkIsActive
;
if
(
isPresent
!=
forceMap
.
end
()
&&
isPresent
->
second
!=
0
){
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment