Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
a9054686
"wrappers/vscode:/vscode.git/clone" did not exist on "00e3b767689bb96429a41dde35260c66dc2a02c1"
Commit
a9054686
authored
Oct 06, 2010
by
Mark Friedrichs
Browse files
Mods for direct PME
parent
01260070
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
833 additions
and
920 deletions
+833
-920
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+18
-0
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+30
-0
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+12
-0
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+31
-2
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+1
-1
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+3
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
...cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+5
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
...a/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
+5
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
...eba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+12
-12
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
...src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+46
-26
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
.../src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
+217
-375
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
...ms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+58
-54
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
...rms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+175
-336
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
.../src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+50
-10
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
...a/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+126
-101
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
.../platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
+6
-2
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
...platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+38
-0
plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
...amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+0
-1
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
View file @
a9054686
...
...
@@ -42,6 +42,8 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
log
=
NULL
;
contextImpl
=
NULL
;
gpuInitialized
=
false
;
applyCutoff
=
0
;
multipoleForceCount
=
0
;
}
AmoebaCudaData
::~
AmoebaCudaData
()
{
...
...
@@ -122,5 +124,21 @@ void AmoebaCudaData::initializeGpu( void ) {
return
;
}
void
AmoebaCudaData
::
incrementMultipoleForceCount
(
void
)
{
multipoleForceCount
++
;
}
int
AmoebaCudaData
::
getMultipoleForceCount
(
void
)
const
{
return
multipoleForceCount
;
}
void
AmoebaCudaData
::
setApplyCutoff
(
int
inputApplyCutoff
)
{
applyCutoff
=
inputApplyCutoff
;
}
int
AmoebaCudaData
::
getApplyCutoff
(
void
)
const
{
return
applyCutoff
;
}
}
plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
View file @
a9054686
...
...
@@ -139,11 +139,41 @@ public:
*/
void
setContextImpl
(
void
*
contextImpl
);
/**
* Get multipole force count
*
* @return multipole force count
*/
int
getMultipoleForceCount
(
void
)
const
;
/**
* Get multipole force count
*
* @return multipole force count
*/
void
incrementMultipoleForceCount
(
void
);
/**
* Get multipole force count
*
* @return multipole force count
*/
int
getApplyCutoff
(
)
const
;
/**
* Get multipole force count
*
* @return multipole force count
*/
void
setApplyCutoff
(
int
applyCutoff
);
private:
CudaPlatform
::
PlatformData
&
cudaPlatformData
;
amoebaGpuContext
amoebaGpu
;
bool
hasAmoebaBonds
,
hasAmoebaGeneralizedKirkwood
,
hasAmoebaMultipole
;
int
multipoleForceCount
;
int
applyCutoff
;
KernelImpl
*
localForceKernel
;
unsigned
int
kernelCount
;
void
*
contextImpl
;
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
a9054686
...
...
@@ -669,6 +669,13 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
static
void
computeAmoebaMultipoleForce
(
AmoebaCudaData
&
data
)
{
amoebaGpuContext
gpu
=
data
.
getAmoebaGpu
();
if
(
data
.
getMultipoleForceCount
()
==
0
){
gpuCopyInteractingWorkUnit
(
gpu
);
}
if
(
data
.
getApplyCutoff
()
&&
(
data
.
getMultipoleForceCount
()
%
100
)
==
0
){
gpuReorderAtoms
(
gpu
->
gpuContext
);
}
data
.
incrementMultipoleForceCount
();
data
.
initializeGpu
();
if
(
0
&&
data
.
getLog
()
){
...
...
@@ -867,6 +874,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
zsize
=
pmeGridDimension
[
2
];
}
gpuSetAmoebaPMEParameters
(
data
.
getAmoebaGpu
(),
(
float
)
alpha
,
xsize
,
ysize
,
zsize
);
data
.
setApplyCutoff
(
1
);
amoebaGpuContext
amoebaGpu
=
data
.
getAmoebaGpu
();
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
gpu
->
sim
.
nonbondedCutoffSqr
=
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
();
gpu
->
sim
.
nonbondedMethod
=
PARTICLE_MESH_EWALD
;
}
data
.
getAmoebaGpu
()
->
gpuContext
->
forces
.
push_back
(
new
ForceInfo
(
force
));
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
View file @
a9054686
...
...
@@ -350,7 +350,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(
void
)
fprintf
(
log
,
" sqrtPi %15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
sqrtPi
);
(
void
)
fprintf
(
log
,
" alpha Ewald %15.7e
\n
"
,
gpu
->
sim
.
alphaEwald
);
(
void
)
fprintf
(
log
,
" PME grid dimensions %6d %6d %6d
\n
"
,
gpu
->
sim
.
pmeGridSize
.
x
,
gpu
->
sim
.
pmeGridSize
.
y
,
gpu
->
sim
.
pmeGridSize
.
z
);
(
void
)
fprintf
(
log
,
"
cutoffDistance2
%15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
cutoffDistance2
);
(
void
)
fprintf
(
log
,
"
nonbondedCutoffSqr
%15.7e
\n
"
,
gpu
->
sim
.
nonbondedCutoffSqr
);
(
void
)
fprintf
(
log
,
" electric %15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
electric
);
(
void
)
fprintf
(
log
,
" box %15.7e %15.7e %15.7e
\n
"
,
gpu
->
sim
.
periodicBoxSizeX
,
gpu
->
sim
.
periodicBoxSizeY
,
gpu
->
sim
.
periodicBoxSizeZ
);
(
void
)
fprintf
(
log
,
" gkc %15.7e
\n
"
,
amoebaGpu
->
amoebaSim
.
gkc
);
...
...
@@ -1554,7 +1554,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
AMOEBA_NO_CUTOFF
,
AMOEBA_PARTICLE_MESH_EWALD
);
(
void
)
fflush
(
amoebaGpu
->
log
);
}
amoebaGpu
->
amoebaSim
.
cutoffDistance2
=
cutoffDistance
*
cutoffDistance
;
amoebaGpu
->
amoebaSim
.
sqrtPi
=
std
::
sqrt
(
3.14159265358
f
);
amoebaGpu
->
amoebaSim
.
electric
=
electricConstant
;
amoebaGpu
->
gpuContext
->
sim
.
alphaEwald
=
alphaEwald
;
...
...
@@ -4297,4 +4296,34 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
}
}
/**---------------------------------------------------------------------------------------
Track iterations for MI dipoles
@param amoebaGpu amoebaGpuContext reference
@param iteration MI iteration
--------------------------------------------------------------------------------------- */
void
gpuCopyInteractingWorkUnit
(
amoebaGpuContext
amoebaGpu
){
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
gpu
->
psInteractingWorkUnit
->
Download
();
gpu
->
psWorkUnit
->
Download
();
amoebaGpu
->
psWorkUnit
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"gpuCopyInteractingWorkUnit called -- to be removed.
\n
"
);
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
psInteractingWorkUnit
->
_length
;
ii
++
){
gpu
->
psInteractingWorkUnit
->
_pSysStream
[
0
][
ii
]
=
amoebaGpu
->
psWorkUnit
->
_pSysStream
[
0
][
ii
];
gpu
->
psWorkUnit
->
_pSysStream
[
0
][
ii
]
=
amoebaGpu
->
psWorkUnit
->
_pSysStream
[
0
][
ii
];
}
gpu
->
psInteractingWorkUnit
->
Upload
();
gpu
->
psWorkUnit
->
Upload
();
// ---------------------------------------------------------------------------------------
}
#undef AMOEBA_DEBUG
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
View file @
a9054686
...
...
@@ -126,7 +126,7 @@ struct cudaAmoebaGmxSimulation {
unsigned
int
numberOfAtoms
;
// number of atoms
unsigned
int
paddedNumberOfAtoms
;
// padded number of atoms
float
cutoffDistance2
;
// cutoff distance squared for PME
//
float cutoffDistance2; // cutoff distance squared for PME
float
sqrtPi
;
// sqrt(PI)
float
scalingDistanceCutoff
;
// scaling cutoff
float2
*
pDampingFactorAndThole
;
// Thole & damping factors
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
View file @
a9054686
...
...
@@ -343,6 +343,9 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu);
extern
"C"
void
gpuSetAmoebaBondOffsets
(
amoebaGpuContext
gpu
);
extern
"C"
void
gpuCopyInteractingWorkUnit
(
amoebaGpuContext
gpu
);
/*
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
View file @
a9054686
...
...
@@ -44,6 +44,11 @@ struct FixedFieldParticle {
float
gkField
[
3
];
#endif
#ifdef INCLUDE_FIXED_FIELD_BUFFERS
float
tempBuffer
[
3
];
float
tempBufferP
[
3
];
#endif
};
__device__
static
void
loadFixedFieldShared
(
struct
FixedFieldParticle
*
sA
,
unsigned
int
atomI
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedParticle.h
View file @
a9054686
...
...
@@ -24,6 +24,11 @@ struct MutualInducedParticle {
float
fieldS
[
3
];
float
fieldPolarS
[
3
];
#endif
#ifdef INCLUDE_MI_FIELD_BUFFERS
float
tempBuffer
[
3
];
float
tempBufferP
[
3
];
#endif
};
__device__
static
void
loadMutualInducedShared
(
MutualInducedParticle
*
sA
,
unsigned
int
atomI
)
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
View file @
a9054686
...
...
@@ -775,15 +775,15 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
multipole
[
8
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
2
];
multipole
[
9
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
5
];
float
*
phi
=
&
cAmoebaSim
.
pPhi
[
20
*
i
];
cAmoebaSim
.
pTorque
[
3
*
i
]
=
-
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phi
[
2
]
-
multipole
[
2
]
*
zscale
*
phi
[
3
]
cAmoebaSim
.
pTorque
[
3
*
i
]
=
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phi
[
2
]
-
multipole
[
2
]
*
zscale
*
phi
[
3
]
+
2.0
f
*
(
multipole
[
6
]
-
multipole
[
5
])
*
zscale
*
zscale
*
phi
[
9
]
+
multipole
[
8
]
*
yscale
*
yscale
*
phi
[
7
]
+
multipole
[
9
]
*
xscale
*
yscale
*
phi
[
5
]
-
multipole
[
7
]
*
yscale
*
zscale
*
phi
[
8
]
-
multipole
[
9
]
*
xscale
*
zscale
*
phi
[
6
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
-
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phi
[
3
]
-
multipole
[
3
]
*
xscale
*
phi
[
1
]
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phi
[
3
]
-
multipole
[
3
]
*
xscale
*
phi
[
1
]
+
2.0
f
*
(
multipole
[
4
]
-
multipole
[
6
])
*
zscale
*
zscale
*
phi
[
8
]
+
multipole
[
7
]
*
zscale
*
zscale
*
phi
[
9
]
+
multipole
[
8
]
*
xscale
*
zscale
*
phi
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
phi
[
4
]
-
multipole
[
9
]
*
yscale
*
yscale
*
phi
[
7
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
-
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phi
[
1
]
-
multipole
[
1
]
*
yscale
*
phi
[
2
]
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phi
[
1
]
-
multipole
[
1
]
*
yscale
*
phi
[
2
]
+
2.0
f
*
(
multipole
[
5
]
-
multipole
[
4
])
*
yscale
*
yscale
*
phi
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
phi
[
4
]
+
multipole
[
9
]
*
yscale
*
zscale
*
phi
[
8
]
-
multipole
[
7
]
*
xscale
*
yscale
*
phi
[
5
]
-
multipole
[
8
]
*
zscale
*
zscale
*
phi
[
9
]);
...
...
@@ -810,9 +810,9 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
f
.
y
*=
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
y
*
cSim
.
invPeriodicBoxSizeY
;
f
.
z
*=
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
z
*
cSim
.
invPeriodicBoxSizeZ
;
float4
force
=
cSim
.
pForce4
[
i
];
force
.
x
+
=
f
.
x
;
force
.
y
+
=
f
.
y
;
force
.
z
+
=
f
.
z
;
force
.
x
-
=
f
.
x
;
force
.
y
-
=
f
.
y
;
force
.
z
-
=
f
.
z
;
cSim
.
pForce4
[
i
]
=
force
;
...
...
@@ -854,15 +854,15 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
multipole
[
8
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
2
];
multipole
[
9
]
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
i
*
9
+
5
];
float
*
phidp
=
&
cAmoebaSim
.
pPhidp
[
20
*
i
];
cAmoebaSim
.
pTorque
[
3
*
i
]
=
-
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phidp
[
2
]
-
multipole
[
2
]
*
zscale
*
phidp
[
3
]
cAmoebaSim
.
pTorque
[
3
*
i
]
=
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
3
]
*
yscale
*
phidp
[
2
]
-
multipole
[
2
]
*
zscale
*
phidp
[
3
]
+
2.0
f
*
(
multipole
[
6
]
-
multipole
[
5
])
*
zscale
*
zscale
*
phidp
[
9
]
+
multipole
[
8
]
*
yscale
*
yscale
*
phidp
[
7
]
+
multipole
[
9
]
*
xscale
*
yscale
*
phidp
[
5
]
-
multipole
[
7
]
*
yscale
*
zscale
*
phidp
[
8
]
-
multipole
[
9
]
*
xscale
*
zscale
*
phidp
[
6
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
-
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phidp
[
3
]
-
multipole
[
3
]
*
xscale
*
phidp
[
1
]
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
1
]
*
zscale
*
phidp
[
3
]
-
multipole
[
3
]
*
xscale
*
phidp
[
1
]
+
2.0
f
*
(
multipole
[
4
]
-
multipole
[
6
])
*
zscale
*
zscale
*
phidp
[
8
]
+
multipole
[
7
]
*
zscale
*
zscale
*
phidp
[
9
]
+
multipole
[
8
]
*
xscale
*
zscale
*
phidp
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
phidp
[
4
]
-
multipole
[
9
]
*
yscale
*
yscale
*
phidp
[
7
]);
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
-
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phidp
[
1
]
-
multipole
[
1
]
*
yscale
*
phidp
[
2
]
cAmoebaSim
.
pTorque
[
3
*
i
+
2
]
=
0.5
f
*
cAmoebaSim
.
electric
*
(
multipole
[
2
]
*
xscale
*
phidp
[
1
]
-
multipole
[
1
]
*
yscale
*
phidp
[
2
]
+
2.0
f
*
(
multipole
[
5
]
-
multipole
[
4
])
*
yscale
*
yscale
*
phidp
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
phidp
[
4
]
+
multipole
[
9
]
*
yscale
*
zscale
*
phidp
[
8
]
-
multipole
[
7
]
*
xscale
*
yscale
*
phidp
[
5
]
-
multipole
[
8
]
*
zscale
*
zscale
*
phidp
[
9
]);
...
...
@@ -906,9 +906,9 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
f
.
y
*=
0.5
f
*
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
y
*
cSim
.
invPeriodicBoxSizeY
;
f
.
z
*=
0.5
f
*
cAmoebaSim
.
electric
*
cSim
.
pmeGridSize
.
z
*
cSim
.
invPeriodicBoxSizeZ
;
float4
force
=
cSim
.
pForce4
[
i
];
force
.
x
+
=
f
.
x
;
force
.
y
+
=
f
.
y
;
force
.
z
+
=
f
.
z
;
force
.
x
-
=
f
.
x
;
force
.
y
-
=
f
.
y
;
force
.
z
-
=
f
.
z
;
cSim
.
pForce4
[
i
]
=
force
;
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
cAmoebaSim
.
electric
*
energy
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
View file @
a9054686
...
...
@@ -72,8 +72,20 @@ struct PmeDirectElectrostaticParticle {
float
torque
[
3
];
float
padding
;
float
tempForce
[
3
];
float
tempTorque
[
3
];
};
__device__
void
sumTempBuffer
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
){
atomI
.
tempForce
[
0
]
+=
atomJ
.
tempForce
[
0
];
atomI
.
tempForce
[
1
]
+=
atomJ
.
tempForce
[
1
];
atomI
.
tempForce
[
2
]
+=
atomJ
.
tempForce
[
2
];
atomI
.
tempTorque
[
0
]
+=
atomJ
.
tempTorque
[
0
];
atomI
.
tempTorque
[
1
]
+=
atomJ
.
tempTorque
[
1
];
atomI
.
tempTorque
[
2
]
+=
atomJ
.
tempTorque
[
2
];
}
/*
__device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
...
...
@@ -134,9 +146,9 @@ __device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDir
float
uiy
=
0.5
f
*
(
atomI
.
inducedDipole
[
1
]
+
atomI
.
inducedDipoleP
[
1
]);
float
uiz
=
0.5
f
*
(
atomI
.
inducedDipole
[
2
]
+
atomI
.
inducedDipoleP
[
2
]);
atomI
.
torque
[
0
]
-
=
term
*
(
atomI
.
labFrameDipole
[
1
]
*
uiz
-
atomI
.
labFrameDipole
[
2
]
*
uiy
);
atomI
.
torque
[
1
]
-
=
term
*
(
atomI
.
labFrameDipole
[
2
]
*
uix
-
atomI
.
labFrameDipole
[
0
]
*
uiz
);
atomI
.
torque
[
2
]
-
=
term
*
(
atomI
.
labFrameDipole
[
0
]
*
uiy
-
atomI
.
labFrameDipole
[
1
]
*
uix
);
atomI
.
torque
[
0
]
+
=
term
*
(
atomI
.
labFrameDipole
[
1
]
*
uiz
-
atomI
.
labFrameDipole
[
2
]
*
uiy
);
atomI
.
torque
[
1
]
+
=
term
*
(
atomI
.
labFrameDipole
[
2
]
*
uix
-
atomI
.
labFrameDipole
[
0
]
*
uiz
);
atomI
.
torque
[
2
]
+
=
term
*
(
atomI
.
labFrameDipole
[
0
]
*
uiy
-
atomI
.
labFrameDipole
[
1
]
*
uix
);
}
__device__
void
calculatePmeDirectElectrostaticPairIxn_kernel
(
PmeDirectElectrostaticParticle
&
atomI
,
PmeDirectElectrostaticParticle
&
atomJ
,
...
...
@@ -186,7 +198,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float
gfr
[
8
],
gfri
[
7
];
float
gti
[
7
],
gtri
[
7
];
float
conversionFactor
=
(
cAmoebaSim
.
electric
/
cAmoebaSim
.
dielec
);
float
conversionFactor
=
(
-
cAmoebaSim
.
electric
/
cAmoebaSim
.
dielec
);
// set the permanent multipole and induced dipole values;
...
...
@@ -219,7 +231,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
zr
-=
floor
(
zr
*
cSim
.
invPeriodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
float
r2
=
xr
*
xr
+
yr
*
yr
+
zr
*
zr
;
if
(
r2
<=
c
AmoebaSim
.
cutoffDistance2
){
if
(
r2
<=
c
Sim
.
nonbondedCutoffSqr
){
float
r
=
sqrt
(
r2
);
float
ck
=
atomJ
.
q
;
...
...
@@ -540,7 +552,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
e
=
e
-
(
1.0
f
-
scalingFactors
[
MScaleIndex
])
*
erl
;
ei
=
ei
-
erli
;
*
energy
=
conversionFactor
*
(
e
+
ei
);
*
energy
=
-
conversionFactor
*
(
e
+
ei
);
// increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell;
...
...
@@ -1161,15 +1173,27 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)),
maxThreads
);
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
)
),
maxThreads
);
}
kClearFields_3
(
amoebaGpu
,
2
);
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticN2Forces: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u
\n
"
,
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u
\n
"
,
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
),
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
nonbond_threads_per_block
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
]
,
kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -1180,15 +1204,11 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
else
{
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u
\n
"
,
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
PmeDirectElectrostaticParticle
),
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
PmeDirectElectrostaticParticle
)
*
threadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
// gpu->sim.pInteractingWorkUnit,
// amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
(
sizeof
(
PmeDirectElectrostaticParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -1209,7 +1229,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Finished PmeDirectElectrostatic kernel execution
\n
"
);
(
void
)
fflush
(
amoebaGpu
->
log
);
int
maxPrint
=
1400
;
int
maxPrint
=
5
;
float
conversion
=
1.0
f
/
41.84
f
;
float
forceSum
[
3
]
=
{
0.0
f
,
0.0
f
,
0.0
f
};
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
...
...
@@ -1270,7 +1290,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
if
(
1
){
if
(
0
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"DebugElec
\n
"
);
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.h
View file @
a9054686
...
...
@@ -65,6 +65,9 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
unsigned
int
x
;
unsigned
int
y
;
bool
bExclusionFlag
;
int
dScaleMask
;
int2
pScaleMask
;
int2
mScaleMask
;
// Extract cell coordinates
...
...
@@ -99,49 +102,16 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
loadPmeDirectElectrostaticShared
(
&
(
sA
[
threadIdx
.
x
]),
atomI
);
if
(
!
bExclusionFlag
)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
force
[
3
];
float
torque
[
2
][
3
];
float
energy
;
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
j
],
scalingFactors
,
force
,
torque
,
&
energy
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
==
(
y
+
j
))
||
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
j
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
localParticle
.
force
[
0
]
+=
mask
?
force
[
0
]
:
0
.
0
f
;
localParticle
.
force
[
1
]
+=
mask
?
force
[
1
]
:
0
.
0
f
;
localParticle
.
force
[
2
]
+=
mask
?
force
[
2
]
:
0
.
0
f
;
localParticle
.
torque
[
0
]
+=
mask
?
torque
[
0
][
0
]
:
0
.
0
f
;
localParticle
.
torque
[
1
]
+=
mask
?
torque
[
0
][
1
]
:
0
.
0
f
;
localParticle
.
torque
[
2
]
+=
mask
?
torque
[
0
][
2
]
:
0
.
0
f
;
totalEnergy
+=
mask
?
0
.
5
*
energy
:
0
.
0
f
;
}
}
else
// bExclusion
if
(
bExclusionFlag
)
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
scalingFactors
[
DScaleIndex
]
=
scalingFactors
[
PScaleIndex
]
=
scalingFactors
[
MScaleIndex
]
=
1
.
0
f
;
}
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
...
...
@@ -153,9 +123,12 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
// set scale factors
if
(
bExclusionFlag
)
{
getMaskedDScaleFactor
(
j
,
dScaleMask
,
scalingFactors
+
DScaleIndex
);
getMaskedPScaleFactor
(
j
,
pScaleMask
,
scalingFactors
+
PScaleIndex
);
getMaskedMScaleFactor
(
j
,
mScaleMask
,
scalingFactors
+
MScaleIndex
);
}
// force
...
...
@@ -229,8 +202,7 @@ if( atomI == targetAtom ){
}
#endif
}
}
}
// end of j-loop
// include self energy and self torque
...
...
@@ -281,11 +253,15 @@ if( atomI == targetAtom ){
outputTorque
[
offset
+
2
]
=
localParticle
.
torque
[
2
];
#endif
}
else
{
if
(
lasty
!=
y
)
{
}
else
{
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
{
if
(
lasty
!=
y
)
{
// load shared data
loadPmeDirectElectrostaticShared
(
&
(
sA
[
threadIdx
.
x
]),
(
y
+
tgx
)
);
...
...
@@ -300,133 +276,43 @@ if( atomI == targetAtom ){
sA
[
threadIdx
.
x
].
torque
[
1
]
=
0
.
0
f
;
sA
[
threadIdx
.
x
].
torque
[
2
]
=
0
.
0
f
;
if
(
!
bExclusionFlag
)
{
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
force
[
3
];
float
torque
[
2
][
3
];
unsigned
int
atomJ
=
y
+
tj
;
float
energy
;
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
tj
],
scalingFactors
,
force
,
torque
,
&
energy
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
(
atomJ
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
// add force and torque to atom I due atom J
localParticle
.
force
[
0
]
+=
mask
?
force
[
0
]
:
0
.
0
f
;
localParticle
.
force
[
1
]
+=
mask
?
force
[
1
]
:
0
.
0
f
;
localParticle
.
force
[
2
]
+=
mask
?
force
[
2
]
:
0
.
0
f
;
localParticle
.
torque
[
0
]
+=
mask
?
torque
[
0
][
0
]
:
0
.
0
f
;
localParticle
.
torque
[
1
]
+=
mask
?
torque
[
0
][
1
]
:
0
.
0
f
;
localParticle
.
torque
[
2
]
+=
mask
?
torque
[
0
][
2
]
:
0
.
0
f
;
totalEnergy
+=
mask
?
energy
:
0
.
0
f
;
// add force and torque to atom J due atom I
psA
[
tj
].
force
[
0
]
-=
mask
?
force
[
0
]
:
0
.
0
f
;
psA
[
tj
].
force
[
1
]
-=
mask
?
force
[
1
]
:
0
.
0
f
;
psA
[
tj
].
force
[
2
]
-=
mask
?
force
[
2
]
:
0
.
0
f
;
psA
[
tj
].
torque
[
0
]
+=
mask
?
torque
[
1
][
0
]
:
0
.
0
f
;
psA
[
tj
].
torque
[
1
]
+=
mask
?
torque
[
1
][
1
]
:
0
.
0
f
;
psA
[
tj
].
torque
[
2
]
+=
mask
?
torque
[
1
][
2
]
:
0
.
0
f
;
#ifdef AMOEBA_DEBUG
/*
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if
(
atomI
==
targetAtom
||
atomJ
==
targetAtom
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
atomJ
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
float
forceSign
=
(
atomI
==
targetAtom
)
?
1
.
0
f
:
-
1
.
0
f
;
float
blockId
=
2
.
0
f
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
atomJ
;
debugArray
[
index
].
z
=
(
float
)
y
;
debugArray
[
index
].
w
=
blockId
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceSign
*
force
[
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceSign
*
force
[
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceSign
*
force
[
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
blockId
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexI
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexI
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexI
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
mask
?
energy
:
0
.
0
f
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexJ
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexJ
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexJ
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
(
float
)
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullIndex
].
w
;
}
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
else
// bExclusion
if
(
bExclusionFlag
)
{
// Read fixed atom data into registers and GRF
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
mScaleMask
=
cAmoebaSim
.
pM_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
scalingFactors
[
DScaleIndex
]
=
scalingFactors
[
PScaleIndex
]
=
scalingFactors
[
MScaleIndex
]
=
1
.
0
f
;
}
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
unsigned
int
atomJ
=
y
+
jIdx
;
float
force
[
3
];
float
torque
[
2
][
3
];
unsigned
int
atomJ
=
y
+
tj
;
// set scale factors
getMaskedDScaleFactor
(
tj
,
dScaleMask
,
scalingFactors
+
DScaleIndex
);
getMaskedPScaleFactor
(
tj
,
pScaleMask
,
scalingFactors
+
PScaleIndex
);
getMaskedMScaleFactor
(
tj
,
mScaleMask
,
scalingFactors
+
MScaleIndex
);
if
(
bExclusionFlag
)
{
getMaskedDScaleFactor
(
jIdx
,
dScaleMask
,
scalingFactors
+
DScaleIndex
);
getMaskedPScaleFactor
(
jIdx
,
pScaleMask
,
scalingFactors
+
PScaleIndex
);
getMaskedMScaleFactor
(
jIdx
,
mScaleMask
,
scalingFactors
+
MScaleIndex
);
}
// force
float
energy
;
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
t
j
],
calculatePmeDirectElectrostaticPairIxn_kernel
(
localParticle
,
psA
[
j
Idx
],
scalingFactors
,
force
,
torque
,
&
energy
#ifdef AMOEBA_DEBUG
,
pullBack
,
pullBack
#endif
);
...
...
@@ -448,101 +334,58 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// add force and torque to atom J due atom I
psA
[
tj
].
force
[
0
]
-=
mask
?
force
[
0
]
:
0
.
0
f
;
psA
[
tj
].
force
[
1
]
-=
mask
?
force
[
1
]
:
0
.
0
f
;
psA
[
tj
].
force
[
2
]
-=
mask
?
force
[
2
]
:
0
.
0
f
;
if
(
flags
==
0xFFFFFFFF
){
psA
[
tj
].
t
or
qu
e
[
0
]
+
=
mask
?
t
or
que
[
1
][
0
]
:
0
.
0
f
;
psA
[
tj
].
t
or
qu
e
[
1
]
+
=
mask
?
t
or
qu
e
[
1
]
[
1
]
:
0
.
0
f
;
psA
[
tj
].
t
or
qu
e
[
2
]
+
=
mask
?
t
or
que
[
1
][
2
]
:
0
.
0
f
;
psA
[
jIdx
].
f
or
c
e
[
0
]
-
=
mask
?
f
or
ce
[
0
]
:
0
.
0
f
;
psA
[
jIdx
].
f
or
c
e
[
1
]
-
=
mask
?
f
or
c
e
[
1
]
:
0
.
0
f
;
psA
[
jIdx
].
f
or
c
e
[
2
]
-
=
mask
?
f
or
ce
[
2
]
:
0
.
0
f
;
psA
[
jIdx
].
torque
[
0
]
+=
mask
?
torque
[
1
][
0
]
:
0
.
0
f
;
psA
[
jIdx
].
torque
[
1
]
+=
mask
?
torque
[
1
][
1
]
:
0
.
0
f
;
psA
[
jIdx
].
torque
[
2
]
+=
mask
?
torque
[
1
][
2
]
:
0
.
0
f
;
#ifdef AMOEBA_DEBUG
/*
energy = mask ? 0.5*energy : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
debugSetup( atomI, atomJ, debugArray, pullBack );
} */
if
(
atomI
==
targetAtom
||
atomJ
==
targetAtom
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
atomJ
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
1
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
1
:
0
;
float
forceSign
=
(
atomI
==
targetAtom
)
?
1
.
0
f
:
-
1
.
0
f
;
float
blockId
=
3
.
0
f
;
}
else
{
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
atomJ
;
debugArray
[
index
].
z
=
(
float
)
y
;
debugArray
[
index
].
w
=
blockId
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
forceSign
*
force
[
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
forceSign
*
force
[
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
forceSign
*
force
[
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
blockId
;
psA
[
threadIdx
.
x
].
tempForce
[
0
]
=
mask
?
0
.
0
f
:
force
[
0
];
psA
[
threadIdx
.
x
].
tempForce
[
1
]
=
mask
?
0
.
0
f
:
force
[
1
];
psA
[
threadIdx
.
x
].
tempForce
[
2
]
=
mask
?
0
.
0
f
:
force
[
2
];
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexI
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexI
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexI
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
energy
;
psA
[
threadIdx
.
x
].
tempTorque
[
0
]
=
mask
?
0
.
0
f
:
torque
[
1
][
0
];
psA
[
threadIdx
.
x
].
tempTorque
[
1
]
=
mask
?
0
.
0
f
:
torque
[
1
][
1
];
psA
[
threadIdx
.
x
].
tempTorque
[
2
]
=
mask
?
0
.
0
f
:
torque
[
1
][
2
];
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
mask
?
torque
[
indexJ
][
0
]
:
0
.
0
f
;
debugArray
[
index
].
y
=
mask
?
torque
[
indexJ
][
1
]
:
0
.
0
f
;
debugArray
[
index
].
z
=
mask
?
torque
[
indexJ
][
2
]
:
0
.
0
f
;
debugArray
[
index
].
w
=
(
float
)
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullIndex
].
w
;
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
1
]
);
}
if
(
tgx
%
4
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
2
]
);
}
if
(
tgx
%
8
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
4
]
);
}
if
(
tgx
%
16
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
8
]
);
}
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[DScaleIndex];
debugArray[index].y = dScaleVal;
debugArray[index].z = scalingFactors[PScaleIndex];
debugArray[index].w = pScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[MScaleIndex];
debugArray[index].y = mScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomI];
debugArray[index].y = labFrameDipole[3*atomI+1];
debugArray[index].z = labFrameDipole[3*atomI+2];
debugArray[index].w = 25.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomJ];
debugArray[index].y = labFrameDipole[3*atomJ+1];
debugArray[index].z = labFrameDipole[3*atomJ+2];
debugArray[index].w = 26.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = jDipole[0];
debugArray[index].y = jDipole[1];
debugArray[index].z = jDipole[2];
debugArray[index].w = 27.0f;
#endif
}
#endif
if
(
tgx
==
0
)
{
psA
[
jIdx
].
force
[
0
]
-=
psA
[
threadIdx
.
x
].
tempForce
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempForce
[
0
];
psA
[
jIdx
].
force
[
1
]
-=
psA
[
threadIdx
.
x
].
tempForce
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempForce
[
1
];
psA
[
jIdx
].
force
[
2
]
-=
psA
[
threadIdx
.
x
].
tempForce
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempForce
[
2
];
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
psA
[
jIdx
].
torque
[
0
]
+=
psA
[
threadIdx
.
x
].
tempTorque
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempTorque
[
0
];
psA
[
jIdx
].
torque
[
1
]
+=
psA
[
threadIdx
.
x
].
tempTorque
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempTorque
[
1
];
psA
[
jIdx
].
torque
[
2
]
+=
psA
[
threadIdx
.
x
].
tempTorque
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempTorque
[
2
];
}
}
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
// end of j-loop
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float
of
;
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cAmoebaSim
.
paddedNumberOfAtoms
);
...
...
@@ -596,7 +439,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
of
+=
sA
[
threadIdx
.
x
].
torque
[
2
];
outputTorque
[
offset
+
2
]
=
of
;
#else
#else
unsigned
int
offset
=
3
*
(
x
+
tgx
+
(
y
>>
GRIDBITS
)
*
cAmoebaSim
.
paddedNumberOfAtoms
);
outputForce
[
offset
]
=
localParticle
.
force
[
0
];
...
...
@@ -617,13 +460,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
outputTorque
[
offset
+
1
]
=
sA
[
threadIdx
.
x
].
torque
[
1
];
outputTorque
[
offset
+
2
]
=
sA
[
threadIdx
.
x
].
torque
[
2
];
#endif
#endif
lasty
=
y
;
}
}
// end of pInteractionFlag block
}
pos
++
;
}
//printf( "Hello thread: %d %d %d %d\n", blockIdx.x * blockDim.x + threadIdx.x, blockIdx.x, blockDim.x, threadIdx.x );
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
totalEnergy
;
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
View file @
a9054686
...
...
@@ -80,7 +80,6 @@ static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
...
...
@@ -96,7 +95,6 @@ static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int
// Reduce field
const
float
term
=
(
4.0
f
/
3.0
f
)
*
(
cSim
.
alphaEwald
*
cSim
.
alphaEwald
*
cSim
.
alphaEwald
)
/
cAmoebaSim
.
sqrtPi
;
//const float term = 0.0f;
while
(
pos
<
fieldComponents
)
{
...
...
@@ -154,7 +152,20 @@ static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
#undef GK
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
#include "kCalculateAmoebaCudaFixedFieldParticle.h"
#undef INCLUDE_FIXED_FIELD_BUFFERS
__device__
void
sumTempBuffer
(
FixedFieldParticle
&
atomI
,
FixedFieldParticle
&
atomJ
){
atomI
.
tempBuffer
[
0
]
+=
atomJ
.
tempBuffer
[
0
];
atomI
.
tempBuffer
[
1
]
+=
atomJ
.
tempBuffer
[
1
];
atomI
.
tempBuffer
[
2
]
+=
atomJ
.
tempBuffer
[
2
];
atomI
.
tempBufferP
[
0
]
+=
atomJ
.
tempBufferP
[
0
];
atomI
.
tempBufferP
[
1
]
+=
atomJ
.
tempBufferP
[
1
];
atomI
.
tempBufferP
[
2
]
+=
atomJ
.
tempBufferP
[
2
];
}
__device__
void
calculateFixedFieldRealSpacePairIxn_kernel
(
FixedFieldParticle
&
atomI
,
FixedFieldParticle
&
atomJ
,
float
dscale
,
float
pscale
,
float
fields
[
4
][
3
]
#ifdef AMOEBA_DEBUG
...
...
@@ -175,7 +186,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
yr
-=
floor
(
yr
*
cSim
.
invPeriodicBoxSizeY
+
0.5
f
)
*
cSim
.
periodicBoxSizeY
;
zr
-=
floor
(
zr
*
cSim
.
invPeriodicBoxSizeZ
+
0.5
f
)
*
cSim
.
periodicBoxSizeZ
;
float
r2
=
xr
*
xr
+
yr
*
yr
+
zr
*
zr
;
float
r2
=
xr
*
xr
+
yr
*
yr
+
zr
*
zr
;
float
r
=
sqrtf
(
r2
);
// calculate the error function damping terms
...
...
@@ -310,7 +321,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
// increment the field at each site due to this interaction
if
(
r2
<=
c
AmoebaSim
.
cutoffDistance2
){
if
(
r2
<=
c
Sim
.
nonbondedCutoffSqr
){
fields
[
0
][
0
]
=
fim
[
0
]
-
fid
[
0
];
fields
[
0
][
1
]
=
fim
[
1
]
-
fid
[
1
];
...
...
@@ -345,6 +356,7 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
fields
[
2
][
2
]
=
0.0
f
;
fields
[
3
][
2
]
=
0.0
f
;
}
#ifdef AMOEBA_DEBUG
pullBack
[
0
].
x
=
xr
;
pullBack
[
0
].
y
=
yr
;
...
...
@@ -399,6 +411,7 @@ static int isNanOrInfinity( double number ){
static
void
cudaComputeAmoebaPmeDirectFixedEField
(
amoebaGpuContext
amoebaGpu
)
{
static
unsigned
int
threadsPerBlock
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
...
...
@@ -416,40 +429,27 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
// print intermediate results for the targetAtom
unsigned
int
targetAtom
=
0
;
int
maxPrint
=
3002
;
amoebaGpu
->
psE_Field
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Recip EFields In
\n
"
);
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
// E_Field
int
isNan
=
isNanOrInfinity
(
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
]
);
isNan
+=
isNanOrInfinity
(
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
1
]
);
isNan
+=
isNanOrInfinity
(
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E[%16.9e %16.9e %16.9e] %s
\n
"
,
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
psE_Field
->
_pSysStream
[
0
][
indexOffset
+
2
],
(
isNan
?
"XXX"
:
""
)
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
){
ii
=
gpu
->
natoms
-
maxPrint
;
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Recip EFields End
\n
"
);
unsigned
int
targetAtom
=
354
;
#endif
kClearFields_3
(
amoebaGpu
,
2
);
// on first pass, set threads/block
if
(
threadsPerBlock
==
0
){
unsigned
int
maxThreads
;
if
(
gpu
->
sm_version
>=
SM_20
)
maxThreads
=
384
;
else
if
(
gpu
->
sm_version
>=
SM_12
)
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)),
maxThreads
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondT
hreadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
amoebaGpu
->
nonbondT
hreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
]
,
kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
t
hreadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
t
hreadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -459,8 +459,9 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
#endif
}
else
{
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondThreadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
//amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
FixedFieldParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -471,27 +472,16 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
}
LAUNCHERROR
(
"kCalculateAmoebaPmeDirectFixedE_Field_kernel"
);
#if 0
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
//float index = 1.0f;
float index = (float) ii;
for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+1] = index;
amoebaGpu->psWorkArray_3_1->_pSysStream[0][kk+2] = index;
}
}
amoebaGpu->psWorkArray_3_1->Upload();
#endif
kReducePmeDirectE_Fields
(
amoebaGpu
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
gpu
->
psInteractionCount
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"cudaComputeAmoebaPmeDirectFixedEField: threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u
\n
"
,
threadsPerBlock
,
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
)),
(
sizeof
(
FixedFieldParticle
)
+
sizeof
(
float3
))
*
threadsPerBlock
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d
\n
"
,
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondT
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
amoebaGpu
->
nonbondBlocks
,
t
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
FixedFieldParticle
),
sizeof
(
FixedFieldParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
,
gpu
->
bOutputBufferPerWarp
);
(
void
)
fflush
(
amoebaGpu
->
log
);
...
...
@@ -527,6 +517,8 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
*/
amoebaGpu
->
psE_Field
->
Download
();
amoebaGpu
->
psE_FieldPolar
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"E-field (includes self term)"
);
int
maxPrint
=
3002
;
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
...
...
@@ -558,16 +550,29 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
debugArray
->
Download
();
int
paddedNumberOfAtoms
=
amoebaGpu
->
gpuContext
->
sim
.
paddedNumberOfAtoms
;
amoebaGpu
->
gpuContext
->
psPosq4
->
Download
();
for
(
int
jj
=
0
;
jj
<
gpu
->
natoms
;
jj
++
){
int
debugIndex
=
jj
;
if
(
fabs
(
debugArray
->
_pSysStream
[
0
][
jj
+
paddedNumberOfAtoms
].
x
)
>
0.0
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d PmeFixedEField
\n
"
,
jj
);
for
(
int
kk
=
0
;
kk
<
10
;
kk
++
){
for
(
int
kk
=
0
;
kk
<
6
;
kk
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e %16.9e]
\n
"
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
x
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
y
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
z
,
debugArray
->
_pSysStream
[
0
][
debugIndex
].
w
);
debugIndex
+=
paddedNumberOfAtoms
;
}
(
void
)
fprintf
(
amoebaGpu
->
log
,
"[%16.9e %16.9e %16.9e ] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] p
\n
"
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
x
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
y
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
z
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
x
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
x
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
y
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
y
,
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
z
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
z
,
(
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
x
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
x
)
/
5.50
f
,
(
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
y
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
y
)
/
5.50
f
,
(
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
jj
].
z
-
amoebaGpu
->
gpuContext
->
psPosq4
->
_pSysStream
[
0
][
0
].
z
)
/
5.50
f
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"
\n
"
);
}
}
...
...
@@ -581,13 +586,12 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_Field
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psE_FieldPolar
,
outputVector
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaEField"
,
fileId
,
outputVector
);
}
delete
debugArray
;
}
#endif
if
(
0
){
if
(
1
){
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
0
);
VectorOfDoubleVectors
outputVector
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
View file @
a9054686
...
...
@@ -44,10 +44,8 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
){
#ifdef AMOEBA_DEBUG
int
p
ullIndex
Max
=
1
2
;
int
maxP
ullIndex
=
1
;
float4
pullBack
[
12
];
float
dScaleVal
;
float
pScaleVal
;
#endif
extern
__shared__
FixedFieldParticle
sA
[];
...
...
@@ -65,6 +63,10 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned
int
x
;
unsigned
int
y
;
bool
bExclusionFlag
;
float
dScaleValue
;
float
pScaleValue
;
int
dScaleMask
;
int2
pScaleMask
;
// extract cell coordinates
...
...
@@ -97,75 +99,31 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
loadFixedFieldShared
(
&
(
sA
[
threadIdx
.
x
]),
atomI
);
if
(
!
bExclusionFlag
)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
float
ijField
[
4
][
3
];
// load coords, charge, ...
#ifdef AMOEBA_DEBUG
dScaleVal
=
1
.
0
f
;
pScaleVal
=
1
.
0
f
;
#endif
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
j
],
1
.
0
f
,
1
.
0
f
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
match
=
(
atomI
==
(
y
+
j
))
?
1
:
0
;
match
=
1
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
0
];
fieldSum
[
1
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
1
];
fieldSum
[
2
]
+=
match
?
0
.
0
f
:
ijField
[
0
][
2
];
fieldPolarSum
[
0
]
+=
match
?
0
.
0
f
:
ijField
[
2
][
0
];
fieldPolarSum
[
1
]
+=
match
?
0
.
0
f
:
ijField
[
2
][
1
];
fieldPolarSum
[
2
]
+=
match
?
0
.
0
f
:
ijField
[
2
][
2
];
}
}
else
// bExclusion
{
if
(
bExclusionFlag
){
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
xi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
xi
*
(
xi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
dScaleValue
=
pScaleValue
=
1
.
0
f
;
}
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
// load coords, charge, ...
float
ijField
[
4
][
3
];
float
dScaleValue
;
float
pScaleValue
;
if
(
bExclusionFlag
){
getMaskedDScaleFactor
(
j
,
dScaleMask
,
&
dScaleValue
);
getMaskedPScaleFactor
(
j
,
pScaleMask
,
&
pScaleValue
);
}
#ifdef AMOEBA_DEBUG
dScaleVal
=
dScaleValue
;
pScaleVal
=
pScaleValue
;
#endif
float
ijField
[
4
][
3
];
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
j
],
dScaleValue
,
pScaleValue
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
...
...
@@ -193,13 +151,6 @@ if( atomI == targetAtom ){
debugArray
[
index
].
z
=
dScaleValue
;
debugArray
[
index
].
w
=
pScaleValue
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
unsigned
int
off
=
3
*
(
x
+
tgx
+
(
x
>>
GRIDBITS
)
*
cAmoebaSim
.
paddedNumberOfAtoms
);
debugArray
[
index
].
x
=
(
float
)
x
;
debugArray
[
index
].
y
=
(
float
)
tgx
;
debugArray
[
index
].
z
=
-
2
;
debugArray
[
index
].
w
=
(
float
)
off
;
float
flag
=
7
.
0
f
;
for
(
int
ii
=
0
;
ii
<
4
;
ii
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
...
...
@@ -208,8 +159,7 @@ if( atomI == targetAtom ){
debugArray
[
index
].
z
=
match
?
0
.
0
f
:
ijField
[
indices
[
ii
]][
2
];
debugArray
[
index
].
w
=
flag
;
}
for
(
int
pullIndex
=
0
;
pullIndex
<
pullIndexMax
;
pullIndex
++
){
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
...
...
@@ -218,28 +168,9 @@ if( atomI == targetAtom ){
}
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
}
}
// Write results
...
...
@@ -253,12 +184,14 @@ if( atomI == targetAtom ){
load3dArray
(
offset
,
fieldPolarSum
,
outputEFieldPolar
);
#endif
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
if
(
lasty
!=
y
)
{
}
else
{
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
// flags = 0xFFFFFFFF;
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
{
if
(
lasty
!=
y
)
{
// load coordinates, charge, ...
...
...
@@ -270,26 +203,32 @@ if( atomI == targetAtom ){
zeroFixedFieldParticleSharedField
(
&
(
sA
[
threadIdx
.
x
])
);
if
(
!
bExclusionFlag
)
{
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
if
(
bExclusionFlag
)
{
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
}
else
{
dScaleValue
=
pScaleValue
=
1
.
0
f
;
}
float
ijField
[
4
][
3
];
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
){
// load coords, charge, ...
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
if
(
bExclusionFlag
){
getMaskedDScaleFactor
(
jIdx
,
dScaleMask
,
&
dScaleValue
);
getMaskedPScaleFactor
(
jIdx
,
pScaleMask
,
&
pScaleValue
);
}
#ifdef AMOEBA_DEBUG
dScaleVal
=
1
.
0
f
;
pScaleVal
=
1
.
0
f
;
#endif
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
tj
],
1
.
0
f
,
1
.
0
f
,
ijField
float
ijField
[
4
][
3
];
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
jIdx
],
dScaleValue
,
pScaleValue
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
outOfBounds
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
t
j
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
1
:
0
;
unsigned
int
outOfBounds
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
j
Idx
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
1
:
0
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
...
...
@@ -301,205 +240,104 @@ pScaleVal = 1.0f;
fieldPolarSum
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
1
];
fieldPolarSum
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
2
];
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA
[
tj
].
eField
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
tj
].
eField
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
tj
].
eField
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
psA
[
tj
].
eFieldP
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
tj
].
eFieldP
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
#ifdef AMOEBA_DEBUG
if
(
(
atomI
==
targetAtom
||
(
y
+
tj
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
tj
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
2
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
2
:
0
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
unsigned
int
pullBackIndex
=
0
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
pullBackIndex
++
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
if
(
flags
==
0xFFFFFFFF
){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
float
flag
=
8
.
0
f
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
psA
[
jIdx
].
eField
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
jIdx
].
eField
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
jIdx
].
eField
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
psA
[
jIdx
].
eFieldP
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
jIdx
].
eFieldP
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
jIdx
].
eFieldP
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
#if 0
}
else
{
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = pScaleValue + 10.0f;
#endif
}
#endif
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
1
]
);
}
if
(
tgx
%
4
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
2
]
);
}
if
(
tgx
%
8
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
4
]
);
}
if
(
tgx
%
16
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
8
]
);
}
else
// bExclusion
{
// Read fixed atom data into registers and GRF
unsigned
int
xi
=
x
>>
GRIDBITS
;
unsigned
int
yi
=
y
>>
GRIDBITS
;
unsigned
int
cell
=
xi
+
yi
*
cAmoebaSim
.
paddedNumberOfAtoms
/
GRID
-
yi
*
(
yi
+
1
)
/
2
;
int
dScaleMask
=
cAmoebaSim
.
pD_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
int2
pScaleMask
=
cAmoebaSim
.
pP_ScaleIndices
[
cAmoebaSim
.
pScaleIndicesIndex
[
cell
]
+
tgx
];
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
if
(
tgx
==
0
)
{
// load coords, charge, ...
float
ijField
[
4
][
3
];
float
dScaleValue
;
float
pScaleValue
;
getMaskedDScaleFactor
(
tj
,
dScaleMask
,
&
dScaleValue
);
getMaskedPScaleFactor
(
tj
,
pScaleMask
,
&
pScaleValue
);
#ifdef AMOEBA_DEBUG
dScaleVal
=
dScaleValue
;
pScaleVal
=
pScaleValue
;
#endif
calculateFixedFieldRealSpacePairIxn_kernel
(
localParticle
,
psA
[
tj
],
dScaleValue
,
pScaleValue
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
#endif
);
unsigned
int
outOfBounds
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
tj
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
1
:
0
;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
0
][
0
];
fieldSum
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
0
][
1
];
fieldSum
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
0
][
2
];
fieldPolarSum
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
0
];
fieldPolarSum
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
1
];
fieldPolarSum
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
2
][
2
];
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA
[
tj
].
eField
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
tj
].
eField
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
tj
].
eField
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
1
][
2
];
psA
[
tj
].
eFieldP
[
0
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
tj
].
eFieldP
[
1
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
tj
].
eFieldP
[
2
]
+=
outOfBounds
?
0
.
0
f
:
ijField
[
3
][
2
];
psA
[
jIdx
].
eField
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
0
];
psA
[
jIdx
].
eField
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
1
];
psA
[
jIdx
].
eField
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
2
];
psA
[
jIdx
].
eFieldP
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
0
];
psA
[
jIdx
].
eFieldP
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
1
];
psA
[
jIdx
].
eFieldP
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
2
];
}
}
#ifdef AMOEBA_DEBUG
if
(
(
atomI
==
targetAtom
||
(
y
+
t
j
)
==
targetAtom
)
){
if
(
(
atomI
==
targetAtom
||
(
y
+
j
Idx
)
==
targetAtom
)
){
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
t
j
)
:
atomI
;
unsigned
int
index
=
(
atomI
==
targetAtom
)
?
(
y
+
j
Idx
)
:
atomI
;
unsigned
int
indexI
=
(
atomI
==
targetAtom
)
?
0
:
2
;
unsigned
int
indexJ
=
(
atomI
==
targetAtom
)
?
2
:
0
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
tj
);
debugArray
[
index
].
z
=
dScaleVal
;
debugArray
[
index
].
w
=
pScaleVal
;
unsigned
int
pullBackIndex
=
0
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
pullBackIndex
++
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullBackIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullBackIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullBackIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullBackIndex
].
w
;;
debugArray
[
index
].
y
=
(
float
)
(
y
+
jIdx
);
debugArray
[
index
].
z
=
dScaleValue
;
debugArray
[
index
].
w
=
pScaleValue
;
float
flag
=
9
.
0
f
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexI
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexI
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexI
+
1
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
+
1
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
+
1
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexI
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
ijField
[
indexJ
+
1
][
0
];
debugArray
[
index
].
y
=
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
x
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
+
1
][
0
];
debugArray
[
index
].
y
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
outOfBounds
?
0
.
0
f
:
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
for
(
int
pullIndex
=
0
;
pullIndex
<
maxPullIndex
;
pullIndex
++
){
index
+=
cAmoebaSim
.
paddedNumberOfAtoms
;
debugArray
[
index
].
x
=
pullBack
[
pullIndex
].
x
;
debugArray
[
index
].
y
=
pullBack
[
pullIndex
].
y
;
debugArray
[
index
].
z
=
pullBack
[
pullIndex
].
z
;
debugArray
[
index
].
w
=
pullBack
[
pullIndex
].
w
;
}
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
// Write results
}
// j-loop block
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
3
*
(
x
+
tgx
+
warp
*
cAmoebaSim
.
paddedNumberOfAtoms
);
...
...
@@ -520,8 +358,9 @@ if( (atomI == targetAtom || (y + tj) == targetAtom) ){
load3dArray
(
offset
,
sA
[
threadIdx
.
x
].
eFieldP
,
outputEFieldPolar
);
#endif
}
// end of pInteractionFlag block
lasty
=
y
;
}
}
// x == y block
pos
++
;
}
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
View file @
a9054686
...
...
@@ -36,7 +36,21 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef INCLUDE_MI_FIELD_BUFFERS
__device__
void
sumTempBuffer
(
MutualInducedParticle
&
atomI
,
MutualInducedParticle
&
atomJ
){
atomI
.
tempBuffer
[
0
]
+=
atomJ
.
tempBuffer
[
0
];
atomI
.
tempBuffer
[
1
]
+=
atomJ
.
tempBuffer
[
1
];
atomI
.
tempBuffer
[
2
]
+=
atomJ
.
tempBuffer
[
2
];
atomI
.
tempBufferP
[
0
]
+=
atomJ
.
tempBufferP
[
0
];
atomI
.
tempBufferP
[
1
]
+=
atomJ
.
tempBufferP
[
1
];
atomI
.
tempBufferP
[
2
]
+=
atomJ
.
tempBufferP
[
2
];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
...
...
@@ -152,7 +166,7 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
// increment the field at each site due to this interaction
if
(
r2
<=
c
AmoebaSim
.
cutoffDistance2
){
if
(
r2
<=
c
Sim
.
nonbondedCutoffSqr
){
fields
[
0
][
0
]
=
fimd
[
0
]
-
fid
[
0
];
fields
[
1
][
0
]
=
fkmd
[
0
]
-
fkd
[
0
];
...
...
@@ -370,6 +384,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
CUDAStream
<
float
>*
outputArray
,
CUDAStream
<
float
>*
outputPolarArray
)
{
static
unsigned
int
threadsPerBlock
=
0
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
#ifdef AMOEBA_DEBUG
...
...
@@ -389,9 +404,24 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
kClearFields_3
(
amoebaGpu
,
2
);
// on first pass, set threads/block
if
(
threadsPerBlock
==
0
){
unsigned
int
maxThreads
;
if
(
gpu
->
sm_version
>=
SM_20
)
maxThreads
=
384
;
else
if
(
gpu
->
sm_version
>=
SM_12
)
maxThreads
=
128
;
else
maxThreads
=
64
;
threadsPerBlock
=
std
::
min
(
getThreadsPerBlock
(
amoebaGpu
,
sizeof
(
MutualInducedParticle
)),
maxThreads
);
}
if
(
gpu
->
bOutputBufferPerWarp
){
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondThreadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
//gpu->sim.pInteractingWorkUnit,
//amoebaGpu->psWorkUnit->_pDevStream[0],
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -405,14 +435,13 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG
(
void
)
fprintf
(
amoebaGpu
->
log
,
"N2 no warp
\n
"
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u
\n
"
,
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondT
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
amoebaGpu
->
nonbondT
hreadsPerBlock
,
amoebaGpu
->
nonbondBlocks
,
t
hreadsPerBlock
,
amoebaGpu
->
bOutputBufferPerWarp
,
sizeof
(
MutualInducedParticle
),
sizeof
(
MutualInducedParticle
)
*
t
hreadsPerBlock
,
amoebaGpu
->
energyOutputBuffers
,
(
*
gpu
->
psInteractionCount
)[
0
],
gpu
->
sim
.
workUnits
);
(
void
)
fflush
(
amoebaGpu
->
log
);
#endif
kCalculateAmoebaPmeMutualInducedFieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
amoebaGpu
->
nonbondThreadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
amoebaGpu
->
nonbondThreadsPerBlock
>>>
(
amoebaGpu
->
psWorkUnit
->
_pDevStream
[
0
],
kCalculateAmoebaPmeMutualInducedFieldN2_kernel
<<<
amoebaGpu
->
nonbondBlocks
,
threadsPerBlock
,
sizeof
(
MutualInducedParticle
)
*
threadsPerBlock
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
,
amoebaGpu
->
psWorkArray_3_1
->
_pDevStream
[
0
],
#ifdef AMOEBA_DEBUG
amoebaGpu
->
psWorkArray_3_2
->
_pDevStream
[
0
],
...
...
@@ -717,6 +746,17 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
}
}
(
void
)
fflush
(
amoebaGpu
->
log
);
if
(
1
){
std
::
vector
<
int
>
fileId
;
fileId
.
push_back
(
iteration
);
VectorOfDoubleVectors
outputVector
;
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
3
,
gpu
->
psPosq4
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipole
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psInducedDipolePolar
,
outputVector
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaPmeMI"
,
fileId
,
outputVector
);
}
}
#endif
iteration
++
;
...
...
@@ -725,7 +765,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
amoebaGpu
->
mutualInducedDone
=
done
;
amoebaGpu
->
mutualInducedConverged
=
(
!
done
||
iteration
>
amoebaGpu
->
mutualInducedMaxIterations
)
?
0
:
1
;
if
(
0
){
if
(
1
){
std
::
vector
<
int
>
fileId
;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
View file @
a9054686
...
...
@@ -131,7 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
);
debugArray
[
index
].
z
=
c
AmoebaSim
.
cutoffDistance2
;
debugArray
[
index
].
z
=
c
Sim
.
nonbondedCutoffSqr
;
debugArray
[
index
].
w
=
6
.
0
f
;
...
...
@@ -209,10 +209,13 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
#endif
}
else
// 100% utilization
{
// Read fixed atom data into registers and GRF
}
else
{
unsigned
int
flags
=
cSim
.
pInteractionFlag
[
pos
];
if
(
flags
==
0
)
{
// No interactions in this block.
}
else
{
if
(
lasty
!=
y
)
{
unsigned
int
atomJ
=
y
+
tgx
;
...
...
@@ -229,17 +232,18 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
for
(
unsigned
int
j
=
0
;
j
<
GRID
;
j
++
)
{
unsigned
int
jIdx
=
(
flags
==
0xFFFFFFFF
)
?
tj
:
j
;
float
ijField
[
4
][
3
];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
t
j
],
uscale
,
ijField
calculatePmeDirectMutualInducedFieldPairIxn_kernel
(
localParticle
,
psA
[
j
Idx
],
uscale
,
ijField
#ifdef AMOEBA_DEBUG
,
pullBack
,
pullBack
#endif
);
unsigned
int
mask
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
t
j
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
unsigned
int
mask
=
(
(
atomI
>=
cAmoebaSim
.
numberOfAtoms
)
||
((
y
+
j
Idx
)
>=
cAmoebaSim
.
numberOfAtoms
)
)
?
0
:
1
;
// add to field at atomI the field due atomJ's dipole
...
...
@@ -255,26 +259,64 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
// add to field at atomJ the field due atomI's dipole
psA
[
tj
].
field
[
0
]
+=
mask
?
ijField
[
1
][
0
]
:
0
.
0
f
;
psA
[
tj
].
field
[
1
]
+=
mask
?
ijField
[
1
][
1
]
:
0
.
0
f
;
psA
[
tj
].
field
[
2
]
+=
mask
?
ijField
[
1
][
2
]
:
0
.
0
f
;
if
(
flags
==
0xFFFFFFFF
){
psA
[
jIdx
].
field
[
0
]
+=
mask
?
ijField
[
1
][
0
]
:
0
.
0
f
;
psA
[
jIdx
].
field
[
1
]
+=
mask
?
ijField
[
1
][
1
]
:
0
.
0
f
;
psA
[
jIdx
].
field
[
2
]
+=
mask
?
ijField
[
1
][
2
]
:
0
.
0
f
;
// add to polar field at atomJ the field due atomI's dipole
psA
[
tj
].
fieldPolar
[
0
]
+=
mask
?
ijField
[
3
][
0
]
:
0
.
0
f
;
psA
[
tj
].
fieldPolar
[
1
]
+=
mask
?
ijField
[
3
][
1
]
:
0
.
0
f
;
psA
[
tj
].
fieldPolar
[
2
]
+=
mask
?
ijField
[
3
][
2
]
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
0
]
+=
mask
?
ijField
[
3
][
0
]
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
1
]
+=
mask
?
ijField
[
3
][
1
]
:
0
.
0
f
;
psA
[
jIdx
].
fieldPolar
[
2
]
+=
mask
?
ijField
[
3
][
2
]
:
0
.
0
f
;
}
else
{
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
=
mask
?
0
.
0
f
:
ijField
[
1
][
0
];
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
=
mask
?
0
.
0
f
:
ijField
[
1
][
1
];
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
=
mask
?
0
.
0
f
:
ijField
[
1
][
2
];
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
=
mask
?
0
.
0
f
:
ijField
[
3
][
0
];
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
=
mask
?
0
.
0
f
:
ijField
[
3
][
1
];
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
=
mask
?
0
.
0
f
:
ijField
[
3
][
2
];
if
(
tgx
%
2
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
1
]
);
}
if
(
tgx
%
4
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
2
]
);
}
if
(
tgx
%
8
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
4
]
);
}
if
(
tgx
%
16
==
0
){
sumTempBuffer
(
psA
[
threadIdx
.
x
],
psA
[
threadIdx
.
x
+
8
]
);
}
if
(
tgx
==
0
)
{
psA
[
jIdx
].
field
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
0
];
psA
[
jIdx
].
field
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
1
];
psA
[
jIdx
].
field
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBuffer
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBuffer
[
2
];
psA
[
jIdx
].
fieldPolar
[
0
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
0
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
0
];
psA
[
jIdx
].
fieldPolar
[
1
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
1
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
1
];
psA
[
jIdx
].
fieldPolar
[
2
]
+=
psA
[
threadIdx
.
x
].
tempBufferP
[
2
]
+
psA
[
threadIdx
.
x
+
16
].
tempBufferP
[
2
];
}
}
#ifdef AMOEBA_DEBUG
if
(
atomI
==
targetAtom
||
(
y
+
t
j
)
==
targetAtom
){
unsigned
int
index
=
atomI
==
targetAtom
?
(
y
+
t
j
)
:
atomI
;
if
(
atomI
==
targetAtom
||
(
y
+
j
Idx
)
==
targetAtom
){
unsigned
int
index
=
atomI
==
targetAtom
?
(
y
+
j
Idx
)
:
atomI
;
unsigned
int
pullBackIndex
=
0
;
unsigned
int
indexI
=
0
;
unsigned
int
indexJ
=
indexI
?
0
:
2
;
debugArray
[
index
].
x
=
(
float
)
atomI
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
t
j
);
debugArray
[
index
].
z
=
c
AmoebaSim
.
cutoffDistance2
;
debugArray
[
index
].
y
=
(
float
)
(
y
+
j
Idx
);
debugArray
[
index
].
z
=
c
Sim
.
nonbondedCutoffSqr
;
debugArray
[
index
].
w
=
7
.
0
f
;
...
...
@@ -315,31 +357,12 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
debugArray
[
index
].
y
=
ijField
[
indexJ
+
1
][
1
];
debugArray
[
index
].
z
=
ijField
[
indexJ
+
1
][
2
];
debugArray
[
index
].
w
=
flag
;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
tj
=
(
tj
+
1
)
&
(
GRID
-
1
);
}
}
// end of j-loop
// Write results
...
...
@@ -364,8 +387,10 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
#endif
lasty
=
y
;
}
}
// end of pInteractionFlag block
}
// end of x == y block
pos
++
;
}
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
View file @
a9054686
...
...
@@ -653,7 +653,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s: numBlocks=%d numThreads=%d %d
\n
"
,
methodName
,
numBlocks
,
numThreads
,
amoebaGpu
->
maxMapTorqueDifferencePow2
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psForce
->
Download
();
psCudaForce4
->
Download
();
amoebaGpu
->
torqueMapForce
->
Download
();
amoebaGpu
->
psTorque
->
Download
();
int
maxPrint
=
10
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Post torqueMap
\n
"
);
...
...
@@ -670,6 +670,10 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"fT[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
torqueMapForce
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
torqueMapForce
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
torqueMapForce
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"T[%16.9e %16.9e %16.9e]
\n
"
,
amoebaGpu
->
psTorque
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psTorque
->
_pSysStream
[
0
][
indexOffset
+
1
],
...
...
@@ -741,7 +745,7 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce2( amoebaGpuContext amoebaGpu,
amoebaGpu
->
maxMapTorqueDifference
,
amoebaGpu
->
torqueMapForce
->
_pDevStream
[
0
],
psCudaForce4
->
_pDevStream
[
0
]
);
LAUNCHERROR
(
"amoebaMapTorqueReduce_kernel
2
"
);
LAUNCHERROR
(
"amoebaMapTorqueReduce_kernel
3
"
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
View file @
a9054686
...
...
@@ -353,6 +353,13 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
}
#undef USE_PERIODIC
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kFindInteractingBlocks.h"
#undef USE_PERIODIC
#undef METHOD_NAME
void
kCalculateAmoebaMultipoleForces
(
amoebaGpuContext
amoebaGpu
,
bool
hasAmoebaGeneralizedKirkwood
)
{
std
::
string
methodName
=
"kCalculateAmoebaMultipoleForces"
;
...
...
@@ -372,6 +379,37 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaFixedEField
(
amoebaGpu
);
cudaComputeAmoebaMutualInducedField
(
amoebaGpu
);
}
else
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
kFindBlockBoundsPeriodic_kernel
<<<
(
gpu
->
psGridBoundingBox
->
_length
+
63
)
/
64
,
64
>>>
();
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
LAUNCHERROR
(
"kFindInteractionsWithinBlocksPeriodic"
);
if
(
0
){
gpu
->
psInteractionCount
->
Download
();
gpu
->
psInteractingWorkUnit
->
Download
();
gpu
->
psInteractionFlag
->
Download
();
amoebaGpu
->
psWorkUnit
->
Download
();
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Ixn count=%u
\n
"
,
gpu
->
psInteractionCount
->
_pSysStream
[
0
][
0
]
);
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
psInteractingWorkUnit
->
_length
;
ii
++
){
unsigned
int
x
=
gpu
->
psInteractingWorkUnit
->
_pSysStream
[
0
][
ii
];
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
unsigned
int
exclusions
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Cell %8u %8u [%5u %5u %1u] "
,
ii
,
gpu
->
psInteractingWorkUnit
->
_pSysStream
[
0
][
ii
],
x
,
y
,
exclusions
);
x
=
amoebaGpu
->
psWorkUnit
->
_pSysStream
[
0
][
ii
];
y
=
((
x
>>
2
)
&
0x7fff
)
<<
GRIDBITS
;
exclusions
=
(
x
&
0x1
);
x
=
(
x
>>
17
)
<<
GRIDBITS
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
" %8u [%5u %5u %1u] %10u
\n
"
,
amoebaGpu
->
psWorkUnit
->
_pSysStream
[
0
][
ii
],
x
,
y
,
exclusions
,
gpu
->
psInteractionFlag
->
_pSysStream
[
0
][
ii
]
);
}
}
else
{
}
cudaComputeAmoebaPmeFixedEField
(
amoebaGpu
);
cudaComputeAmoebaPmeMutualInducedField
(
amoebaGpu
);
}
...
...
plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
View file @
a9054686
...
...
@@ -4535,7 +4535,6 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
MapStringDouble
tinkerEnergies
;
MapStringVectorOfVectors
supplementary
;
MapStringIntI
isPresent
=
forceMap
.
find
(
AMOEBA_GK_FORCE
);
bool
gkIsActive
;
if
(
isPresent
!=
forceMap
.
end
()
&&
isPresent
->
second
!=
0
){
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment