Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
4e7070c2
Commit
4e7070c2
authored
Apr 30, 2026
by
one
Browse files
Add wave64 LDS spreading in HIP LJ-PME
parent
939ecf28
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
156 additions
and
6 deletions
+156
-6
platforms/common/include/openmm/common/CommonCalcNonbondedForce.h
...s/common/include/openmm/common/CommonCalcNonbondedForce.h
+5
-1
platforms/common/src/CommonCalcNonbondedForce.cpp
platforms/common/src/CommonCalcNonbondedForce.cpp
+26
-5
platforms/common/src/kernels/pme.cc
platforms/common/src/kernels/pme.cc
+125
-0
No files found.
platforms/common/include/openmm/common/CommonCalcNonbondedForce.h
View file @
4e7070c2
...
@@ -45,7 +45,9 @@ namespace OpenMM {
...
@@ -45,7 +45,9 @@ namespace OpenMM {
class
CommonCalcNonbondedForceKernel
:
public
CalcNonbondedForceKernel
{
class
CommonCalcNonbondedForceKernel
:
public
CalcNonbondedForceKernel
{
public:
public:
CommonCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
ComputeContext
&
cc
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
CommonCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
ComputeContext
&
cc
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
hasInitializedKernel
(
false
),
cc
(
cc
),
pmeio
(
NULL
),
stepsToSort
(
0
),
dispersionStepsToSort
(
0
)
{
hasInitializedKernel
(
false
),
cc
(
cc
),
pmeio
(
NULL
),
stepsToSort
(
0
),
dispersionStepsToSort
(
0
),
usePmeDispersionWave64LdsSpread
(
false
),
pmeDispersionSpreadWaveSize
(
0
),
pmeDispersionSpreadBlockSize
(
0
),
pmeDispersionAtomsPerWave
(
0
),
pmeDispersionAtomsPerBlock
(
0
)
{
}
}
~
CommonCalcNonbondedForceKernel
();
~
CommonCalcNonbondedForceKernel
();
/**
/**
...
@@ -170,6 +172,8 @@ private:
...
@@ -170,6 +172,8 @@ private:
int
stepsToSort
;
int
stepsToSort
;
int
dispersionStepsToSort
;
int
dispersionStepsToSort
;
bool
usePmeQueue
,
deviceIsCpu
,
useFixedPointChargeSpreading
,
useCpuPme
;
bool
usePmeQueue
,
deviceIsCpu
,
useFixedPointChargeSpreading
,
useCpuPme
;
bool
usePmeDispersionWave64LdsSpread
;
int
pmeDispersionSpreadWaveSize
,
pmeDispersionSpreadBlockSize
,
pmeDispersionAtomsPerWave
,
pmeDispersionAtomsPerBlock
;
bool
hasCoulomb
,
hasLJ
,
doLJPME
,
usePosqCharges
,
recomputeParams
,
hasOffsets
;
bool
hasCoulomb
,
hasLJ
,
doLJPME
,
usePosqCharges
,
recomputeParams
,
hasOffsets
;
NonbondedMethod
nonbondedMethod
;
NonbondedMethod
nonbondedMethod
;
static
const
int
PmeOrder
=
5
;
static
const
int
PmeOrder
=
5
;
...
...
platforms/common/src/CommonCalcNonbondedForce.cpp
View file @
4e7070c2
...
@@ -287,6 +287,15 @@ void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, cons
...
@@ -287,6 +287,15 @@ void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, cons
bool
usePeriodic
=
(
nonbondedMethod
!=
NoCutoff
&&
nonbondedMethod
!=
CutoffNonPeriodic
);
bool
usePeriodic
=
(
nonbondedMethod
!=
NoCutoff
&&
nonbondedMethod
!=
CutoffNonPeriodic
);
doLJPME
=
(
nonbondedMethod
==
LJPME
&&
hasLJ
);
doLJPME
=
(
nonbondedMethod
==
LJPME
&&
hasLJ
);
usePosqCharges
=
hasCoulomb
?
cc
.
requestPosqCharges
()
:
false
;
usePosqCharges
=
hasCoulomb
?
cc
.
requestPosqCharges
()
:
false
;
pmeDispersionSpreadWaveSize
=
64
;
pmeDispersionSpreadBlockSize
=
256
;
pmeDispersionAtomsPerWave
=
pmeDispersionSpreadWaveSize
/
PmeOrder
;
pmeDispersionAtomsPerBlock
=
(
pmeDispersionSpreadBlockSize
/
pmeDispersionSpreadWaveSize
)
*
pmeDispersionAtomsPerWave
;
// The LDS spread path assumes wave64 execution and is only used for HIP LJ-PME fixed point spreading.
usePmeDispersionWave64LdsSpread
=
(
getPlatform
().
getName
()
==
"HIP"
&&
doLJPME
&&
useFixedPointChargeSpreading
&&
PmeOrder
==
5
&&
cc
.
getSIMDWidth
()
==
pmeDispersionSpreadWaveSize
&&
cc
.
getMaxThreadBlockSize
()
>=
pmeDispersionSpreadBlockSize
);
map
<
string
,
string
>
defines
;
map
<
string
,
string
>
defines
;
defines
[
"HAS_COULOMB"
]
=
(
hasCoulomb
?
"1"
:
"0"
);
defines
[
"HAS_COULOMB"
]
=
(
hasCoulomb
?
"1"
:
"0"
);
defines
[
"HAS_LENNARD_JONES"
]
=
(
hasLJ
?
"1"
:
"0"
);
defines
[
"HAS_LENNARD_JONES"
]
=
(
hasLJ
?
"1"
:
"0"
);
...
@@ -833,9 +842,16 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -833,9 +842,16 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDefines
[
"RECIP_EXP_FACTOR"
]
=
cc
.
doubleToString
(
M_PI
*
M_PI
/
(
dispersionAlpha
*
dispersionAlpha
));
pmeDefines
[
"RECIP_EXP_FACTOR"
]
=
cc
.
doubleToString
(
M_PI
*
M_PI
/
(
dispersionAlpha
*
dispersionAlpha
));
pmeDefines
[
"USE_LJPME"
]
=
"1"
;
pmeDefines
[
"USE_LJPME"
]
=
"1"
;
pmeDefines
[
"CHARGE_FROM_SIGEPS"
]
=
"1"
;
pmeDefines
[
"CHARGE_FROM_SIGEPS"
]
=
"1"
;
if
(
usePmeDispersionWave64LdsSpread
)
{
pmeDefines
[
"PME_USE_WAVE64_LDS_SPREAD"
]
=
"1"
;
pmeDefines
[
"PME_SPREAD_WAVE_SIZE"
]
=
cc
.
intToString
(
pmeDispersionSpreadWaveSize
);
pmeDefines
[
"PME_SPREAD_BLOCK_SIZE"
]
=
cc
.
intToString
(
pmeDispersionSpreadBlockSize
);
pmeDefines
[
"PME_SPREAD_ATOMS_PER_WAVE"
]
=
cc
.
intToString
(
pmeDispersionAtomsPerWave
);
pmeDefines
[
"PME_SPREAD_ATOMS_PER_BLOCK"
]
=
cc
.
intToString
(
pmeDispersionAtomsPerBlock
);
}
program
=
cc
.
compileProgram
(
CommonKernelSources
::
pme
,
pmeDefines
);
program
=
cc
.
compileProgram
(
CommonKernelSources
::
pme
,
pmeDefines
);
pmeDispersionGridIndexKernel
=
program
->
createKernel
(
"findAtomGridIndex"
);
pmeDispersionGridIndexKernel
=
program
->
createKernel
(
"findAtomGridIndex"
);
pmeDispersionSpreadChargeKernel
=
program
->
createKernel
(
"gridSpreadCharge"
);
pmeDispersionSpreadChargeKernel
=
program
->
createKernel
(
usePmeDispersionWave64LdsSpread
?
"gridSpreadChargeWave64Lds"
:
"gridSpreadCharge"
);
pmeDispersionConvolutionKernel
=
program
->
createKernel
(
"reciprocalConvolution"
);
pmeDispersionConvolutionKernel
=
program
->
createKernel
(
"reciprocalConvolution"
);
pmeDispersionEvalEnergyKernel
=
program
->
createKernel
(
"gridEvaluateEnergy"
);
pmeDispersionEvalEnergyKernel
=
program
->
createKernel
(
"gridEvaluateEnergy"
);
pmeDispersionInterpolateForceKernel
=
program
->
createKernel
(
"gridInterpolateForce"
);
pmeDispersionInterpolateForceKernel
=
program
->
createKernel
(
"gridInterpolateForce"
);
...
@@ -1070,9 +1086,14 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1070,9 +1086,14 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionSpreadChargeKernel
->
setArg
(
8
,
recipBoxVectorsFloat
[
1
]);
pmeDispersionSpreadChargeKernel
->
setArg
(
8
,
recipBoxVectorsFloat
[
1
]);
pmeDispersionSpreadChargeKernel
->
setArg
(
9
,
recipBoxVectorsFloat
[
2
]);
pmeDispersionSpreadChargeKernel
->
setArg
(
9
,
recipBoxVectorsFloat
[
2
]);
}
}
if
(
usePmeDispersionWave64LdsSpread
)
{
const
int
workSize
=
((
cc
.
getNumAtoms
()
+
pmeDispersionAtomsPerBlock
-
1
)
/
pmeDispersionAtomsPerBlock
)
*
pmeDispersionSpreadBlockSize
;
pmeDispersionSpreadChargeKernel
->
execute
(
workSize
,
pmeDispersionSpreadBlockSize
);
}
else
pmeDispersionSpreadChargeKernel
->
execute
(
cc
.
getNumAtoms
());
pmeDispersionSpreadChargeKernel
->
execute
(
cc
.
getNumAtoms
());
if
(
useFixedPointChargeSpreading
)
if
(
useFixedPointChargeSpreading
)
pmeDispersionFinishSpreadChargeKernel
->
execute
(
gridSizeX
*
gridSizeY
*
g
ridSizeZ
);
pmeDispersionFinishSpreadChargeKernel
->
execute
(
dispersionGridSizeX
*
dispersionGridSizeY
*
dispersionG
ridSizeZ
);
dispersionFft
->
execFFT
(
pmeGrid1
,
pmeGrid2
,
true
);
dispersionFft
->
execFFT
(
pmeGrid1
,
pmeGrid2
,
true
);
if
(
cc
.
getUseDoublePrecision
())
{
if
(
cc
.
getUseDoublePrecision
())
{
pmeDispersionConvolutionKernel
->
setArg
(
4
,
recipBoxVectors
[
0
]);
pmeDispersionConvolutionKernel
->
setArg
(
4
,
recipBoxVectors
[
0
]);
...
@@ -1093,8 +1114,8 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1093,8 +1114,8 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if
(
!
hasCoulomb
)
if
(
!
hasCoulomb
)
cc
.
clearBuffer
(
pmeEnergyBuffer
);
cc
.
clearBuffer
(
pmeEnergyBuffer
);
if
(
includeEnergy
)
if
(
includeEnergy
)
pmeDispersionEvalEnergyKernel
->
execute
(
gridSizeX
*
gridSizeY
*
g
ridSizeZ
);
pmeDispersionEvalEnergyKernel
->
execute
(
dispersionGridSizeX
*
dispersionGridSizeY
*
dispersionG
ridSizeZ
);
pmeDispersionConvolutionKernel
->
execute
(
gridSizeX
*
gridSizeY
*
g
ridSizeZ
);
pmeDispersionConvolutionKernel
->
execute
(
dispersionGridSizeX
*
dispersionGridSizeY
*
dispersionG
ridSizeZ
);
dispersionFft
->
execFFT
(
pmeGrid2
,
pmeGrid1
,
false
);
dispersionFft
->
execFFT
(
pmeGrid2
,
pmeGrid1
,
false
);
setPeriodicBoxArgs
(
cc
,
pmeDispersionInterpolateForceKernel
,
3
);
setPeriodicBoxArgs
(
cc
,
pmeDispersionInterpolateForceKernel
,
3
);
if
(
cc
.
getUseDoublePrecision
())
{
if
(
cc
.
getUseDoublePrecision
())
{
...
...
platforms/common/src/kernels/pme.cc
View file @
4e7070c2
...
@@ -120,6 +120,131 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
...
@@ -120,6 +120,131 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
}
}
}
}
#if defined(USE_HIP) && defined(CHARGE_FROM_SIGEPS) && PME_ORDER == 5 && defined(PME_USE_WAVE64_LDS_SPREAD)
// HIP LJ-PME dispersion spread variant for wave64 hardware. It keeps the generic
// PME spread algorithm unchanged, but maps one atom across PME_ORDER wavefront
// lanes so the five z-slices can be accumulated in parallel after one lane
// computes the atom's charge, grid index, and B-spline coefficients.
KERNEL
void
gridSpreadChargeWave64Lds
(
GLOBAL
const
real4
*
RESTRICT
posq
,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
GLOBAL
mm_ulong
*
RESTRICT
pmeGrid
,
#else
GLOBAL
real
*
RESTRICT
pmeGrid
,
#endif
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real4
recipBoxVecX
,
real4
recipBoxVecY
,
real4
recipBoxVecZ
,
GLOBAL
const
int2
*
RESTRICT
pmeAtomGridIndex
,
GLOBAL
const
float2
*
RESTRICT
sigmaEpsilon
)
{
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
((
real
)
(
PME_ORDER
-
1
));
// B-spline coefficients are stored as [localAtom][order] so all wavefront
// lanes for the same atom read contiguous LDS entries during the spread phase.
LOCAL
real
sharedDataX
[
PME_SPREAD_BLOCK_SIZE
];
LOCAL
real
sharedDataY
[
PME_SPREAD_BLOCK_SIZE
];
LOCAL
real
sharedDataZ
[
PME_SPREAD_BLOCK_SIZE
];
LOCAL
real
sharedCharge
[
PME_SPREAD_ATOMS_PER_BLOCK
];
LOCAL
int
sharedGridX
[
PME_SPREAD_ATOMS_PER_BLOCK
];
LOCAL
int
sharedGridY
[
PME_SPREAD_ATOMS_PER_BLOCK
];
LOCAL
int
sharedGridZ
[
PME_SPREAD_ATOMS_PER_BLOCK
];
// Each wavefront handles floor(PME_SPREAD_WAVE_SIZE/PME_ORDER) atoms. For
// PME_ORDER=5, lanes 0..4 spread one atom, lanes 5..9 spread the next, and
// the remaining lanes in the wavefront are inactive.
const
int
waveLane
=
LOCAL_ID
&
(
PME_SPREAD_WAVE_SIZE
-
1
);
const
int
wave
=
LOCAL_ID
/
PME_SPREAD_WAVE_SIZE
;
int
atomLane
=
waveLane
-
(
waveLane
/
PME_ORDER
)
*
PME_ORDER
;
int
atomInWave
=
waveLane
/
PME_ORDER
;
int
localAtom
=
wave
*
PME_SPREAD_ATOMS_PER_WAVE
+
atomInWave
;
const
int
atomBlockStride
=
(
GLOBAL_SIZE
/
PME_SPREAD_BLOCK_SIZE
)
*
PME_SPREAD_ATOMS_PER_BLOCK
;
for
(
int
atomBlockStart
=
GROUP_ID
*
PME_SPREAD_ATOMS_PER_BLOCK
;
atomBlockStart
<
NUM_ATOMS
;
atomBlockStart
+=
atomBlockStride
)
{
int
atomIndex
=
atomBlockStart
+
localAtom
;
bool
isActive
=
(
waveLane
<
PME_SPREAD_ATOMS_PER_WAVE
*
PME_ORDER
&&
atomIndex
<
NUM_ATOMS
);
if
(
isActive
&&
atomLane
==
0
)
{
int
atom
=
pmeAtomGridIndex
[
atomIndex
].
x
;
real4
pos
=
posq
[
atom
];
const
float2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
APPLY_PERIODIC_TO_POS
(
pos
)
real3
t
=
make_real3
(
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
,
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
,
pos
.
z
*
recipBoxVecZ
.
z
);
t
.
x
=
(
t
.
x
-
floor
(
t
.
x
))
*
GRID_SIZE_X
;
t
.
y
=
(
t
.
y
-
floor
(
t
.
y
))
*
GRID_SIZE_Y
;
t
.
z
=
(
t
.
z
-
floor
(
t
.
z
))
*
GRID_SIZE_Z
;
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
GRID_SIZE_X
,
((
int
)
t
.
y
)
%
GRID_SIZE_Y
,
((
int
)
t
.
z
)
%
GRID_SIZE_Z
);
sharedCharge
[
localAtom
]
=
charge
;
sharedGridX
[
localAtom
]
=
gridIndex
.
x
;
sharedGridY
[
localAtom
]
=
gridIndex
.
y
;
sharedGridZ
[
localAtom
]
=
gridIndex
.
z
;
real3
dr
=
make_real3
(
t
.
x
-
(
int
)
t
.
x
,
t
.
y
-
(
int
)
t
.
y
,
t
.
z
-
(
int
)
t
.
z
);
data
[
PME_ORDER
-
1
]
=
make_real3
(
0
);
data
[
1
]
=
dr
;
data
[
0
]
=
make_real3
(
1
)
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
real
div
=
RECIP
((
real
)
(
j
-
1
));
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
(
j
-
1
);
k
++
)
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
make_real3
(
k
))
*
data
[
j
-
k
-
2
]
+
(
make_real3
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
div
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
}
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
make_real3
(
j
))
*
data
[
PME_ORDER
-
j
-
2
]
+
(
make_real3
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
int
offset
=
localAtom
*
PME_ORDER
+
j
;
sharedDataX
[
offset
]
=
data
[
j
].
x
;
sharedDataY
[
offset
]
=
data
[
j
].
y
;
sharedDataZ
[
offset
]
=
data
[
j
].
z
;
}
}
// All consumers of an atom's LDS data are in the same wavefront as the
// producing lane, so a wave-level barrier is sufficient here.
SYNC_WARPS
if
(
isActive
)
{
real
charge
=
sharedCharge
[
localAtom
];
if
(
charge
!=
0
)
{
int
gridX
=
sharedGridX
[
localAtom
];
int
gridY
=
sharedGridY
[
localAtom
];
int
gridZ
=
sharedGridZ
[
localAtom
];
int
zindex
=
gridZ
+
atomLane
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real
dz
=
sharedDataZ
[
localAtom
*
PME_ORDER
+
atomLane
]
*
charge
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridX
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
dzdx
=
dz
*
sharedDataX
[
localAtom
*
PME_ORDER
+
ix
];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridY
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
ybase
*
GRID_SIZE_Z
;
int
index
=
xbase
+
ybase
+
zindex
;
real
add
=
dzdx
*
sharedDataY
[
localAtom
*
PME_ORDER
+
iy
];
if
(
fabs
(
add
)
>
2.3e-10
f
)
{
// Smallest value representable in 64 bit fixed point
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
ATOMIC_ADD
(
&
pmeGrid
[
index
],
(
mm_ulong
)
realToFixedPoint
(
add
));
#if defined(__GFX12__) && defined(USE_HIP)
// Workaround for rare cases when few values of pmeGrid are very large and
// incorrect. The cause is unknown. Why this workaround or other irrelevant
// changes like printf help is also unknown.
asm
volatile
(
"s_wait_storecnt 0x0"
);
#endif
#else
ATOMIC_ADD
(
&
pmeGrid
[
index
],
add
);
#endif
}
}
}
}
}
SYNC_WARPS
}
}
#endif
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
KERNEL
void
finishSpreadCharge
(
KERNEL
void
finishSpreadCharge
(
...
...
one
@one
mentioned in commit
c26187aa
·
May 31, 2026
mentioned in commit
c26187aa
mentioned in commit c26187aa9429fd3d1da4a5fe5150efcb0ce5dbfe
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment