Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
e249751c
Commit
e249751c
authored
Jun 26, 2013
by
peastman
Browse files
Further optimizations to CPU based PME
parent
041f973f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
62 additions
and
37 deletions
+62
-37
plugins/cpupme/src/CpuPmeKernels.cpp
plugins/cpupme/src/CpuPmeKernels.cpp
+62
-37
No files found.
plugins/cpupme/src/CpuPmeKernels.cpp
View file @
e249751c
...
@@ -159,29 +159,42 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr
...
@@ -159,29 +159,42 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m128
zdata0to3
=
_mm_set_ps
(
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
__m128
zdata0to3
=
_mm_set_ps
(
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
float
zdata4
=
extractFloat
(
data
[
4
],
2
);
float
zdata4
=
extractFloat
(
data
[
4
],
2
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
if
(
gridIndexZ
+
4
<
gridz
)
{
int
xbase
=
gridIndexX
+
ix
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
int
xbase
=
gridIndexX
+
ix
;
xbase
=
xbase
*
gridy
*
gridz
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
float
xdata
=
extractFloat
(
data
[
ix
],
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
charge
*
xdata
*
extractFloat
(
data
[
iy
],
1
);
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
));
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
));
if
(
gridIndexZ
+
4
<
gridz
)
_mm_storeu_ps
(
&
grid
[
ybase
+
gridIndexZ
],
_mm_add_ps
(
_mm_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
]),
add0to3
));
_mm_storeu_ps
(
&
grid
[
ybase
+
gridIndexZ
],
_mm_add_ps
(
_mm_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
]),
add0to3
));
else
{
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
}
}
}
else
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
));
_mm_store_ps
(
temp
,
add0to3
);
_mm_store_ps
(
temp
,
add0to3
);
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
2
]]
+=
temp
[
2
];
grid
[
ybase
+
zindex
[
2
]]
+=
temp
[
2
];
grid
[
ybase
+
zindex
[
3
]]
+=
temp
[
3
];
grid
[
ybase
+
zindex
[
3
]]
+=
temp
[
3
];
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
}
}
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
}
}
}
}
}
}
...
@@ -231,29 +244,41 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr
...
@@ -231,29 +244,41 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
);
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
);
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
);
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
);
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
);
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
);
int
zindex
[
PME_ORDER
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
zindex
[
j
]
=
gridIndexZ
+
j
;
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
}
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m256
zdata
=
_mm256_set_ps
(
0
,
0
,
0
,
extractFloat
(
data
[
4
],
2
),
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
__m256
zdata
=
_mm256_set_ps
(
0
,
0
,
0
,
extractFloat
(
data
[
4
],
2
),
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
if
(
gridIndexZ
+
5
<
gridz
)
{
int
xbase
=
gridIndexX
+
ix
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
int
xbase
=
gridIndexX
+
ix
;
xbase
=
xbase
*
gridy
*
gridz
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
float
xdata
=
extractFloat
(
data
[
ix
],
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
charge
*
xdata
*
extractFloat
(
data
[
iy
],
1
);
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m256
add
=
_mm256_mul_ps
(
zdata
,
_mm256_set1_ps
(
multiplier
));
__m256
add
=
_mm256_mul_ps
(
zdata
,
_mm256_set1_ps
(
multiplier
));
_mm256_maskstore_ps
(
&
grid
[
ybase
+
gridIndexZ
],
mask
,
_mm256_add_ps
(
_mm256_maskload_ps
(
&
grid
[
ybase
+
gridIndexZ
],
mask
),
add
));
if
(
gridIndexZ
+
5
<
gridz
)
}
_mm256_maskstore_ps
(
&
grid
[
ybase
+
gridIndexZ
],
mask
,
_mm256_add_ps
(
_mm256_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
]),
add
));
}
else
{
}
else
{
int
zindex
[
PME_ORDER
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
zindex
[
j
]
=
gridIndexZ
+
j
;
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
}
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m256
add
=
_mm256_mul_ps
(
zdata
,
_mm256_set1_ps
(
multiplier
));
_mm256_store_ps
(
temp
,
add
);
_mm256_store_ps
(
temp
,
add
);
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment