Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
e249751c
"plugins/cudacompiler/src/CudaCompilerKernelFactory.cpp" did not exist on "05198df6b07d5f7f543fde46212c07147f1c3b11"
Commit
e249751c
authored
Jun 26, 2013
by
peastman
Browse files
Further optimizations to CPU based PME
parent
041f973f
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
62 additions
and
37 deletions
+62
-37
plugins/cpupme/src/CpuPmeKernels.cpp
plugins/cpupme/src/CpuPmeKernels.cpp
+62
-37
No files found.
plugins/cpupme/src/CpuPmeKernels.cpp
View file @
e249751c
...
@@ -159,32 +159,45 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr
...
@@ -159,32 +159,45 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m128
zdata0to3
=
_mm_set_ps
(
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
__m128
zdata0to3
=
_mm_set_ps
(
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
float
zdata4
=
extractFloat
(
data
[
4
],
2
);
float
zdata4
=
extractFloat
(
data
[
4
],
2
);
if
(
gridIndexZ
+
4
<
gridz
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
extractFloat
(
data
[
ix
],
0
);
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
charge
*
xdata
*
extractFloat
(
data
[
iy
],
1
);
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
));
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
));
if
(
gridIndexZ
+
4
<
gridz
)
_mm_storeu_ps
(
&
grid
[
ybase
+
gridIndexZ
],
_mm_add_ps
(
_mm_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
]),
add0to3
));
_mm_storeu_ps
(
&
grid
[
ybase
+
gridIndexZ
],
_mm_add_ps
(
_mm_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
]),
add0to3
));
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
}
}
}
else
{
else
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
));
_mm_store_ps
(
temp
,
add0to3
);
_mm_store_ps
(
temp
,
add0to3
);
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
2
]]
+=
temp
[
2
];
grid
[
ybase
+
zindex
[
2
]]
+=
temp
[
2
];
grid
[
ybase
+
zindex
[
3
]]
+=
temp
[
3
];
grid
[
ybase
+
zindex
[
3
]]
+=
temp
[
3
];
}
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
}
}
}
}
}
}
}
}
}
static
void
spreadChargeAVX
(
int
start
,
int
end
,
float
*
posq
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
static
void
spreadChargeAVX
(
int
start
,
int
end
,
float
*
posq
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
...
@@ -231,29 +244,41 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr
...
@@ -231,29 +244,41 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
);
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
);
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
);
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
);
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
);
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
);
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m256
zdata
=
_mm256_set_ps
(
0
,
0
,
0
,
extractFloat
(
data
[
4
],
2
),
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
if
(
gridIndexZ
+
5
<
gridz
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m256
add
=
_mm256_mul_ps
(
zdata
,
_mm256_set1_ps
(
multiplier
));
_mm256_maskstore_ps
(
&
grid
[
ybase
+
gridIndexZ
],
mask
,
_mm256_add_ps
(
_mm256_maskload_ps
(
&
grid
[
ybase
+
gridIndexZ
],
mask
),
add
));
}
}
}
else
{
int
zindex
[
PME_ORDER
];
int
zindex
[
PME_ORDER
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
zindex
[
j
]
=
gridIndexZ
+
j
;
zindex
[
j
]
=
gridIndexZ
+
j
;
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
}
}
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m256
zdata
=
_mm256_set_ps
(
0
,
0
,
0
,
extractFloat
(
data
[
4
],
2
),
extractFloat
(
data
[
3
],
2
),
extractFloat
(
data
[
2
],
2
),
extractFloat
(
data
[
1
],
2
),
extractFloat
(
data
[
0
],
2
));
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
extractFloat
(
data
[
ix
],
0
);
float
xdata
=
charge
*
extractFloat
(
data
[
ix
],
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
charge
*
xdata
*
extractFloat
(
data
[
iy
],
1
);
float
multiplier
=
xdata
*
extractFloat
(
data
[
iy
],
1
);
__m256
add
=
_mm256_mul_ps
(
zdata
,
_mm256_set1_ps
(
multiplier
));
__m256
add
=
_mm256_mul_ps
(
zdata
,
_mm256_set1_ps
(
multiplier
));
if
(
gridIndexZ
+
5
<
gridz
)
_mm256_maskstore_ps
(
&
grid
[
ybase
+
gridIndexZ
],
mask
,
_mm256_add_ps
(
_mm256_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
]),
add
));
else
{
_mm256_store_ps
(
temp
,
add
);
_mm256_store_ps
(
temp
,
add
);
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment