"openmmapi/src/PythonForce.cpp" did not exist on "00b5aa4d4609e56762beb74e6ffb51ad5439c82d"
Unverified commit 6a5d323c, authored by fzyzcjy and committed by GitHub
Browse files

Speed up dispatch send by refining loop unrolling (#385)

* nits

* hack unrolled warp copy

* Revert "nits"

This reverts commit 3e1b28d9b17f2c1cc46403d432ca576dbf15bd45.
parent c78b9ed7
......@@ -16,8 +16,21 @@
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) \
ST_FUNC(__dst + __i + __j * 32, unrolled_values[__j]); \
} \
for (int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); __i < (N); __i += 32) \
ST_FUNC(__dst + __i, LD_FUNC(__src + __i)); \
{ \
int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); \
_Pragma("unroll") \
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) { \
if (__i + __j * 32 < (N)) { \
unrolled_values[__j] = LD_FUNC(__src + __i + __j * 32); \
} \
} \
_Pragma("unroll") \
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) { \
if (__i + __j * 32 < (N)) { \
ST_FUNC(__dst + __i + __j * 32, unrolled_values[__j]); \
} \
} \
} \
}
namespace deep_ep {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment