Commit 041f973f authored by peastman's avatar peastman
Browse files

Bug fixes and minor optimizations

parent f623f01a
...@@ -46,10 +46,8 @@ static const int PME_ORDER = 5; ...@@ -46,10 +46,8 @@ static const int PME_ORDER = 5;
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false; bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0; int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
static float extractFloat(__m128 v, unsigned int element) { static float extractFloat(__m128& v, unsigned int element) {
float f[4]; return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, element)));
_mm_store_ps(f, v);
return f[element];
} }
// Define function to get the number of processors. // Define function to get the number of processors.
...@@ -153,6 +151,11 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr ...@@ -153,6 +151,11 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr
int gridIndexX = _mm_extract_epi32(gridIndex, 0); int gridIndexX = _mm_extract_epi32(gridIndex, 0);
int gridIndexY = _mm_extract_epi32(gridIndex, 1); int gridIndexY = _mm_extract_epi32(gridIndex, 1);
int gridIndexZ = _mm_extract_epi32(gridIndex, 2); int gridIndexZ = _mm_extract_epi32(gridIndex, 2);
int zindex[PME_ORDER];
for (int j = 0; j < PME_ORDER; j++) {
zindex[j] = gridIndexZ+j;
zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
}
float charge = epsilonFactor*posq[4*i+3]; float charge = epsilonFactor*posq[4*i+3];
__m128 zdata0to3 = _mm_set_ps(extractFloat(data[3], 2), extractFloat(data[2], 2), extractFloat(data[1], 2), extractFloat(data[0], 2)); __m128 zdata0to3 = _mm_set_ps(extractFloat(data[3], 2), extractFloat(data[2], 2), extractFloat(data[1], 2), extractFloat(data[0], 2));
float zdata4 = extractFloat(data[4], 2); float zdata4 = extractFloat(data[4], 2);
...@@ -173,21 +176,12 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr ...@@ -173,21 +176,12 @@ static void spreadChargeSSE(int start, int end, float* posq, float* grid, int gr
_mm_storeu_ps(&grid[ybase+gridIndexZ], _mm_add_ps(_mm_loadu_ps(&grid[ybase+gridIndexZ]), add0to3)); _mm_storeu_ps(&grid[ybase+gridIndexZ], _mm_add_ps(_mm_loadu_ps(&grid[ybase+gridIndexZ]), add0to3));
else { else {
_mm_store_ps(temp, add0to3); _mm_store_ps(temp, add0to3);
int zindex = gridIndexZ; grid[ybase+zindex[0]] += temp[0];
grid[ybase+zindex] += temp[0]; grid[ybase+zindex[1]] += temp[1];
zindex++; grid[ybase+zindex[2]] += temp[2];
zindex -= (zindex >= gridz ? gridz : 0); grid[ybase+zindex[3]] += temp[3];
grid[ybase+zindex] += temp[1];
zindex++;
zindex -= (zindex >= gridz ? gridz : 0);
grid[ybase+zindex] += temp[2];
zindex++;
zindex -= (zindex >= gridz ? gridz : 0);
grid[ybase+zindex] += temp[3];
} }
int zindex = gridIndexZ+4; grid[ybase+zindex[4]] += multiplier*zdata4;
zindex -= (zindex >= gridz ? gridz : 0);
grid[ybase+zindex] += multiplier*zdata4;
} }
} }
} }
...@@ -237,6 +231,11 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr ...@@ -237,6 +231,11 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr
int gridIndexX = _mm_extract_epi32(gridIndex, 0); int gridIndexX = _mm_extract_epi32(gridIndex, 0);
int gridIndexY = _mm_extract_epi32(gridIndex, 1); int gridIndexY = _mm_extract_epi32(gridIndex, 1);
int gridIndexZ = _mm_extract_epi32(gridIndex, 2); int gridIndexZ = _mm_extract_epi32(gridIndex, 2);
int zindex[PME_ORDER];
for (int j = 0; j < PME_ORDER; j++) {
zindex[j] = gridIndexZ+j;
zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
}
float charge = epsilonFactor*posq[4*i+3]; float charge = epsilonFactor*posq[4*i+3];
__m256 zdata = _mm256_set_ps(0, 0, 0, extractFloat(data[4], 2), extractFloat(data[3], 2), extractFloat(data[2], 2), extractFloat(data[1], 2), extractFloat(data[0], 2)); __m256 zdata = _mm256_set_ps(0, 0, 0, extractFloat(data[4], 2), extractFloat(data[3], 2), extractFloat(data[2], 2), extractFloat(data[1], 2), extractFloat(data[0], 2));
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
...@@ -256,20 +255,11 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr ...@@ -256,20 +255,11 @@ static void spreadChargeAVX(int start, int end, float* posq, float* grid, int gr
_mm256_maskstore_ps(&grid[ybase+gridIndexZ], mask, _mm256_add_ps(_mm256_loadu_ps(&grid[ybase+gridIndexZ]), add)); _mm256_maskstore_ps(&grid[ybase+gridIndexZ], mask, _mm256_add_ps(_mm256_loadu_ps(&grid[ybase+gridIndexZ]), add));
else { else {
_mm256_store_ps(temp, add); _mm256_store_ps(temp, add);
int zindex = gridIndexZ; grid[ybase+zindex[0]] += temp[0];
grid[ybase+zindex] += temp[0]; grid[ybase+zindex[1]] += temp[1];
zindex++; grid[ybase+zindex[2]] += temp[2];
zindex -= (zindex >= gridz ? gridz : 0); grid[ybase+zindex[3]] += temp[3];
grid[ybase+zindex] += temp[1]; grid[ybase+zindex[4]] += temp[4];
zindex++;
zindex -= (zindex >= gridz ? gridz : 0);
grid[ybase+zindex] += temp[2];
zindex++;
zindex -= (zindex >= gridz ? gridz : 0);
grid[ybase+zindex] += temp[3];
zindex++;
zindex -= (zindex >= gridz ? gridz : 0);
grid[ybase+zindex] += temp[4];
} }
} }
} }
...@@ -421,6 +411,11 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo ...@@ -421,6 +411,11 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
int gridIndexX = _mm_extract_epi32(gridIndex, 0); int gridIndexX = _mm_extract_epi32(gridIndex, 0);
int gridIndexY = _mm_extract_epi32(gridIndex, 1); int gridIndexY = _mm_extract_epi32(gridIndex, 1);
int gridIndexZ = _mm_extract_epi32(gridIndex, 2); int gridIndexZ = _mm_extract_epi32(gridIndex, 2);
int zindex[PME_ORDER];
for (int j = 0; j < PME_ORDER; j++) {
zindex[j] = gridIndexZ+j;
zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
}
__m128 zdata[PME_ORDER]; __m128 zdata[PME_ORDER];
for (int j = 0; j < PME_ORDER; j++) for (int j = 0; j < PME_ORDER; j++)
zdata[j] = _mm_set_ps(0, extractFloat(ddata[j], 2), extractFloat(data[j], 2), extractFloat(data[j], 2)); zdata[j] = _mm_set_ps(0, extractFloat(ddata[j], 2), extractFloat(data[j], 2), extractFloat(data[j], 2));
...@@ -442,15 +437,13 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo ...@@ -442,15 +437,13 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
__m128 xydata = _mm_mul_ps(xdata, _mm_set_ps(0, dy, ddy, dy)); __m128 xydata = _mm_mul_ps(xdata, _mm_set_ps(0, dy, ddy, dy));
for (int iz = 0; iz < PME_ORDER; iz++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndexZ+iz; __m128 gridValue = _mm_set1_ps(grid[ybase+zindex[iz]]);
zindex -= (zindex >= gridz ? gridz : 0);
__m128 gridValue = _mm_set1_ps(grid[ybase+zindex]);
f = _mm_add_ps(f, _mm_mul_ps(xydata, _mm_mul_ps(zdata[iz], gridValue))); f = _mm_add_ps(f, _mm_mul_ps(xydata, _mm_mul_ps(zdata[iz], gridValue)));
} }
} }
} }
f = _mm_mul_ps(invBoxSize, _mm_mul_ps(gridSize, _mm_mul_ps(f, _mm_set1_ps(-epsilonFactor*posq[4*i+3])))); f = _mm_mul_ps(invBoxSize, _mm_mul_ps(gridSize, _mm_mul_ps(f, _mm_set1_ps(-epsilonFactor*posq[4*i+3]))));
_mm_store_ps(&force[4*i], _mm_add_ps(_mm_load_ps(&force[4*i]), f)); _mm_store_ps(&force[4*i], f);
} }
} }
...@@ -647,9 +640,9 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) { ...@@ -647,9 +640,9 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
if (isDeleted) if (isDeleted)
break; break;
if (useAVX) if (useAVX)
spreadChargeSSE(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxSize);
else
spreadChargeAVX(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxSize); spreadChargeAVX(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxSize);
else
spreadChargeSSE(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxSize);
threadWait(); threadWait();
int numGrids = threadData.size(); int numGrids = threadData.size();
for (int i = gridStart; i < gridEnd; i += 4) { for (int i = gridStart; i < gridEnd; i += 4) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment