Commit 0b5d58d7 authored by Charlles Abreu's avatar Charlles Abreu
Browse files

Conflict resolution in TestSplineFilter.cpp

parents 9026dbe7 b0d13582
......@@ -32,7 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#if defined(__ARM__)
#if defined(__ARM__) || defined(__ARM64__)
#include "vectorize_neon.h"
#elif defined(__PPC__)
#include "vectorize_ppc.h"
......
......@@ -46,11 +46,23 @@ class fvec8 {
public:
__m256 val;
fvec8() {}
fvec8() = default;
fvec8(float v) : val(_mm256_set1_ps(v)) {}
fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
fvec8(__m256 v) : val(v) {}
fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
/** Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec8(const float* table, const int idx[8]) {
// :TODO: Using int32_t explicitly as the index type could allow the real gather instruction to be used.
// Use gather and static assert? Conditional code?
val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]);
}
operator __m256() const {
return val;
}
......@@ -115,6 +127,12 @@ public:
return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
}
operator ivec8() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static fvec8 expandBitsToMask(int bitmask);
};
/**
......@@ -160,6 +178,19 @@ inline ivec8::operator fvec8() const {
return _mm256_cvtepi32_ps(val);
}
inline fvec8 fvec8::expandBitsToMask(int bitmask) {
// Put a copy of bit 0 in the first element, bit 1 in the second, and so on.
const auto expandedBits =
_mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi8(bitmask)),
_mm256_castsi256_ps(_mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128)));
// The individual bits are essentially extremely small floating-point values. By comparing against zero
// (even a floating-point zero), the individual bits are turned into a complete element mask.
const auto elementMask = _mm256_cmp_ps(expandedBits, __m256(), _CMP_NEQ_OQ);
return elementMask;
}
// Functions that operate on fvec8s.
static inline fvec8 floor(const fvec8& v) {
......@@ -208,6 +239,11 @@ static inline float dot8(const fvec8& v1, const fvec8& v2) {
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
}
static inline float reduceAdd(const fvec8 v) {
// :TODO: There are more efficient ways to do this.
return dot8(v, fvec8(1.0f));
}
static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
......@@ -231,6 +267,14 @@ static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in
out4 = _mm256_insertf128_ps(out4, i8, 1);
}
/** Given a vec4[8] input array, generate 4 vec8 outputs. The first output contains all the first elements
* the second output the second elements, and so on. Note that the prototype is essentially differing only
* in output type so it can be overloaded in other SIMD fvec types.
*/
static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
}
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
out1 = in1.lowerVec();
out2 = in2.lowerVec();
......@@ -244,6 +288,13 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
_MM_TRANSPOSE4_PS(out5, out6, out7, out8);
}
/**
* Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
*/
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) {
transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
}
// Functions that operate on ivec8s.
static inline bool any(const ivec8& v) {
......@@ -268,10 +319,100 @@ static inline fvec8 operator/(float v1, const fvec8& v2) {
return fvec8(v1)/v2;
}
// Operations for blending fvec8s based on an ivec8.
// Operation for blending fvec8 from a full bitmask.
static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) {
return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
}
static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
return blend(0.0f, v, mask);
}
static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const ivec8& mask) {
return fvec8(_mm256_blendv_ps(v1.val, v2.val, _mm256_castsi256_ps(mask.val)));
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) {
const auto lower = index.lowerVec();
const auto upper = index.upperVec();
// Gather all the separate memory data together. Each vector will have two values
// which get used, and two which are ultimately discarded.
fvec4 t0(table + lower[0]);
fvec4 t1(table + lower[1]);
fvec4 t2(table + lower[2]);
fvec4 t3(table + lower[3]);
fvec4 t4(table + upper[0]);
fvec4 t5(table + upper[1]);
fvec4 t6(table + upper[2]);
fvec4 t7(table + upper[3]);
// Tranposing the 8 vectors above will put all the first elements into one output
// vector, all the second elements into the next vector and so on.
fvec8 discard0, discard1;
transpose(t0, t1, t2, t3, t4, t5, t6, t7, out0, out1, discard0, discard1);
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3 X4 X5 X6 X7
* Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
* Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + ...)
* output[1] = (Y0 + Y1 + Y2 + ...)
* output[2] = (Z0 + Z1 + Z2 + ...)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
// The general strategy for a vector reduce-add operation is to take values from
// different parts of the vector and overlap them a different part of the vector and then
// add together. Repeat this several times until all values have been summed. Initially 8
// values can be reduced to 4, 4 to 2, and 2 to 1. The following code essentially does this
// but exploits two things:
// - having multiple inputs means that some vectors can be combined together to amortise the
// cost of shuffling.
// - the output destinations are part of anther vector, so accumulate into the correct
// offsets to start with, instead of reducing to position 0 and re-inserting to the correct
// output location.
//
// As far as possible, accumulate x, y and z into their output positions in both the top and
// bottom 128-bits to exploit in-lane permutes as much as possible early on.
// Shuffle X and Z together to form one reduced vector.
// X2 X3 Z0 Z1 X6 X7 Z4 Z5
const auto xzshuf = _mm256_shuffle_ps(x, z, 0b01001110);
// Blend X and Z together to form another reduced vector, overlapping the previous.
// X0 X1 Z2 Z3 X4 X5 Z6 Z7
const auto xzblend = _mm256_blend_ps(x, z, 0b11001100);
// Add them together to form:
// (X0 + X2) (X1 + X3) (Z0 + Z2) (Z1 + Z3) etc.
const auto xz0 = _mm256_add_ps(xzshuf, xzblend);
// Now there's only one vector containing all values. Shuffle again to form another overlap,
// and then add.
const auto xz1 = _mm256_permute_ps(xz0, 0b00110001);
const auto xz2 = _mm256_add_ps(xz0, xz1);
// Work on Z on its own as there's nothing else to work with. Start by permuting it to
// form some overlaps, and then add:
// (Y0 + Y2) (Y1 + Y3) - - (Y4 + Y6) (Y5 + Y7) - -
const auto yshuf = _mm256_permute_ps(y, 0b11101110);
const auto y0 = _mm256_add_ps(yshuf, y);
// Shift the bottom float of each pair to the right, into the correct Y location.
const auto y1 = _mm256_permute_ps(y0, 0b00000000);
const auto y2 = _mm256_add_ps(y0, y1);
// Blend the results together to give a complete set of XYZ in the correct respective positions
// of both top and bottom 128-bit lanes.
const auto laneResult = fvec8(_mm256_blend_ps(xz2, y2, 0b00100010));
return laneResult.lowerVec() + laneResult.upperVec();
}
#endif /*OPENMM_VECTORIZE8_H_*/
......@@ -32,7 +32,12 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#ifdef __ANDROID__
#include <cpu-features.h>
#else
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
#include <arm_neon.h>
#include <cmath>
......@@ -48,8 +53,16 @@ float32x4_t log_ps(float32x4_t);
* Determine whether ivec4 and fvec4 are supported on this processor.
*/
static bool isVec4Supported() {
#ifdef __ANDROID__
uint64_t features = android_getCpuFeatures();
return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
#elif defined(__ARM__)
unsigned long features = getauxval(AT_HWCAP);
return (features & HWCAP_NEON) != 0;
#else
unsigned long features = getauxval(AT_HWCAP);
return (features & HWCAP_ASIMD) != 0;
#endif
}
class ivec4;
......@@ -61,7 +74,7 @@ class fvec4 {
public:
float32x4_t val;
fvec4() {}
fvec4() = default;
fvec4(float v) : val(vdupq_n_f32(v)) {}
fvec4(float v1, float v2, float v3, float v4) {
float v[] = {v1, v2, v3, v4};
......@@ -72,6 +85,16 @@ public:
operator float32x4_t() const {
return val;
}
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
float operator[](int i) const {
switch (i) {
case 0:
......@@ -88,6 +111,16 @@ public:
void store(float* v) const {
vst1q_f32(v, val);
}
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
v[0] = vgetq_lane_f32(val, 0);
v[1] = vgetq_lane_f32(val, 1);
v[2] = vgetq_lane_f32(val, 2);
}
fvec4 operator+(const fvec4& other) const {
return vaddq_f32(val, other);
}
......@@ -146,6 +179,13 @@ public:
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
}
operator ivec4() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static ivec4 expandBitsToMask(int bitmask);
};
/**
......@@ -241,6 +281,12 @@ inline ivec4::operator fvec4() const {
return fvec4(vcvtq_f32_s32(val));
}
inline ivec4 fvec4::expandBitsToMask(int bitmask) {
return ivec4(bitmask & 1 ? -1 : 0,
bitmask & 2 ? -1 : 0,
bitmask & 4 ? -1 : 0,
bitmask & 8 ? -1 : 0);
}
// Functions that operate on fvec4s.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
......@@ -284,6 +330,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
}
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
v1[2]*v2[0] - v1[0]*v2[2],
......@@ -301,6 +351,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
v4 = t4.val[1];
}
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
......@@ -343,6 +409,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
}
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
return blend(0.0f, v, mask);
}
// These are at the end since they involve other functions defined above.
static inline fvec4 round(const fvec4& v) {
......@@ -361,4 +431,38 @@ static inline fvec4 ceil(const fvec4& v) {
return rounded + blend(0.0f, 1.0f, rounded<v);
}
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
}
#endif /*OPENMM_VECTORIZE_NEON_H_*/
......@@ -56,7 +56,7 @@ class fvec4 {
public:
__m128 val;
fvec4() {}
fvec4() = default;
fvec4(float v) {
val = {v, v, v, v};
}
......@@ -67,6 +67,16 @@ public:
fvec4(const float* v) {
val = *((__m128*) v);
}
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const {
return val;
}
......@@ -76,6 +86,15 @@ public:
void store(float* v) const {
*((__m128*) v) = val;
}
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
v[0] = val[0];
v[1] = val[1];
v[2] = val[2];
}
fvec4 operator+(const fvec4& other) const {
return val+other;
}
......@@ -116,6 +135,13 @@ public:
ivec4 operator>=(const fvec4& other) const;
ivec4 operator<=(const fvec4& other) const;
operator ivec4() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static ivec4 expandBitsToMask(int bitmask);
};
/**
......@@ -227,6 +253,13 @@ inline ivec4::operator fvec4() const {
return __builtin_convertvector(val, __m128);
}
inline ivec4 fvec4::expandBitsToMask(int bitmask) {
return ivec4(bitmask & 1 ? -1 : 0,
bitmask & 2 ? -1 : 0,
bitmask & 4 ? -1 : 0,
bitmask & 8 ? -1 : 0);
}
// Functions that operate on fvec4s.
static inline fvec4 abs(const fvec4& v) {
......@@ -252,6 +285,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return temp[0]+temp[1];
}
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
__m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
......@@ -269,6 +306,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
v4 = __builtin_shufflevector(a2, a4, 2, 3, 6, 7);
}
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
......@@ -312,6 +365,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
}
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
return blend(0.0f, v, mask);
}
// These are at the end since they involve other functions defined above.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
......@@ -358,5 +415,40 @@ static inline fvec4 sqrt(const fvec4& v) {
return rsqrt(v)*v;
}
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
}
#endif /*OPENMM_VECTORIZE_PNACL_H_*/
......@@ -57,7 +57,7 @@ class fvec4 {
public:
__m128 val;
fvec4() {}
fvec4() = default;
fvec4(float v) {
val = (__m128) {v, v, v, v};
}
......@@ -68,6 +68,16 @@ public:
fvec4(const float* v) {
val = *((__m128*) v);
}
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const {
return val;
}
......@@ -77,6 +87,16 @@ public:
void store(float* v) const {
*((__m128*) v) = val;
}
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
v[0] = val[0];
v[1] = val[1];
v[2] = val[2];
}
fvec4 operator+(const fvec4& other) const {
return vec_add(val, other.val);
}
......@@ -117,6 +137,13 @@ public:
ivec4 operator>=(const fvec4& other) const;
ivec4 operator<=(const fvec4& other) const;
operator ivec4() const;
/***
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static ivec4 expandBitsToMask(int bitmask);
};
/**
......@@ -228,6 +255,13 @@ inline ivec4::operator fvec4() const {
return (__m128) {(float)val[0], (float)val[1], (float)val[2], (float)val[3]};
}
inline ivec4 fvec4::expandBitsToMask(int bitmask) {
return ivec4(bitmask & 1 ? -1 : 0,
bitmask & 2 ? -1 : 0,
bitmask & 4 ? -1 : 0,
bitmask & 8 ? -1 : 0);
}
// Functions that operate on fvec4s.
static inline fvec4 abs(const fvec4& v) {
......@@ -253,6 +287,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return temp[0]+temp[1];
}
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
__m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
......@@ -275,6 +313,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
v4 = vec_perm(a2, a4, perm4);
}
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
......@@ -289,8 +343,8 @@ static inline ivec4 abs(const ivec4& v) {
return vec_abs(v.val);
}
static inline bool any(const __m128i& v) {
return !vec_all_eq(v, ivec4(0).val);
static inline bool any(const ivec4 v) {
return !vec_all_eq(v.val, ivec4(0).val);
}
// Mathematical operators involving a scalar and a vector.
......@@ -317,6 +371,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
}
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
return blend(0.0f, v, mask);
}
// These are at the end since they involve other functions defined above.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
......@@ -355,5 +413,39 @@ static inline fvec4 sqrt(const fvec4& v) {
return vec_sqrt(v.val);
}
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
}
#endif /*OPENMM_VECTORIZE_PPC_H_*/
......@@ -32,7 +32,12 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#ifdef __AVX__
#include <immintrin.h>
#else
#include <smmintrin.h>
#endif
#include "hardware.h"
// This file defines classes and functions to simplify vectorizing code with SSE.
......@@ -63,11 +68,21 @@ class fvec4 {
public:
__m128 val;
fvec4() {}
fvec4() = default;
fvec4(float v) : val(_mm_set1_ps(v)) {}
fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
fvec4(__m128 v) : val(v) {}
fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const {
return val;
}
......@@ -79,6 +94,20 @@ public:
void store(float* v) const {
_mm_storeu_ps(v, val);
}
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
// This code could be called from objects compiled for better SIMD domains (e.g., AVX) so conditionally
// compile in the most efficient variant of the instruction.
#ifdef __AVX__
_mm_maskstore_ps(v, _mm_setr_epi32(-1, -1, -1, 0), val);
#else
_mm_maskmoveu_si128 (_mm_castps_si128(val), _mm_setr_epi32(-1, -1, -1, 0), (char*)v);
#endif
}
fvec4 operator+(const fvec4& other) const {
return _mm_add_ps(val, other);
}
......@@ -131,6 +160,12 @@ public:
return _mm_cmple_ps(val, other);
}
operator ivec4() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static fvec4 expandBitsToMask(int bitmask);
};
/**
......@@ -214,6 +249,13 @@ inline ivec4::operator fvec4() const {
return _mm_cvtepi32_ps(val);
}
inline fvec4 fvec4::expandBitsToMask(int bitmask) {
// Not optimal for SSE (see AVX implementation for better version)
// but useful as an example for other SIMD architectures.
const auto values = fvec4(bitmask & 1, bitmask & 2, bitmask & 4, bitmask & 8);
return values != fvec4(0.0f);
}
// Functions that operate on fvec4s.
static inline fvec4 floor(const fvec4& v) {
......@@ -273,6 +315,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
}
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
......@@ -283,6 +329,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
_MM_TRANSPOSE4_PS(v1, v2, v3, v4);
}
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
......@@ -319,10 +381,48 @@ static inline fvec4 operator/(float v1, const fvec4& v2) {
return fvec4(v1)/v2;
}
// Operations for blending fvec4s based on an ivec4.
// Operations for blending fvec4
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
}
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
return blend(0.0f, v, mask);
}
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
// :TODO: Could be made more efficient.
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
}
#endif /*OPENMM_VECTORIZE_SSE_H_*/
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2015-2016 Stanford University and the Authors. *
* Portions copyright (c) 2015-2020 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -128,3 +128,15 @@ void CompoundIntegrator::stateChanged(State::DataType changed) {
double CompoundIntegrator::computeKineticEnergy() {
return integrators[currentIntegrator]->computeKineticEnergy();
}
void CompoundIntegrator::createCheckpoint(std::ostream& stream) const {
stream.write((char*) &currentIntegrator, sizeof(int));
for (int i = 0; i < integrators.size(); i++)
integrators[i]->createCheckpoint(stream);
}
void CompoundIntegrator::loadCheckpoint(std::istream& stream) {
stream.read((char*) &currentIntegrator, sizeof(int));
for (int i = 0; i < integrators.size(); i++)
integrators[i]->loadCheckpoint(stream);
}
......@@ -454,6 +454,7 @@ void ContextImpl::createCheckpoint(ostream& stream) {
stream.write((char*) &param.second, sizeof(double));
}
updateStateDataKernel.getAs<UpdateStateDataKernel>().createCheckpoint(*this, stream);
integrator.createCheckpoint(stream);
stream.flush();
}
......@@ -480,6 +481,7 @@ void ContextImpl::loadCheckpoint(istream& stream) {
parameters[name] = value;
}
updateStateDataKernel.getAs<UpdateStateDataKernel>().loadCheckpoint(*this, stream);
integrator.loadCheckpoint(stream);
hasSetPositions = true;
integrator.stateChanged(State::Positions);
integrator.stateChanged(State::Velocities);
......
......@@ -114,6 +114,31 @@ bool CustomIntegrator::kineticEnergyRequiresForce() const {
return keNeedsForce;
}
void CustomIntegrator::createCheckpoint(std::ostream& stream) const {
for (int i = 0; i < getNumGlobalVariables(); i++) {
double value = getGlobalVariable(i);
stream.write((char*) &value, sizeof(double));
}
vector<Vec3> values;
for (int i = 0; i < getNumPerDofVariables(); i++) {
getPerDofVariable(i, values);
stream.write((char*) values.data(), sizeof(Vec3)*values.size());
}
}
void CustomIntegrator::loadCheckpoint(std::istream& stream) {
double value;
for (int i = 0; i < getNumGlobalVariables(); i++) {
stream.read((char*) &value, sizeof(double));
setGlobalVariable(i, value);
}
vector<Vec3> values(context->getSystem().getNumParticles());
for (int i = 0; i < getNumPerDofVariables(); i++) {
stream.read((char*) values.data(), sizeof(Vec3)*values.size());
setPerDofVariable(i, values);
}
}
void CustomIntegrator::step(int steps) {
if (context == NULL)
throw OpenMMException("This Integrator is not bound to a context!");
......
......@@ -55,7 +55,10 @@ std::vector<double> NoseHooverChain::getYoshidaSuzukiWeights() const {
case 5:
return {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065,
0.2967324292201065};
case 7:
return {0.784513610477560, 0.235573213359357, -1.17767998417887, 1.31518632068391,
-1.17767998417887, 0.235573213359357, 0.784513610477560};
default:
throw OpenMMException("The number of Yoshida-Suzuki weights must be 1,3, or 5.");
throw OpenMMException("The number of Yoshida-Suzuki weights must be 1,3,5, or 7.");
}
}
......@@ -68,11 +68,6 @@ NoseHooverIntegrator::NoseHooverIntegrator(double temperature, double collisionF
NoseHooverIntegrator::~NoseHooverIntegrator() {}
std::pair<double, double> NoseHooverIntegrator::propagateChain(std::pair<double, double> kineticEnergy, int chainID) {
return nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, noseHooverChains.at(chainID), kineticEnergy, getStepSize());
}
int NoseHooverIntegrator::addThermostat(double temperature, double collisionFrequency,
int chainLength, int numMTS, int numYoshidaSuzuki) {
hasSubsystemThermostats_ = false;
......@@ -271,10 +266,10 @@ double NoseHooverIntegrator::computeKineticEnergy() {
double kE = 0.0;
if(noseHooverChains.size() > 0) {
for (const auto &nhc: noseHooverChains){
kE += nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, true).first;
kE += kernel.getAs<IntegrateNoseHooverStepKernel>().computeMaskedKineticEnergy(*context, nhc, true).first;
}
} else {
kE = vvKernel.getAs<IntegrateVelocityVerletStepKernel>().computeKineticEnergy(*context, *this);
kE = kernel.getAs<IntegrateNoseHooverStepKernel>().computeKineticEnergy(*context, *this);
}
return kE;
}
......@@ -287,7 +282,7 @@ double NoseHooverIntegrator::computeHeatBathEnergy() {
double energy = 0;
for(auto &nhc : noseHooverChains) {
if (context && (nhc.getNumDegreesOfFreedom() > 0)) {
energy += nhcKernel.getAs<NoseHooverChainKernel>().computeHeatBathEnergy(*context, nhc);
energy += kernel.getAs<IntegrateNoseHooverStepKernel>().computeHeatBathEnergy(*context, nhc);
}
}
return energy;
......@@ -300,10 +295,8 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) {
context = &contextRef;
const System& system = context->getSystem();
owner = &contextRef.getOwner();
vvKernel = context->getPlatform().createKernel(IntegrateVelocityVerletStepKernel::Name(), contextRef);
vvKernel.getAs<IntegrateVelocityVerletStepKernel>().initialize(contextRef.getSystem(), *this);
nhcKernel = context->getPlatform().createKernel(NoseHooverChainKernel::Name(), contextRef);
nhcKernel.getAs<NoseHooverChainKernel>().initialize();
kernel = context->getPlatform().createKernel(IntegrateNoseHooverStepKernel::Name(), contextRef);
kernel.getAs<IntegrateNoseHooverStepKernel>().initialize(contextRef.getSystem(), *this);
forcesAreValid = false;
// check for drude particles and build the Nose-Hoover Chains
......@@ -329,34 +322,30 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) {
}
void NoseHooverIntegrator::cleanup() {
vvKernel = Kernel();
nhcKernel = Kernel();
kernel = Kernel();
}
vector<string> NoseHooverIntegrator::getKernelNames() {
std::vector<std::string> names;
names.push_back(NoseHooverChainKernel::Name());
names.push_back(IntegrateVelocityVerletStepKernel::Name());
names.push_back(IntegrateNoseHooverStepKernel::Name());
return names;
}
void NoseHooverIntegrator::step(int steps) {
if (context == NULL)
throw OpenMMException("This Integrator is not bound to a context!");
std::pair<double, double> scale, kineticEnergy;
for (int i = 0; i < steps; ++i) {
if(context->updateContextState())
forcesAreValid = false;
for(auto &nhc : noseHooverChains) {
kineticEnergy = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, false);
scale = nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, nhc, kineticEnergy, getStepSize());
nhcKernel.getAs<NoseHooverChainKernel>().scaleVelocities(*context, nhc, scale);
}
vvKernel.getAs<IntegrateVelocityVerletStepKernel>().execute(*context, *this, forcesAreValid);
for(auto &nhc : noseHooverChains) {
kineticEnergy = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, false);
scale = nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, nhc, kineticEnergy, getStepSize());
nhcKernel.getAs<NoseHooverChainKernel>().scaleVelocities(*context, nhc, scale);
}
context->calcForcesAndEnergy(true, false);
kernel.getAs<IntegrateNoseHooverStepKernel>().execute(*context, *this, forcesAreValid);
}
}
void NoseHooverIntegrator::createCheckpoint(std::ostream& stream) const {
kernel.getAs<IntegrateNoseHooverStepKernel>().createCheckpoint(*context, stream);
}
void NoseHooverIntegrator::loadCheckpoint(std::istream& stream) {
kernel.getAs<IntegrateNoseHooverStepKernel>().loadCheckpoint(*context, stream);
}
\ No newline at end of file
......@@ -86,8 +86,8 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
// Create the system of equations to solve.
vector<double> a(n), b(n), c(n), rhs(n);
a[0] = 0.0;
vector<double> a(n-1), b(n-1), c(n-1), rhs(n-1);
a[0] = x[n-1]-x[n-2];
b[0] = 2.0*(x[1]-x[0]+x[n-1]-x[n-2]);
c[0] = x[1]-x[0];
rhs[0] = 6.0*((y[1]-y[0])/(x[1]-x[0]) - (y[n-1]-y[n-2])/(x[n-1]-x[n-2]));
......@@ -97,17 +97,14 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
c[i] = x[i+1]-x[i];
rhs[i] = 6.0*((y[i+1]-y[i])/(x[i+1]-x[i]) - (y[i]-y[i-1])/(x[i]-x[i-1]));
}
a[n-1] = 0.0;
b[n-1] = 1.0;
c[n-1] = 0.0;
rhs[n-1] = 0.0;
double beta = x[n-1]-x[n-2];
double alpha = -1.0;
double beta = a[0];
double alpha = c[n-2];
double gamma = -b[0];
// This is a cyclic tridiagonal matrix. We solve it using the Sherman-Morrison method,
// which involves solving two tridiagonal systems.
n--;
b[0] -= gamma;
b[n-1] -= alpha*beta/gamma;
solveTridiagonalMatrix(a, b, c, rhs, deriv);
......@@ -118,6 +115,7 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
double scale = (deriv[0]+beta*deriv[n-1]/gamma)/(1.0+z[0]+beta*z[n-1]/gamma);
for (int i = 0; i < n; i++)
deriv[i] -= scale*z[i];
deriv[n] = deriv[0];
}
double SplineFitter::evaluateSpline(const vector<double>& x, const vector<double>& y, const vector<double>& deriv, double t) {
......
......@@ -944,11 +944,13 @@ private:
/*
* This kernel is invoked by NoseHooverIntegrator to take one time step.
*/
class CommonIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
class CommonIntegrateNoseHooverStepKernel : public IntegrateNoseHooverStepKernel {
public:
CommonIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, ComputeContext& cc) :
IntegrateVelocityVerletStepKernel(name, platform), cc(cc), hasInitializedKernels(false) { }
~CommonIntegrateVelocityVerletStepKernel() {}
CommonIntegrateNoseHooverStepKernel(std::string name, const Platform& platform, ComputeContext& cc) :
IntegrateNoseHooverStepKernel(name, platform), cc(cc), hasInitializedKernels(false),
hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
hasInitializedScaleVelocitiesKernel(false), hasInitializedPropagateKernel(false) {}
~CommonIntegrateNoseHooverStepKernel() {}
/**
* Initialize the kernel.
*
......@@ -972,39 +974,16 @@ public:
* @param integrator the NoseHooverIntegrator this kernel is being used for
*/
double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
private:
ComputeContext& cc;
float prevMaxPairDistance;
ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer;
ComputeKernel kernel1, kernel2, kernel3, kernelHardWall;
bool hasInitializedKernels;
};
/**
* This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
* and update the associated particle velocities.
*/
class CommonNoseHooverChainKernel : public NoseHooverChainKernel {
public:
CommonNoseHooverChainKernel(std::string name, const Platform& platform, ComputeContext& cc) :
NoseHooverChainKernel(name, platform), cc(cc), hasInitializedPropagateKernel(false),
hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
hasInitializedScaleVelocitiesKernel(false) {}
~CommonNoseHooverChainKernel() {}
/**
* Initialize the kernel.
*/
void initialize();
/**
* Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
*
* @param context the context in which to execute this kernel
* @param noseHooverChain the object describing the chain to be propagated.
* @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
* @param kineticEnergy the {center of mass, relative} kineticEnergies of the particles being thermostated by this chain.
* @param timeStep the time step used by the integrator.
* @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
* @return the velocity scale factor to apply to the particles associated with this heat bath.
*/
std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> kineticEnergy, double timeStep);
/**
* Execute the kernal that computes the total (kinetic + potential) heat bath energy.
*
......@@ -1012,7 +991,7 @@ public:
* @param noseHooverChain the chain whose energy is to be determined.
* @return the total heat bath energy.
*/
double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain);
/**
* Execute the kernel that computes the kinetic energy for a subset of atoms,
* or the relative kinetic energy of Drude particles with respect to their parent atoms
......@@ -1020,22 +999,37 @@ public:
* @param context the context in which to execute this kernel
* @param noseHooverChain the chain whose energy is to be determined.
* @param downloadValue whether the computed value should be downloaded and returned.
*
*/
std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
std::pair<double, double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
/**
* Execute the kernel that scales the velocities of particles associated with a nose hoover chain
*
* @param context the context in which to execute this kernel
* @param noseHooverChain the chain whose energy is to be determined.
* @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
* @param scaleFactor the multiplicative factor by which {absolute, relative} velocities are scaled.
*/
void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);
void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactor);
/**
* Write the chain states to a checkpoint.
*/
void createCheckpoint(ContextImpl& context, std::ostream& stream) const;
/**
* Load the chain states from a checkpoint.
*/
void loadCheckpoint(ContextImpl& context, std::istream& stream);
private:
int sumWorkGroupSize;
ComputeContext& cc;
float prevMaxPairDistance;
ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer, oldDelta;
std::map<int, ComputeArray> chainState;
ComputeKernel kernel1, kernel2, kernel3, kernel4, kernelHardWall;
bool hasInitializedKernels;
ComputeKernel reduceEnergyKernel;
ComputeKernel computeHeatBathEnergyKernel;
ComputeKernel computeAtomsKineticEnergyKernel;
ComputeKernel computePairsKineticEnergyKernel;
ComputeKernel scaleAtomsVelocitiesKernel;
ComputeKernel scalePairsVelocitiesKernel;
ComputeArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
std::map<int, ComputeArray> atomlists, pairlists;
std::map<int, ComputeKernel> propagateKernels;
......@@ -1043,12 +1037,6 @@ private:
bool hasInitializedKineticEnergyKernel;
bool hasInitializedHeatBathEnergyKernel;
bool hasInitializedScaleVelocitiesKernel;
ComputeKernel reduceEnergyKernel;
ComputeKernel computeHeatBathEnergyKernel;
ComputeKernel computeAtomsKineticEnergyKernel;
ComputeKernel computePairsKineticEnergyKernel;
ComputeKernel scaleAtomsVelocitiesKernel;
ComputeKernel scalePairsVelocitiesKernel;
};
/**
......
......@@ -130,12 +130,6 @@ public:
* @param timeShift the amount by which to shift the velocities in time
*/
double computeKineticEnergy(double timeShift);
/**
* Get the data structure that holds the state of all Nose-Hoover chains
*/
std::map<int, ComputeArray>& getNoseHooverChainState() {
return noseHooverChainState;
}
protected:
virtual void applyConstraintsImpl(bool constrainVelocities, double tol) = 0;
ComputeContext& context;
......@@ -174,7 +168,6 @@ protected:
ComputeArray vsiteLocalCoordsWeights;
ComputeArray vsiteLocalCoordsPos;
ComputeArray vsiteLocalCoordsStartIndex;
std::map<int, ComputeArray> noseHooverChainState;
int randomPos, lastSeed, numVsites;
bool hasOverlappingVsites;
mm_double2 lastStepSize;
......
......@@ -5640,20 +5640,68 @@ double CommonIntegrateLangevinMiddleStepKernel::computeKineticEnergy(ContextImpl
return cc.getIntegrationUtilities().computeKineticEnergy(0.0);
}
void CommonIntegrateVelocityVerletStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
void CommonIntegrateNoseHooverStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
cc.initializeContexts();
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
map<string, string> defines;
defines["BOLTZ"] = cc.doubleToString(BOLTZ);
ComputeProgram program = cc.compileProgram(CommonKernelSources::velocityVerlet, defines);
kernel1 = program->createKernel("integrateVelocityVerletPart1");
kernel2 = program->createKernel("integrateVelocityVerletPart2");
kernel3 = program->createKernel("integrateVelocityVerletPart3");
kernelHardWall = program->createKernel("integrateVelocityVerletHardWall");
ComputeProgram program = cc.compileProgram(CommonKernelSources::noseHooverIntegrator, defines);
kernel1 = program->createKernel("integrateNoseHooverMiddlePart1");
kernel2 = program->createKernel("integrateNoseHooverMiddlePart2");
kernel3 = program->createKernel("integrateNoseHooverMiddlePart3");
kernel4 = program->createKernel("integrateNoseHooverMiddlePart4");
if (useDouble) {
oldDelta.initialize<mm_double4>(cc, cc.getPaddedNumAtoms(), "oldDelta");
} else {
oldDelta.initialize<mm_float4>(cc, cc.getPaddedNumAtoms(), "oldDelta");
}
kernelHardWall = program->createKernel("integrateNoseHooverHardWall");
prevMaxPairDistance = -1.0f;
maxPairDistanceBuffer.initialize<float>(cc, 1, "maxPairDistanceBuffer");
int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
defines["WORK_GROUP_SIZE"] = std::to_string(workGroupSize);
defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0};"
"for(int i=0;i<1;++i) {"
"const real ys = arr[i];";
defines["END_YS_LOOP"] = "}";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[1] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751};"
"for(int i=0;i<3;++i) {"
"const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[3] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065};"
"for(int i=0;i<5;++i) {"
"const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[5] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[7] = {0.784513610477560, 0.235573213359357, -1.17767998417887, 1.31518632068391,-1.17767998417887, 0.235573213359357, 0.784513610477560};"
"for(int i=0;i<7;++i) {"
"const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[7] = program->createKernel("propagateNoseHooverChain");
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
reduceEnergyKernel = program->createKernel("reduceEnergyPair");
computeHeatBathEnergyKernel = program->createKernel("computeHeatBathEnergy");
computeAtomsKineticEnergyKernel = program->createKernel("computeAtomsKineticEnergy");
computePairsKineticEnergyKernel = program->createKernel("computePairsKineticEnergy");
scaleAtomsVelocitiesKernel = program->createKernel("scaleAtomsVelocities");
scalePairsVelocitiesKernel = program->createKernel("scalePairsVelocities");
int energyBufferSize = cc.getEnergyBuffer().getSize();
if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision())
energyBuffer.initialize<mm_double2>(cc, energyBufferSize, "energyBuffer");
else
energyBuffer.initialize<mm_float2>(cc, energyBufferSize, "energyBuffer");
}
void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
void CommonIntegrateNoseHooverStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
IntegrationUtilities& integration = cc.getIntegrationUtilities();
int paddedNumAtoms = cc.getPaddedNumAtoms();
double dt = integrator.getStepSize();
......@@ -5700,32 +5748,39 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons
pairListBuffer.upload(tmp);
pairTemperatureBuffer.upload(tmp2);
}
int totalAtoms = cc.getNumAtoms();
if (!hasInitializedKernels) {
hasInitializedKernels = true;
kernel1->addArg(numAtoms);
kernel1->addArg(numPairs);
kernel1->addArg(paddedNumAtoms);
kernel1->addArg(cc.getIntegrationUtilities().getStepSize());
kernel1->addArg(cc.getPosq());
kernel1->addArg(cc.getVelm());
kernel1->addArg(cc.getLongForceBuffer());
kernel1->addArg(integration.getPosDelta());
kernel1->addArg(integration.getStepSize());
kernel1->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
kernel1->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
if (cc.getUseMixedPrecision())
kernel1->addArg(cc.getPosqCorrection());
kernel2->addArg(numParticles);
kernel2->addArg(cc.getIntegrationUtilities().getStepSize());
kernel2->addArg(cc.getPosq());
kernel2->addArg(totalAtoms);
kernel2->addArg(cc.getVelm());
kernel2->addArg(integration.getPosDelta());
kernel2->addArg(oldDelta);
kernel2->addArg(integration.getStepSize());
kernel3->addArg(totalAtoms);
kernel3->addArg(cc.getVelm());
kernel3->addArg(integration.getPosDelta());
kernel3->addArg(oldDelta);
kernel3->addArg(integration.getStepSize());
kernel4->addArg(totalAtoms);
kernel4->addArg(cc.getPosq());
kernel4->addArg(cc.getVelm());
kernel4->addArg(integration.getPosDelta());
kernel4->addArg(oldDelta);
kernel4->addArg(integration.getStepSize());
if (cc.getUseMixedPrecision())
kernel2->addArg(cc.getPosqCorrection());
kernel4->addArg(cc.getPosqCorrection());
if (numPairs > 0) {
kernelHardWall->addArg(numPairs);
kernelHardWall->addArg(maxPairDistanceBuffer);
kernelHardWall->addArg(cc.getIntegrationUtilities().getStepSize());
kernelHardWall->addArg(integration.getStepSize());
kernelHardWall->addArg(cc.getPosq());
kernelHardWall->addArg(cc.getVelm());
kernelHardWall->addArg(pairListBuffer);
......@@ -5733,78 +5788,50 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons
if (cc.getUseMixedPrecision())
kernelHardWall->addArg(cc.getPosqCorrection());
}
kernel3->addArg(numAtoms);
kernel3->addArg(numPairs);
kernel3->addArg(paddedNumAtoms);
kernel3->addArg(cc.getIntegrationUtilities().getStepSize());
kernel3->addArg(cc.getPosq());
kernel3->addArg(cc.getVelm());
kernel3->addArg(cc.getLongForceBuffer());
kernel3->addArg(integration.getPosDelta());
kernel3->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
kernel3->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
if (cc.getUseMixedPrecision())
kernel3->addArg(cc.getPosqCorrection());
}
/*
* Carry out the integration
* Carry out the LF-middle integration (c.f. J. Phys. Chem. A 2019, 123, 6056−6079)
*/
// Advance the velocities a half step
// Velocity update
kernel1->execute(std::max(numAtoms, numPairs));
integration.applyConstraints(integrator.getConstraintTolerance());
// Advance particle positions a full step
integration.applyVelocityConstraints(integrator.getConstraintTolerance());
// Position update
kernel2->execute(numParticles);
// Apply the thermostat
int numChains = integrator.getNumThermostats();
for(int chain = 0; chain < numChains; ++chain) {
const auto &thermostatChain = integrator.getThermostat(chain);
auto KEs = computeMaskedKineticEnergy(context, thermostatChain, false);
auto scaleFactors = propagateChain(context, thermostatChain, KEs, dt);
scaleVelocities(context, thermostatChain, scaleFactors);
}
// Position update
kernel3->execute(numParticles);
integration.applyConstraints(integrator.getConstraintTolerance());
// Apply constraint forces
kernel4->execute(numAtoms);
// Make sure any Drude-like particles have not wandered too far from home
if (numPairs > 0) kernelHardWall->execute(numPairs);
integration.computeVirtualSites();
context.calcForcesAndEnergy(true, false);
forcesAreValid = true;
// Update velocities another half step
kernel3->execute(std::max(numAtoms, numPairs));
integration.applyVelocityConstraints(integrator.getConstraintTolerance());
// Update the time and step count.
cc.setTime(cc.getTime()+dt);
cc.setStepCount(cc.getStepCount()+1);
cc.reorderAtoms();
// Reduce UI lag.
#ifdef WIN32
cc.flushQueue();
#endif
}
double CommonIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
double CommonIntegrateNoseHooverStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
return cc.getIntegrationUtilities().computeKineticEnergy(0);
}
void CommonNoseHooverChainKernel::initialize() {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
map<string, string> defines;
int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
defines["WORK_GROUP_SIZE"] = std::to_string(workGroupSize);
defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0}; for(int i=0;i<1;++i) { const real ys = arr[i];";
defines["END_YS_LOOP"] = "}";
ComputeProgram program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[1] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751}; for(int i=0;i<3;++i) { const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[3] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065}; for(int i=0;i<5;++i) { const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[5] = program->createKernel("propagateNoseHooverChain");
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
reduceEnergyKernel = program->createKernel("reduceEnergyPair");
computeHeatBathEnergyKernel = program->createKernel("computeHeatBathEnergy");
computeAtomsKineticEnergyKernel = program->createKernel("computeAtomsKineticEnergy");
computePairsKineticEnergyKernel = program->createKernel("computePairsKineticEnergy");
scaleAtomsVelocitiesKernel = program->createKernel("scaleAtomsVelocities");
scalePairsVelocitiesKernel = program->createKernel("scalePairsVelocities");
int energyBufferSize = cc.getEnergyBuffer().getSize();
if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision())
energyBuffer.initialize<mm_double2>(cc, energyBufferSize, "energyBuffer");
else
energyBuffer.initialize<mm_float2>(cc, energyBufferSize, "energyBuffer");
}
std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
std::pair<double, double> CommonIntegrateNoseHooverStepKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
int chainID = nhc.getChainID();
int nAtoms = nhc.getThermostatedAtoms().size();
......@@ -5813,12 +5840,10 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp
int numYS = nhc.getNumYoshidaSuzukiTimeSteps();
int numMTS = nhc.getNumMultiTimeSteps();
if (numYS != 1 && numYS != 3 && numYS != 5) {
throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
if (numYS != 1 && numYS != 3 && numYS != 5 && numYS != 7) {
throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, 5, or 7.");
}
auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
if (!scaleFactorBuffer.isInitialized() || scaleFactorBuffer.getSize() == 0) {
if (useDouble) {
std::vector<mm_double2> zeros{{0,0}};
......@@ -5973,15 +5998,13 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp
return {0, 0};
}
double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
double CommonIntegrateNoseHooverStepKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
int chainID = nhc.getChainID();
int chainLength = nhc.getChainLength();
auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
bool absChainIsValid = chainState.count(2*chainID) != 0 &&
chainState[2*chainID].isInitialized() &&
chainState[2*chainID].getSize() == chainLength;
......@@ -6058,7 +6081,7 @@ double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context,
return *((float*) pinnedBuffer);
}
std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
std::pair<double, double> CommonIntegrateNoseHooverStepKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
......@@ -6153,7 +6176,7 @@ std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnerg
return KEs;
}
void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
void CommonIntegrateNoseHooverStepKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
// For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been
// called before this kernel. If that ever ceases to be true, some sanity checks are needed here.
......@@ -6184,6 +6207,54 @@ void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const No
}
}
void CommonIntegrateNoseHooverStepKernel::createCheckpoint(ContextImpl& context, ostream& stream) const {
int numChains = chainState.size();
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
stream.write((char*) &numChains, sizeof(int));
for (auto& state : chainState){
int chainID = state.first;
int chainLength = state.second.getSize();
stream.write((char*) &chainID, sizeof(int));
stream.write((char*) &chainLength, sizeof(int));
if (useDouble) {
vector<mm_double2> stateVec;
state.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_double2)*chainLength);
}
else {
vector<mm_float2> stateVec;
state.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_float2)*chainLength);
}
}
}
void CommonIntegrateNoseHooverStepKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
int numChains;
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
stream.read((char*) &numChains, sizeof(int));
chainState.clear();
for (int i = 0; i < numChains; i++) {
int chainID, chainLength;
stream.read((char*) &chainID, sizeof(int));
stream.read((char*) &chainLength, sizeof(int));
if (useDouble) {
chainState[chainID] = ComputeArray();
chainState[chainID].initialize<mm_double2>(cc, chainLength, "chainState" + to_string(chainID));
vector<mm_double2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_double2)*chainLength);
chainState[chainID].upload(stateVec);
}
else {
chainState[chainID] = ComputeArray();
chainState[chainID].initialize<mm_float2>(cc, chainLength, "chainState" + to_string(chainID));
vector<mm_float2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_float2)*chainLength);
chainState[chainID].upload(stateVec);
}
}
}
void CommonIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
cc.initializeContexts();
cc.setAsCurrent();
......
......@@ -753,25 +753,6 @@ int IntegrationUtilities::prepareRandomNumbers(int numValues) {
}
void IntegrationUtilities::createCheckpoint(ostream& stream) {
int numChains = noseHooverChainState.size();
bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
stream.write((char*) &numChains, sizeof(int));
for (auto &chainState: noseHooverChainState){
int chainID = chainState.first;
int chainLength = chainState.second.getSize();
stream.write((char*) &chainID, sizeof(int));
stream.write((char*) &chainLength, sizeof(int));
if (useDouble) {
vector<mm_double2> stateVec;
chainState.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_double2)*chainLength);
}
else {
vector<mm_float2> stateVec;
chainState.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_float2)*chainLength);
}
}
if (!random.isInitialized())
return;
stream.write((char*) &randomPos, sizeof(int));
......@@ -784,29 +765,6 @@ void IntegrationUtilities::createCheckpoint(ostream& stream) {
}
void IntegrationUtilities::loadCheckpoint(istream& stream) {
int numChains;
bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
stream.read((char*) &numChains, sizeof(int));
noseHooverChainState.clear();
for (int i = 0; i < numChains; i++) {
int chainID, chainLength;
stream.read((char*) &chainID, sizeof(int));
stream.read((char*) &chainLength, sizeof(int));
if (useDouble) {
noseHooverChainState[chainID] = ComputeArray();
noseHooverChainState[chainID].initialize<mm_double2>(context, chainLength, "chainState" + to_string(chainID));
vector<mm_double2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_double2)*chainLength);
noseHooverChainState[chainID].upload(stateVec);
}
else {
noseHooverChainState[chainID] = ComputeArray();
noseHooverChainState[chainID].initialize<mm_float2>(context, chainLength, "chainState" + to_string(chainID));
vector<mm_float2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_float2)*chainLength);
noseHooverChainState[chainID].upload(stateVec);
}
}
if (!random.isInitialized())
return;
stream.read((char*) &randomPos, sizeof(int));
......
......@@ -61,8 +61,8 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
int2 pos = MAP_POS[MAPS[index]];
int size = pos.y;
real delta = 2*PI/size;
int s = (int) (angleA/delta);
int t = (int) (angleB/delta);
int s = (int) fmin(angleA/delta, size-1);
int t = (int) fmin(angleB/delta, size-1);
float4 c[4];
int coeffIndex = pos.x+4*(s+size*t);
c[0] = COEFF[coeffIndex];
......
......@@ -3,8 +3,8 @@ typedef double TempType;
typedef double3 TempType3;
typedef double4 TempType4;
#define make_TempType3(a...) make_double3(a)
#define make_TempType4(a...) make_double4(a)
#define make_TempType3(...) make_double3(__VA_ARGS__)
#define make_TempType4(...) make_double4(__VA_ARGS__)
#define convertToTempType3(a) make_double3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w)
......@@ -16,8 +16,8 @@ typedef float TempType;
typedef float3 TempType3;
typedef float4 TempType4;
#define make_TempType3(a...) make_float3(a)
#define make_TempType4(a...) make_float4(a)
#define make_TempType3(...) make_float3(__VA_ARGS__)
#define make_TempType4(...) make_float4(__VA_ARGS__)
#define convertToTempType3(a) make_float3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w)
#endif
......
......@@ -28,7 +28,7 @@ KERNEL void integrateLangevinMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT v
) {
mixed vscale = paramBuffer[VelScale];
mixed noisescale = paramBuffer[NoiseScale];
mixed halfdt = 0.5*dt[0].y;
mixed halfdt = 0.5f*dt[0].y;
int index = GLOBAL_ID;
randomIndex += index;
while (index < numAtoms) {
......
// Propagates a Nose Hoover chain a full timestep
KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL const mixed2 * RESTRICT energySum, GLOBAL mixed2* RESTRICT scaleFactor,
GLOBAL mixed* RESTRICT chainMasses, GLOBAL mixed* RESTRICT chainForces, int chainType, int chainLength, int numMTS,
int numDOFs, float timeStep, mixed kT, float frequency){
......@@ -15,30 +16,28 @@ KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL c
for (int mts = 0; mts < numMTS; ++mts) {
BEGIN_YS_LOOP
mixed wdt = ys * timeOverMTS;
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
chainData[chainLength-1].y += 0.5f * wdt * chainForces[chainLength-1];
for (int bead = chainLength - 2; bead >= 0; --bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
mixed aa = exp(-0.25f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (chainData[bead].y * aa + 0.5f * wdt * chainForces[bead]);
}
// update particle velocities
mixed aa = EXP(-0.5f * wdt * chainData[0].y);
scale *= aa;
scale *= (mixed) exp(-wdt * chainData[0].y);;
// update the thermostat positions
for (int bead = 0; bead < chainLength; ++bead) {
chainData[bead].x += 0.5f * chainData[bead].y * wdt;
chainData[bead].x += chainData[bead].y * wdt;
}
// update the forces
chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
// update thermostat velocities
for (int bead = 0; bead < chainLength - 1; ++bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
mixed aa = exp(-0.25f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (aa * chainData[bead].y + 0.5f * wdt * chainForces[bead]);
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
}
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
chainData[chainLength-1].y += 0.5f * wdt * chainForces[chainLength-1];
END_YS_LOOP
} // MTS loop
if (chainType == 0) {
scaleFactor[0].x = scale;
} else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment