Commit 0b5d58d7 authored by Charlles Abreu's avatar Charlles Abreu
Browse files

Conflict resolution in TestSplineFilter.cpp

parents 9026dbe7 b0d13582
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#if defined(__ARM__) #if defined(__ARM__) || defined(__ARM64__)
#include "vectorize_neon.h" #include "vectorize_neon.h"
#elif defined(__PPC__) #elif defined(__PPC__)
#include "vectorize_ppc.h" #include "vectorize_ppc.h"
......
...@@ -46,11 +46,23 @@ class fvec8 { ...@@ -46,11 +46,23 @@ class fvec8 {
public: public:
__m256 val; __m256 val;
fvec8() {} fvec8() = default;
fvec8(float v) : val(_mm256_set1_ps(v)) {} fvec8(float v) : val(_mm256_set1_ps(v)) {}
fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {} fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
fvec8(__m256 v) : val(v) {} fvec8(__m256 v) : val(v) {}
fvec8(const float* v) : val(_mm256_loadu_ps(v)) {} fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
/** Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec8(const float* table, const int idx[8]) {
// :TODO: Using int32_t explicitly as the index type could allow the real gather instruction to be used.
// Use gather and static assert? Conditional code?
val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]);
}
operator __m256() const { operator __m256() const {
return val; return val;
} }
...@@ -115,6 +127,12 @@ public: ...@@ -115,6 +127,12 @@ public:
return _mm256_cmp_ps(val, other, _CMP_LE_OQ); return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
} }
operator ivec8() const; operator ivec8() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static fvec8 expandBitsToMask(int bitmask);
}; };
/** /**
...@@ -160,6 +178,19 @@ inline ivec8::operator fvec8() const { ...@@ -160,6 +178,19 @@ inline ivec8::operator fvec8() const {
return _mm256_cvtepi32_ps(val); return _mm256_cvtepi32_ps(val);
} }
inline fvec8 fvec8::expandBitsToMask(int bitmask) {
// Put a copy of bit 0 in the first element, bit 1 in the second, and so on.
const auto expandedBits =
_mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi8(bitmask)),
_mm256_castsi256_ps(_mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128)));
// The individual bits are essentially extremely small floating-point values. By comparing against zero
// (even a floating-point zero), the individual bits are turned into a complete element mask.
const auto elementMask = _mm256_cmp_ps(expandedBits, __m256(), _CMP_NEQ_OQ);
return elementMask;
}
// Functions that operate on fvec8s. // Functions that operate on fvec8s.
static inline fvec8 floor(const fvec8& v) { static inline fvec8 floor(const fvec8& v) {
...@@ -208,6 +239,11 @@ static inline float dot8(const fvec8& v1, const fvec8& v2) { ...@@ -208,6 +239,11 @@ static inline float dot8(const fvec8& v1, const fvec8& v2) {
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec()); return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
} }
static inline float reduceAdd(const fvec8 v) {
// :TODO: There are more efficient ways to do this.
return dot8(v, fvec8(1.0f));
}
static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) { static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4; fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8; fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
...@@ -231,6 +267,14 @@ static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in ...@@ -231,6 +267,14 @@ static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in
out4 = _mm256_insertf128_ps(out4, i8, 1); out4 = _mm256_insertf128_ps(out4, i8, 1);
} }
/** Given a vec4[8] input array, generate 4 vec8 outputs. The first output contains all the first elements
* the second output the second elements, and so on. Note that the prototype is essentially differing only
* in output type so it can be overloaded in other SIMD fvec types.
*/
static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
}
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) { static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
out1 = in1.lowerVec(); out1 = in1.lowerVec();
out2 = in2.lowerVec(); out2 = in2.lowerVec();
...@@ -244,6 +288,13 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in ...@@ -244,6 +288,13 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
_MM_TRANSPOSE4_PS(out5, out6, out7, out8); _MM_TRANSPOSE4_PS(out5, out6, out7, out8);
} }
/**
* Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
*/
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) {
transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
}
// Functions that operate on ivec8s. // Functions that operate on ivec8s.
static inline bool any(const ivec8& v) { static inline bool any(const ivec8& v) {
...@@ -268,10 +319,100 @@ static inline fvec8 operator/(float v1, const fvec8& v2) { ...@@ -268,10 +319,100 @@ static inline fvec8 operator/(float v1, const fvec8& v2) {
return fvec8(v1)/v2; return fvec8(v1)/v2;
} }
// Operations for blending fvec8s based on an ivec8. // Operation for blending fvec8 from a full bitmask.
static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) {
return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
}
static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
return blend(0.0f, v, mask);
}
static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const ivec8& mask) { /**
return fvec8(_mm256_blendv_ps(v1.val, v2.val, _mm256_castsi256_ps(mask.val))); * Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) {
const auto lower = index.lowerVec();
const auto upper = index.upperVec();
// Gather all the separate memory data together. Each vector will have two values
// which get used, and two which are ultimately discarded.
fvec4 t0(table + lower[0]);
fvec4 t1(table + lower[1]);
fvec4 t2(table + lower[2]);
fvec4 t3(table + lower[3]);
fvec4 t4(table + upper[0]);
fvec4 t5(table + upper[1]);
fvec4 t6(table + upper[2]);
fvec4 t7(table + upper[3]);
// Tranposing the 8 vectors above will put all the first elements into one output
// vector, all the second elements into the next vector and so on.
fvec8 discard0, discard1;
transpose(t0, t1, t2, t3, t4, t5, t6, t7, out0, out1, discard0, discard1);
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3 X4 X5 X6 X7
* Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
* Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + ...)
* output[1] = (Y0 + Y1 + Y2 + ...)
* output[2] = (Z0 + Z1 + Z2 + ...)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
// The general strategy for a vector reduce-add operation is to take values from
// different parts of the vector and overlap them a different part of the vector and then
// add together. Repeat this several times until all values have been summed. Initially 8
// values can be reduced to 4, 4 to 2, and 2 to 1. The following code essentially does this
// but exploits two things:
// - having multiple inputs means that some vectors can be combined together to amortise the
// cost of shuffling.
// - the output destinations are part of anther vector, so accumulate into the correct
// offsets to start with, instead of reducing to position 0 and re-inserting to the correct
// output location.
//
// As far as possible, accumulate x, y and z into their output positions in both the top and
// bottom 128-bits to exploit in-lane permutes as much as possible early on.
// Shuffle X and Z together to form one reduced vector.
// X2 X3 Z0 Z1 X6 X7 Z4 Z5
const auto xzshuf = _mm256_shuffle_ps(x, z, 0b01001110);
// Blend X and Z together to form another reduced vector, overlapping the previous.
// X0 X1 Z2 Z3 X4 X5 Z6 Z7
const auto xzblend = _mm256_blend_ps(x, z, 0b11001100);
// Add them together to form:
// (X0 + X2) (X1 + X3) (Z0 + Z2) (Z1 + Z3) etc.
const auto xz0 = _mm256_add_ps(xzshuf, xzblend);
// Now there's only one vector containing all values. Shuffle again to form another overlap,
// and then add.
const auto xz1 = _mm256_permute_ps(xz0, 0b00110001);
const auto xz2 = _mm256_add_ps(xz0, xz1);
// Work on Z on its own as there's nothing else to work with. Start by permuting it to
// form some overlaps, and then add:
// (Y0 + Y2) (Y1 + Y3) - - (Y4 + Y6) (Y5 + Y7) - -
const auto yshuf = _mm256_permute_ps(y, 0b11101110);
const auto y0 = _mm256_add_ps(yshuf, y);
// Shift the bottom float of each pair to the right, into the correct Y location.
const auto y1 = _mm256_permute_ps(y0, 0b00000000);
const auto y2 = _mm256_add_ps(y0, y1);
// Blend the results together to give a complete set of XYZ in the correct respective positions
// of both top and bottom 128-bit lanes.
const auto laneResult = fvec8(_mm256_blend_ps(xz2, y2, 0b00100010));
return laneResult.lowerVec() + laneResult.upperVec();
} }
#endif /*OPENMM_VECTORIZE8_H_*/ #endif /*OPENMM_VECTORIZE8_H_*/
...@@ -32,7 +32,12 @@ ...@@ -32,7 +32,12 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#ifdef __ANDROID__
#include <cpu-features.h> #include <cpu-features.h>
#else
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
#include <arm_neon.h> #include <arm_neon.h>
#include <cmath> #include <cmath>
...@@ -48,8 +53,16 @@ float32x4_t log_ps(float32x4_t); ...@@ -48,8 +53,16 @@ float32x4_t log_ps(float32x4_t);
* Determine whether ivec4 and fvec4 are supported on this processor. * Determine whether ivec4 and fvec4 are supported on this processor.
*/ */
static bool isVec4Supported() { static bool isVec4Supported() {
#ifdef __ANDROID__
uint64_t features = android_getCpuFeatures(); uint64_t features = android_getCpuFeatures();
return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0; return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
#elif defined(__ARM__)
unsigned long features = getauxval(AT_HWCAP);
return (features & HWCAP_NEON) != 0;
#else
unsigned long features = getauxval(AT_HWCAP);
return (features & HWCAP_ASIMD) != 0;
#endif
} }
class ivec4; class ivec4;
...@@ -61,7 +74,7 @@ class fvec4 { ...@@ -61,7 +74,7 @@ class fvec4 {
public: public:
float32x4_t val; float32x4_t val;
fvec4() {} fvec4() = default;
fvec4(float v) : val(vdupq_n_f32(v)) {} fvec4(float v) : val(vdupq_n_f32(v)) {}
fvec4(float v1, float v2, float v3, float v4) { fvec4(float v1, float v2, float v3, float v4) {
float v[] = {v1, v2, v3, v4}; float v[] = {v1, v2, v3, v4};
...@@ -72,6 +85,16 @@ public: ...@@ -72,6 +85,16 @@ public:
operator float32x4_t() const { operator float32x4_t() const {
return val; return val;
} }
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
float operator[](int i) const { float operator[](int i) const {
switch (i) { switch (i) {
case 0: case 0:
...@@ -88,6 +111,16 @@ public: ...@@ -88,6 +111,16 @@ public:
void store(float* v) const { void store(float* v) const {
vst1q_f32(v, val); vst1q_f32(v, val);
} }
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
v[0] = vgetq_lane_f32(val, 0);
v[1] = vgetq_lane_f32(val, 1);
v[2] = vgetq_lane_f32(val, 2);
}
fvec4 operator+(const fvec4& other) const { fvec4 operator+(const fvec4& other) const {
return vaddq_f32(val, other); return vaddq_f32(val, other);
} }
...@@ -146,6 +179,13 @@ public: ...@@ -146,6 +179,13 @@ public:
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other))); return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
} }
operator ivec4() const; operator ivec4() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static ivec4 expandBitsToMask(int bitmask);
}; };
/** /**
...@@ -241,6 +281,12 @@ inline ivec4::operator fvec4() const { ...@@ -241,6 +281,12 @@ inline ivec4::operator fvec4() const {
return fvec4(vcvtq_f32_s32(val)); return fvec4(vcvtq_f32_s32(val));
} }
inline ivec4 fvec4::expandBitsToMask(int bitmask) {
return ivec4(bitmask & 1 ? -1 : 0,
bitmask & 2 ? -1 : 0,
bitmask & 4 ? -1 : 0,
bitmask & 8 ? -1 : 0);
}
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
...@@ -284,6 +330,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) { ...@@ -284,6 +330,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3); return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
} }
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
return fvec4(v1[1]*v2[2] - v1[2]*v2[1], return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
v1[2]*v2[0] - v1[0]*v2[2], v1[2]*v2[0] - v1[0]*v2[2],
...@@ -301,6 +351,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) { ...@@ -301,6 +351,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
v4 = t4.val[1]; v4 = t4.val[1];
} }
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
...@@ -343,6 +409,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) { ...@@ -343,6 +409,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1); return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
} }
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
return blend(0.0f, v, mask);
}
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(const fvec4& v) {
...@@ -361,4 +431,38 @@ static inline fvec4 ceil(const fvec4& v) { ...@@ -361,4 +431,38 @@ static inline fvec4 ceil(const fvec4& v) {
return rounded + blend(0.0f, 1.0f, rounded<v); return rounded + blend(0.0f, 1.0f, rounded<v);
} }
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
}
#endif /*OPENMM_VECTORIZE_NEON_H_*/ #endif /*OPENMM_VECTORIZE_NEON_H_*/
...@@ -56,7 +56,7 @@ class fvec4 { ...@@ -56,7 +56,7 @@ class fvec4 {
public: public:
__m128 val; __m128 val;
fvec4() {} fvec4() = default;
fvec4(float v) { fvec4(float v) {
val = {v, v, v, v}; val = {v, v, v, v};
} }
...@@ -67,6 +67,16 @@ public: ...@@ -67,6 +67,16 @@ public:
fvec4(const float* v) { fvec4(const float* v) {
val = *((__m128*) v); val = *((__m128*) v);
} }
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const { operator __m128() const {
return val; return val;
} }
...@@ -76,6 +86,15 @@ public: ...@@ -76,6 +86,15 @@ public:
void store(float* v) const { void store(float* v) const {
*((__m128*) v) = val; *((__m128*) v) = val;
} }
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
v[0] = val[0];
v[1] = val[1];
v[2] = val[2];
}
fvec4 operator+(const fvec4& other) const { fvec4 operator+(const fvec4& other) const {
return val+other; return val+other;
} }
...@@ -116,6 +135,13 @@ public: ...@@ -116,6 +135,13 @@ public:
ivec4 operator>=(const fvec4& other) const; ivec4 operator>=(const fvec4& other) const;
ivec4 operator<=(const fvec4& other) const; ivec4 operator<=(const fvec4& other) const;
operator ivec4() const; operator ivec4() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static ivec4 expandBitsToMask(int bitmask);
}; };
/** /**
...@@ -227,6 +253,13 @@ inline ivec4::operator fvec4() const { ...@@ -227,6 +253,13 @@ inline ivec4::operator fvec4() const {
return __builtin_convertvector(val, __m128); return __builtin_convertvector(val, __m128);
} }
inline ivec4 fvec4::expandBitsToMask(int bitmask) {
return ivec4(bitmask & 1 ? -1 : 0,
bitmask & 2 ? -1 : 0,
bitmask & 4 ? -1 : 0,
bitmask & 8 ? -1 : 0);
}
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(const fvec4& v) {
...@@ -252,6 +285,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) { ...@@ -252,6 +285,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return temp[0]+temp[1]; return temp[0]+temp[1];
} }
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
__m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) - __m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3); v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
...@@ -269,6 +306,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) { ...@@ -269,6 +306,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
v4 = __builtin_shufflevector(a2, a4, 2, 3, 6, 7); v4 = __builtin_shufflevector(a2, a4, 2, 3, 6, 7);
} }
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
...@@ -312,6 +365,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) ...@@ -312,6 +365,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1)); return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
} }
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
return blend(0.0f, v, mask);
}
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
...@@ -358,5 +415,40 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -358,5 +415,40 @@ static inline fvec4 sqrt(const fvec4& v) {
return rsqrt(v)*v; return rsqrt(v)*v;
} }
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
}
#endif /*OPENMM_VECTORIZE_PNACL_H_*/ #endif /*OPENMM_VECTORIZE_PNACL_H_*/
...@@ -57,7 +57,7 @@ class fvec4 { ...@@ -57,7 +57,7 @@ class fvec4 {
public: public:
__m128 val; __m128 val;
fvec4() {} fvec4() = default;
fvec4(float v) { fvec4(float v) {
val = (__m128) {v, v, v, v}; val = (__m128) {v, v, v, v};
} }
...@@ -68,6 +68,16 @@ public: ...@@ -68,6 +68,16 @@ public:
fvec4(const float* v) { fvec4(const float* v) {
val = *((__m128*) v); val = *((__m128*) v);
} }
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const { operator __m128() const {
return val; return val;
} }
...@@ -77,6 +87,16 @@ public: ...@@ -77,6 +87,16 @@ public:
void store(float* v) const { void store(float* v) const {
*((__m128*) v) = val; *((__m128*) v) = val;
} }
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
v[0] = val[0];
v[1] = val[1];
v[2] = val[2];
}
fvec4 operator+(const fvec4& other) const { fvec4 operator+(const fvec4& other) const {
return vec_add(val, other.val); return vec_add(val, other.val);
} }
...@@ -117,6 +137,13 @@ public: ...@@ -117,6 +137,13 @@ public:
ivec4 operator>=(const fvec4& other) const; ivec4 operator>=(const fvec4& other) const;
ivec4 operator<=(const fvec4& other) const; ivec4 operator<=(const fvec4& other) const;
operator ivec4() const; operator ivec4() const;
/***
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static ivec4 expandBitsToMask(int bitmask);
}; };
/** /**
...@@ -228,6 +255,13 @@ inline ivec4::operator fvec4() const { ...@@ -228,6 +255,13 @@ inline ivec4::operator fvec4() const {
return (__m128) {(float)val[0], (float)val[1], (float)val[2], (float)val[3]}; return (__m128) {(float)val[0], (float)val[1], (float)val[2], (float)val[3]};
} }
inline ivec4 fvec4::expandBitsToMask(int bitmask) {
return ivec4(bitmask & 1 ? -1 : 0,
bitmask & 2 ? -1 : 0,
bitmask & 4 ? -1 : 0,
bitmask & 8 ? -1 : 0);
}
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(const fvec4& v) {
...@@ -253,6 +287,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) { ...@@ -253,6 +287,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return temp[0]+temp[1]; return temp[0]+temp[1];
} }
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15}; vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
__m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) - __m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
...@@ -275,6 +313,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) { ...@@ -275,6 +313,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
v4 = vec_perm(a2, a4, perm4); v4 = vec_perm(a2, a4, perm4);
} }
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
...@@ -289,8 +343,8 @@ static inline ivec4 abs(const ivec4& v) { ...@@ -289,8 +343,8 @@ static inline ivec4 abs(const ivec4& v) {
return vec_abs(v.val); return vec_abs(v.val);
} }
static inline bool any(const __m128i& v) { static inline bool any(const ivec4 v) {
return !vec_all_eq(v, ivec4(0).val); return !vec_all_eq(v.val, ivec4(0).val);
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
...@@ -317,6 +371,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) ...@@ -317,6 +371,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val); return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
} }
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
return blend(0.0f, v, mask);
}
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
...@@ -355,5 +413,39 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -355,5 +413,39 @@ static inline fvec4 sqrt(const fvec4& v) {
return vec_sqrt(v.val); return vec_sqrt(v.val);
} }
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
}
#endif /*OPENMM_VECTORIZE_PPC_H_*/ #endif /*OPENMM_VECTORIZE_PPC_H_*/
...@@ -32,7 +32,12 @@ ...@@ -32,7 +32,12 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#ifdef __AVX__
#include <immintrin.h>
#else
#include <smmintrin.h> #include <smmintrin.h>
#endif
#include "hardware.h" #include "hardware.h"
// This file defines classes and functions to simplify vectorizing code with SSE. // This file defines classes and functions to simplify vectorizing code with SSE.
...@@ -63,11 +68,21 @@ class fvec4 { ...@@ -63,11 +68,21 @@ class fvec4 {
public: public:
__m128 val; __m128 val;
fvec4() {} fvec4() = default;
fvec4(float v) : val(_mm_set1_ps(v)) {} fvec4(float v) : val(_mm_set1_ps(v)) {}
fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {} fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
fvec4(__m128 v) : val(v) {} fvec4(__m128 v) : val(v) {}
fvec4(const float* v) : val(_mm_loadu_ps(v)) {} fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
/**
* Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvec4(const float* table, const int idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const { operator __m128() const {
return val; return val;
} }
...@@ -79,6 +94,20 @@ public: ...@@ -79,6 +94,20 @@ public:
void store(float* v) const { void store(float* v) const {
_mm_storeu_ps(v, val); _mm_storeu_ps(v, val);
} }
/**
* Store only the lower three elements of the vector.
*/
void storeVec3(float* v) const {
// This code could be called from objects compiled for better SIMD domains (e.g., AVX) so conditionally
// compile in the most efficient variant of the instruction.
#ifdef __AVX__
_mm_maskstore_ps(v, _mm_setr_epi32(-1, -1, -1, 0), val);
#else
_mm_maskmoveu_si128 (_mm_castps_si128(val), _mm_setr_epi32(-1, -1, -1, 0), (char*)v);
#endif
}
fvec4 operator+(const fvec4& other) const { fvec4 operator+(const fvec4& other) const {
return _mm_add_ps(val, other); return _mm_add_ps(val, other);
} }
...@@ -131,6 +160,12 @@ public: ...@@ -131,6 +160,12 @@ public:
return _mm_cmple_ps(val, other); return _mm_cmple_ps(val, other);
} }
operator ivec4() const; operator ivec4() const;
/**
* Convert an integer bitmask into a full vector of elements which can be used
* by the blend function.
*/
static fvec4 expandBitsToMask(int bitmask);
}; };
/** /**
...@@ -214,6 +249,13 @@ inline ivec4::operator fvec4() const { ...@@ -214,6 +249,13 @@ inline ivec4::operator fvec4() const {
return _mm_cvtepi32_ps(val); return _mm_cvtepi32_ps(val);
} }
inline fvec4 fvec4::expandBitsToMask(int bitmask) {
// Not optimal for SSE (see AVX implementation for better version)
// but useful as an example for other SIMD architectures.
const auto values = fvec4(bitmask & 1, bitmask & 2, bitmask & 4, bitmask & 8);
return values != fvec4(0.0f);
}
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(const fvec4& v) {
...@@ -273,6 +315,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) { ...@@ -273,6 +315,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1)); return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
} }
static inline float reduceAdd(const fvec4 v) {
return dot4(v, fvec4(1.0f));
}
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) - fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1)))); fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
...@@ -283,6 +329,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) { ...@@ -283,6 +329,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
_MM_TRANSPOSE4_PS(v1, v2, v3, v4); _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
} }
/**
* Out-of-place transpose from an array into named variables.
*/
static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
transpose(v0, v1, v2, v3);
}
/**
* Out-of-place transpose from named variables into an array.
*/
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]);
}
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
...@@ -319,10 +381,48 @@ static inline fvec4 operator/(float v1, const fvec4& v2) { ...@@ -319,10 +381,48 @@ static inline fvec4 operator/(float v1, const fvec4& v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4s based on an ivec4. // Operations for blending fvec4
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
}
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) { static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val))); return blend(0.0f, v, mask);
}
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]);
fvec4 t3(table + index[3]);
transpose(t0, t1, t2, t3);
out0 = t0;
out1 = t1;
}
/**
* Given 3 vectors of floating-point data, reduce them to a single 3-element position
* value by adding all the elements in each vector. Given inputs of:
* X0 X1 X2 X3
* Y0 Y1 Y2 Y3
* Z0 Z1 Z2 Z3
* Each vector of values needs to be summed into a single value, and then stored into
* the output vector:
* output[0] = (X0 + X1 + X2 + X3)
* output[1] = (Y0 + Y1 + Y2 + Y3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
*/
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
// :TODO: Could be made more efficient.
const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z);
return fvec4(nx, ny, nz, 0.0);
} }
#endif /*OPENMM_VECTORIZE_SSE_H_*/ #endif /*OPENMM_VECTORIZE_SSE_H_*/
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2015-2016 Stanford University and the Authors. * * Portions copyright (c) 2015-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -128,3 +128,15 @@ void CompoundIntegrator::stateChanged(State::DataType changed) { ...@@ -128,3 +128,15 @@ void CompoundIntegrator::stateChanged(State::DataType changed) {
double CompoundIntegrator::computeKineticEnergy() { double CompoundIntegrator::computeKineticEnergy() {
return integrators[currentIntegrator]->computeKineticEnergy(); return integrators[currentIntegrator]->computeKineticEnergy();
} }
void CompoundIntegrator::createCheckpoint(std::ostream& stream) const {
stream.write((char*) &currentIntegrator, sizeof(int));
for (int i = 0; i < integrators.size(); i++)
integrators[i]->createCheckpoint(stream);
}
void CompoundIntegrator::loadCheckpoint(std::istream& stream) {
stream.read((char*) &currentIntegrator, sizeof(int));
for (int i = 0; i < integrators.size(); i++)
integrators[i]->loadCheckpoint(stream);
}
...@@ -454,6 +454,7 @@ void ContextImpl::createCheckpoint(ostream& stream) { ...@@ -454,6 +454,7 @@ void ContextImpl::createCheckpoint(ostream& stream) {
stream.write((char*) &param.second, sizeof(double)); stream.write((char*) &param.second, sizeof(double));
} }
updateStateDataKernel.getAs<UpdateStateDataKernel>().createCheckpoint(*this, stream); updateStateDataKernel.getAs<UpdateStateDataKernel>().createCheckpoint(*this, stream);
integrator.createCheckpoint(stream);
stream.flush(); stream.flush();
} }
...@@ -480,6 +481,7 @@ void ContextImpl::loadCheckpoint(istream& stream) { ...@@ -480,6 +481,7 @@ void ContextImpl::loadCheckpoint(istream& stream) {
parameters[name] = value; parameters[name] = value;
} }
updateStateDataKernel.getAs<UpdateStateDataKernel>().loadCheckpoint(*this, stream); updateStateDataKernel.getAs<UpdateStateDataKernel>().loadCheckpoint(*this, stream);
integrator.loadCheckpoint(stream);
hasSetPositions = true; hasSetPositions = true;
integrator.stateChanged(State::Positions); integrator.stateChanged(State::Positions);
integrator.stateChanged(State::Velocities); integrator.stateChanged(State::Velocities);
......
...@@ -114,6 +114,31 @@ bool CustomIntegrator::kineticEnergyRequiresForce() const { ...@@ -114,6 +114,31 @@ bool CustomIntegrator::kineticEnergyRequiresForce() const {
return keNeedsForce; return keNeedsForce;
} }
void CustomIntegrator::createCheckpoint(std::ostream& stream) const {
for (int i = 0; i < getNumGlobalVariables(); i++) {
double value = getGlobalVariable(i);
stream.write((char*) &value, sizeof(double));
}
vector<Vec3> values;
for (int i = 0; i < getNumPerDofVariables(); i++) {
getPerDofVariable(i, values);
stream.write((char*) values.data(), sizeof(Vec3)*values.size());
}
}
void CustomIntegrator::loadCheckpoint(std::istream& stream) {
double value;
for (int i = 0; i < getNumGlobalVariables(); i++) {
stream.read((char*) &value, sizeof(double));
setGlobalVariable(i, value);
}
vector<Vec3> values(context->getSystem().getNumParticles());
for (int i = 0; i < getNumPerDofVariables(); i++) {
stream.read((char*) values.data(), sizeof(Vec3)*values.size());
setPerDofVariable(i, values);
}
}
void CustomIntegrator::step(int steps) { void CustomIntegrator::step(int steps) {
if (context == NULL) if (context == NULL)
throw OpenMMException("This Integrator is not bound to a context!"); throw OpenMMException("This Integrator is not bound to a context!");
......
...@@ -55,7 +55,10 @@ std::vector<double> NoseHooverChain::getYoshidaSuzukiWeights() const { ...@@ -55,7 +55,10 @@ std::vector<double> NoseHooverChain::getYoshidaSuzukiWeights() const {
case 5: case 5:
return {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, return {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065,
0.2967324292201065}; 0.2967324292201065};
case 7:
return {0.784513610477560, 0.235573213359357, -1.17767998417887, 1.31518632068391,
-1.17767998417887, 0.235573213359357, 0.784513610477560};
default: default:
throw OpenMMException("The number of Yoshida-Suzuki weights must be 1,3, or 5."); throw OpenMMException("The number of Yoshida-Suzuki weights must be 1,3,5, or 7.");
} }
} }
...@@ -68,11 +68,6 @@ NoseHooverIntegrator::NoseHooverIntegrator(double temperature, double collisionF ...@@ -68,11 +68,6 @@ NoseHooverIntegrator::NoseHooverIntegrator(double temperature, double collisionF
NoseHooverIntegrator::~NoseHooverIntegrator() {} NoseHooverIntegrator::~NoseHooverIntegrator() {}
std::pair<double, double> NoseHooverIntegrator::propagateChain(std::pair<double, double> kineticEnergy, int chainID) {
return nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, noseHooverChains.at(chainID), kineticEnergy, getStepSize());
}
int NoseHooverIntegrator::addThermostat(double temperature, double collisionFrequency, int NoseHooverIntegrator::addThermostat(double temperature, double collisionFrequency,
int chainLength, int numMTS, int numYoshidaSuzuki) { int chainLength, int numMTS, int numYoshidaSuzuki) {
hasSubsystemThermostats_ = false; hasSubsystemThermostats_ = false;
...@@ -271,10 +266,10 @@ double NoseHooverIntegrator::computeKineticEnergy() { ...@@ -271,10 +266,10 @@ double NoseHooverIntegrator::computeKineticEnergy() {
double kE = 0.0; double kE = 0.0;
if(noseHooverChains.size() > 0) { if(noseHooverChains.size() > 0) {
for (const auto &nhc: noseHooverChains){ for (const auto &nhc: noseHooverChains){
kE += nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, true).first; kE += kernel.getAs<IntegrateNoseHooverStepKernel>().computeMaskedKineticEnergy(*context, nhc, true).first;
} }
} else { } else {
kE = vvKernel.getAs<IntegrateVelocityVerletStepKernel>().computeKineticEnergy(*context, *this); kE = kernel.getAs<IntegrateNoseHooverStepKernel>().computeKineticEnergy(*context, *this);
} }
return kE; return kE;
} }
...@@ -287,7 +282,7 @@ double NoseHooverIntegrator::computeHeatBathEnergy() { ...@@ -287,7 +282,7 @@ double NoseHooverIntegrator::computeHeatBathEnergy() {
double energy = 0; double energy = 0;
for(auto &nhc : noseHooverChains) { for(auto &nhc : noseHooverChains) {
if (context && (nhc.getNumDegreesOfFreedom() > 0)) { if (context && (nhc.getNumDegreesOfFreedom() > 0)) {
energy += nhcKernel.getAs<NoseHooverChainKernel>().computeHeatBathEnergy(*context, nhc); energy += kernel.getAs<IntegrateNoseHooverStepKernel>().computeHeatBathEnergy(*context, nhc);
} }
} }
return energy; return energy;
...@@ -300,10 +295,8 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) { ...@@ -300,10 +295,8 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) {
context = &contextRef; context = &contextRef;
const System& system = context->getSystem(); const System& system = context->getSystem();
owner = &contextRef.getOwner(); owner = &contextRef.getOwner();
vvKernel = context->getPlatform().createKernel(IntegrateVelocityVerletStepKernel::Name(), contextRef); kernel = context->getPlatform().createKernel(IntegrateNoseHooverStepKernel::Name(), contextRef);
vvKernel.getAs<IntegrateVelocityVerletStepKernel>().initialize(contextRef.getSystem(), *this); kernel.getAs<IntegrateNoseHooverStepKernel>().initialize(contextRef.getSystem(), *this);
nhcKernel = context->getPlatform().createKernel(NoseHooverChainKernel::Name(), contextRef);
nhcKernel.getAs<NoseHooverChainKernel>().initialize();
forcesAreValid = false; forcesAreValid = false;
// check for drude particles and build the Nose-Hoover Chains // check for drude particles and build the Nose-Hoover Chains
...@@ -329,34 +322,30 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) { ...@@ -329,34 +322,30 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) {
} }
void NoseHooverIntegrator::cleanup() { void NoseHooverIntegrator::cleanup() {
vvKernel = Kernel(); kernel = Kernel();
nhcKernel = Kernel();
} }
vector<string> NoseHooverIntegrator::getKernelNames() { vector<string> NoseHooverIntegrator::getKernelNames() {
std::vector<std::string> names; std::vector<std::string> names;
names.push_back(NoseHooverChainKernel::Name()); names.push_back(IntegrateNoseHooverStepKernel::Name());
names.push_back(IntegrateVelocityVerletStepKernel::Name());
return names; return names;
} }
void NoseHooverIntegrator::step(int steps) { void NoseHooverIntegrator::step(int steps) {
if (context == NULL) if (context == NULL)
throw OpenMMException("This Integrator is not bound to a context!"); throw OpenMMException("This Integrator is not bound to a context!");
std::pair<double, double> scale, kineticEnergy;
for (int i = 0; i < steps; ++i) { for (int i = 0; i < steps; ++i) {
if(context->updateContextState()) if(context->updateContextState())
forcesAreValid = false; forcesAreValid = false;
for(auto &nhc : noseHooverChains) { context->calcForcesAndEnergy(true, false);
kineticEnergy = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, false); kernel.getAs<IntegrateNoseHooverStepKernel>().execute(*context, *this, forcesAreValid);
scale = nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, nhc, kineticEnergy, getStepSize());
nhcKernel.getAs<NoseHooverChainKernel>().scaleVelocities(*context, nhc, scale);
}
vvKernel.getAs<IntegrateVelocityVerletStepKernel>().execute(*context, *this, forcesAreValid);
for(auto &nhc : noseHooverChains) {
kineticEnergy = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, false);
scale = nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, nhc, kineticEnergy, getStepSize());
nhcKernel.getAs<NoseHooverChainKernel>().scaleVelocities(*context, nhc, scale);
}
} }
} }
void NoseHooverIntegrator::createCheckpoint(std::ostream& stream) const {
kernel.getAs<IntegrateNoseHooverStepKernel>().createCheckpoint(*context, stream);
}
void NoseHooverIntegrator::loadCheckpoint(std::istream& stream) {
kernel.getAs<IntegrateNoseHooverStepKernel>().loadCheckpoint(*context, stream);
}
\ No newline at end of file
...@@ -86,8 +86,8 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do ...@@ -86,8 +86,8 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
// Create the system of equations to solve. // Create the system of equations to solve.
vector<double> a(n), b(n), c(n), rhs(n); vector<double> a(n-1), b(n-1), c(n-1), rhs(n-1);
a[0] = 0.0; a[0] = x[n-1]-x[n-2];
b[0] = 2.0*(x[1]-x[0]+x[n-1]-x[n-2]); b[0] = 2.0*(x[1]-x[0]+x[n-1]-x[n-2]);
c[0] = x[1]-x[0]; c[0] = x[1]-x[0];
rhs[0] = 6.0*((y[1]-y[0])/(x[1]-x[0]) - (y[n-1]-y[n-2])/(x[n-1]-x[n-2])); rhs[0] = 6.0*((y[1]-y[0])/(x[1]-x[0]) - (y[n-1]-y[n-2])/(x[n-1]-x[n-2]));
...@@ -97,17 +97,14 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do ...@@ -97,17 +97,14 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
c[i] = x[i+1]-x[i]; c[i] = x[i+1]-x[i];
rhs[i] = 6.0*((y[i+1]-y[i])/(x[i+1]-x[i]) - (y[i]-y[i-1])/(x[i]-x[i-1])); rhs[i] = 6.0*((y[i+1]-y[i])/(x[i+1]-x[i]) - (y[i]-y[i-1])/(x[i]-x[i-1]));
} }
a[n-1] = 0.0; double beta = a[0];
b[n-1] = 1.0; double alpha = c[n-2];
c[n-1] = 0.0;
rhs[n-1] = 0.0;
double beta = x[n-1]-x[n-2];
double alpha = -1.0;
double gamma = -b[0]; double gamma = -b[0];
// This is a cyclic tridiagonal matrix. We solve it using the Sherman-Morrison method, // This is a cyclic tridiagonal matrix. We solve it using the Sherman-Morrison method,
// which involves solving two tridiagonal systems. // which involves solving two tridiagonal systems.
n--;
b[0] -= gamma; b[0] -= gamma;
b[n-1] -= alpha*beta/gamma; b[n-1] -= alpha*beta/gamma;
solveTridiagonalMatrix(a, b, c, rhs, deriv); solveTridiagonalMatrix(a, b, c, rhs, deriv);
...@@ -118,6 +115,7 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do ...@@ -118,6 +115,7 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
double scale = (deriv[0]+beta*deriv[n-1]/gamma)/(1.0+z[0]+beta*z[n-1]/gamma); double scale = (deriv[0]+beta*deriv[n-1]/gamma)/(1.0+z[0]+beta*z[n-1]/gamma);
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
deriv[i] -= scale*z[i]; deriv[i] -= scale*z[i];
deriv[n] = deriv[0];
} }
double SplineFitter::evaluateSpline(const vector<double>& x, const vector<double>& y, const vector<double>& deriv, double t) { double SplineFitter::evaluateSpline(const vector<double>& x, const vector<double>& y, const vector<double>& deriv, double t) {
......
...@@ -944,11 +944,13 @@ private: ...@@ -944,11 +944,13 @@ private:
/* /*
* This kernel is invoked by NoseHooverIntegrator to take one time step. * This kernel is invoked by NoseHooverIntegrator to take one time step.
*/ */
class CommonIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel { class CommonIntegrateNoseHooverStepKernel : public IntegrateNoseHooverStepKernel {
public: public:
CommonIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, ComputeContext& cc) : CommonIntegrateNoseHooverStepKernel(std::string name, const Platform& platform, ComputeContext& cc) :
IntegrateVelocityVerletStepKernel(name, platform), cc(cc), hasInitializedKernels(false) { } IntegrateNoseHooverStepKernel(name, platform), cc(cc), hasInitializedKernels(false),
~CommonIntegrateVelocityVerletStepKernel() {} hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
hasInitializedScaleVelocitiesKernel(false), hasInitializedPropagateKernel(false) {}
~CommonIntegrateNoseHooverStepKernel() {}
/** /**
* Initialize the kernel. * Initialize the kernel.
* *
...@@ -972,39 +974,16 @@ public: ...@@ -972,39 +974,16 @@ public:
* @param integrator the NoseHooverIntegrator this kernel is being used for * @param integrator the NoseHooverIntegrator this kernel is being used for
*/ */
double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator); double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
private:
ComputeContext& cc;
float prevMaxPairDistance;
ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer;
ComputeKernel kernel1, kernel2, kernel3, kernelHardWall;
bool hasInitializedKernels;
};
/**
* This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
* and update the associated particle velocities.
*/
class CommonNoseHooverChainKernel : public NoseHooverChainKernel {
public:
CommonNoseHooverChainKernel(std::string name, const Platform& platform, ComputeContext& cc) :
NoseHooverChainKernel(name, platform), cc(cc), hasInitializedPropagateKernel(false),
hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
hasInitializedScaleVelocitiesKernel(false) {}
~CommonNoseHooverChainKernel() {}
/**
* Initialize the kernel.
*/
void initialize();
/** /**
* Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor. * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
* *
* @param context the context in which to execute this kernel * @param context the context in which to execute this kernel
* @param noseHooverChain the object describing the chain to be propagated. * @param noseHooverChain the object describing the chain to be propagated.
* @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain. * @param kineticEnergy the {center of mass, relative} kineticEnergies of the particles being thermostated by this chain.
* @param timeStep the time step used by the integrator. * @param timeStep the time step used by the integrator.
* @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath. * @return the velocity scale factor to apply to the particles associated with this heat bath.
*/ */
std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep); std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> kineticEnergy, double timeStep);
/** /**
* Execute the kernal that computes the total (kinetic + potential) heat bath energy. * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
* *
...@@ -1012,7 +991,7 @@ public: ...@@ -1012,7 +991,7 @@ public:
* @param noseHooverChain the chain whose energy is to be determined. * @param noseHooverChain the chain whose energy is to be determined.
* @return the total heat bath energy. * @return the total heat bath energy.
*/ */
double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc); double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain);
/** /**
* Execute the kernel that computes the kinetic energy for a subset of atoms, * Execute the kernel that computes the kinetic energy for a subset of atoms,
* or the relative kinetic energy of Drude particles with respect to their parent atoms * or the relative kinetic energy of Drude particles with respect to their parent atoms
...@@ -1020,22 +999,37 @@ public: ...@@ -1020,22 +999,37 @@ public:
* @param context the context in which to execute this kernel * @param context the context in which to execute this kernel
* @param noseHooverChain the chain whose energy is to be determined. * @param noseHooverChain the chain whose energy is to be determined.
* @param downloadValue whether the computed value should be downloaded and returned. * @param downloadValue whether the computed value should be downloaded and returned.
*
*/ */
std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue); std::pair<double, double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
/** /**
* Execute the kernel that scales the velocities of particles associated with a nose hoover chain * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
* *
* @param context the context in which to execute this kernel * @param context the context in which to execute this kernel
* @param noseHooverChain the chain whose energy is to be determined. * @param noseHooverChain the chain whose energy is to be determined.
* @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled. * @param scaleFactor the multiplicative factor by which {absolute, relative} velocities are scaled.
*/ */
void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors); void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactor);
/**
* Write the chain states to a checkpoint.
*/
void createCheckpoint(ContextImpl& context, std::ostream& stream) const;
/**
* Load the chain states from a checkpoint.
*/
void loadCheckpoint(ContextImpl& context, std::istream& stream);
private: private:
int sumWorkGroupSize;
ComputeContext& cc; ComputeContext& cc;
float prevMaxPairDistance;
ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer, oldDelta;
std::map<int, ComputeArray> chainState;
ComputeKernel kernel1, kernel2, kernel3, kernel4, kernelHardWall;
bool hasInitializedKernels;
ComputeKernel reduceEnergyKernel;
ComputeKernel computeHeatBathEnergyKernel;
ComputeKernel computeAtomsKineticEnergyKernel;
ComputeKernel computePairsKineticEnergyKernel;
ComputeKernel scaleAtomsVelocitiesKernel;
ComputeKernel scalePairsVelocitiesKernel;
ComputeArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy; ComputeArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
std::map<int, ComputeArray> atomlists, pairlists; std::map<int, ComputeArray> atomlists, pairlists;
std::map<int, ComputeKernel> propagateKernels; std::map<int, ComputeKernel> propagateKernels;
...@@ -1043,12 +1037,6 @@ private: ...@@ -1043,12 +1037,6 @@ private:
bool hasInitializedKineticEnergyKernel; bool hasInitializedKineticEnergyKernel;
bool hasInitializedHeatBathEnergyKernel; bool hasInitializedHeatBathEnergyKernel;
bool hasInitializedScaleVelocitiesKernel; bool hasInitializedScaleVelocitiesKernel;
ComputeKernel reduceEnergyKernel;
ComputeKernel computeHeatBathEnergyKernel;
ComputeKernel computeAtomsKineticEnergyKernel;
ComputeKernel computePairsKineticEnergyKernel;
ComputeKernel scaleAtomsVelocitiesKernel;
ComputeKernel scalePairsVelocitiesKernel;
}; };
/** /**
......
...@@ -130,12 +130,6 @@ public: ...@@ -130,12 +130,6 @@ public:
* @param timeShift the amount by which to shift the velocities in time * @param timeShift the amount by which to shift the velocities in time
*/ */
double computeKineticEnergy(double timeShift); double computeKineticEnergy(double timeShift);
/**
* Get the data structure that holds the state of all Nose-Hoover chains
*/
std::map<int, ComputeArray>& getNoseHooverChainState() {
return noseHooverChainState;
}
protected: protected:
virtual void applyConstraintsImpl(bool constrainVelocities, double tol) = 0; virtual void applyConstraintsImpl(bool constrainVelocities, double tol) = 0;
ComputeContext& context; ComputeContext& context;
...@@ -174,7 +168,6 @@ protected: ...@@ -174,7 +168,6 @@ protected:
ComputeArray vsiteLocalCoordsWeights; ComputeArray vsiteLocalCoordsWeights;
ComputeArray vsiteLocalCoordsPos; ComputeArray vsiteLocalCoordsPos;
ComputeArray vsiteLocalCoordsStartIndex; ComputeArray vsiteLocalCoordsStartIndex;
std::map<int, ComputeArray> noseHooverChainState;
int randomPos, lastSeed, numVsites; int randomPos, lastSeed, numVsites;
bool hasOverlappingVsites; bool hasOverlappingVsites;
mm_double2 lastStepSize; mm_double2 lastStepSize;
......
...@@ -5640,20 +5640,68 @@ double CommonIntegrateLangevinMiddleStepKernel::computeKineticEnergy(ContextImpl ...@@ -5640,20 +5640,68 @@ double CommonIntegrateLangevinMiddleStepKernel::computeKineticEnergy(ContextImpl
return cc.getIntegrationUtilities().computeKineticEnergy(0.0); return cc.getIntegrationUtilities().computeKineticEnergy(0.0);
} }
void CommonIntegrateVelocityVerletStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) { void CommonIntegrateNoseHooverStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
cc.initializeContexts(); cc.initializeContexts();
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
map<string, string> defines; map<string, string> defines;
defines["BOLTZ"] = cc.doubleToString(BOLTZ); defines["BOLTZ"] = cc.doubleToString(BOLTZ);
ComputeProgram program = cc.compileProgram(CommonKernelSources::velocityVerlet, defines); ComputeProgram program = cc.compileProgram(CommonKernelSources::noseHooverIntegrator, defines);
kernel1 = program->createKernel("integrateVelocityVerletPart1"); kernel1 = program->createKernel("integrateNoseHooverMiddlePart1");
kernel2 = program->createKernel("integrateVelocityVerletPart2"); kernel2 = program->createKernel("integrateNoseHooverMiddlePart2");
kernel3 = program->createKernel("integrateVelocityVerletPart3"); kernel3 = program->createKernel("integrateNoseHooverMiddlePart3");
kernelHardWall = program->createKernel("integrateVelocityVerletHardWall"); kernel4 = program->createKernel("integrateNoseHooverMiddlePart4");
if (useDouble) {
oldDelta.initialize<mm_double4>(cc, cc.getPaddedNumAtoms(), "oldDelta");
} else {
oldDelta.initialize<mm_float4>(cc, cc.getPaddedNumAtoms(), "oldDelta");
}
kernelHardWall = program->createKernel("integrateNoseHooverHardWall");
prevMaxPairDistance = -1.0f; prevMaxPairDistance = -1.0f;
maxPairDistanceBuffer.initialize<float>(cc, 1, "maxPairDistanceBuffer"); maxPairDistanceBuffer.initialize<float>(cc, 1, "maxPairDistanceBuffer");
int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
defines["WORK_GROUP_SIZE"] = std::to_string(workGroupSize);
defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0};"
"for(int i=0;i<1;++i) {"
"const real ys = arr[i];";
defines["END_YS_LOOP"] = "}";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[1] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751};"
"for(int i=0;i<3;++i) {"
"const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[3] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065};"
"for(int i=0;i<5;++i) {"
"const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[5] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[7] = {0.784513610477560, 0.235573213359357, -1.17767998417887, 1.31518632068391,-1.17767998417887, 0.235573213359357, 0.784513610477560};"
"for(int i=0;i<7;++i) {"
"const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[7] = program->createKernel("propagateNoseHooverChain");
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
reduceEnergyKernel = program->createKernel("reduceEnergyPair");
computeHeatBathEnergyKernel = program->createKernel("computeHeatBathEnergy");
computeAtomsKineticEnergyKernel = program->createKernel("computeAtomsKineticEnergy");
computePairsKineticEnergyKernel = program->createKernel("computePairsKineticEnergy");
scaleAtomsVelocitiesKernel = program->createKernel("scaleAtomsVelocities");
scalePairsVelocitiesKernel = program->createKernel("scalePairsVelocities");
int energyBufferSize = cc.getEnergyBuffer().getSize();
if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision())
energyBuffer.initialize<mm_double2>(cc, energyBufferSize, "energyBuffer");
else
energyBuffer.initialize<mm_float2>(cc, energyBufferSize, "energyBuffer");
} }
void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) { void CommonIntegrateNoseHooverStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
IntegrationUtilities& integration = cc.getIntegrationUtilities(); IntegrationUtilities& integration = cc.getIntegrationUtilities();
int paddedNumAtoms = cc.getPaddedNumAtoms(); int paddedNumAtoms = cc.getPaddedNumAtoms();
double dt = integrator.getStepSize(); double dt = integrator.getStepSize();
...@@ -5700,32 +5748,39 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons ...@@ -5700,32 +5748,39 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons
pairListBuffer.upload(tmp); pairListBuffer.upload(tmp);
pairTemperatureBuffer.upload(tmp2); pairTemperatureBuffer.upload(tmp2);
} }
int totalAtoms = cc.getNumAtoms();
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
kernel1->addArg(numAtoms); kernel1->addArg(numAtoms);
kernel1->addArg(numPairs); kernel1->addArg(numPairs);
kernel1->addArg(paddedNumAtoms); kernel1->addArg(paddedNumAtoms);
kernel1->addArg(cc.getIntegrationUtilities().getStepSize());
kernel1->addArg(cc.getPosq());
kernel1->addArg(cc.getVelm()); kernel1->addArg(cc.getVelm());
kernel1->addArg(cc.getLongForceBuffer()); kernel1->addArg(cc.getLongForceBuffer());
kernel1->addArg(integration.getPosDelta()); kernel1->addArg(integration.getStepSize());
kernel1->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0 kernel1->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
kernel1->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0 kernel1->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
if (cc.getUseMixedPrecision()) kernel2->addArg(totalAtoms);
kernel1->addArg(cc.getPosqCorrection());
kernel2->addArg(numParticles);
kernel2->addArg(cc.getIntegrationUtilities().getStepSize());
kernel2->addArg(cc.getPosq());
kernel2->addArg(cc.getVelm()); kernel2->addArg(cc.getVelm());
kernel2->addArg(integration.getPosDelta()); kernel2->addArg(integration.getPosDelta());
kernel2->addArg(oldDelta);
kernel2->addArg(integration.getStepSize());
kernel3->addArg(totalAtoms);
kernel3->addArg(cc.getVelm());
kernel3->addArg(integration.getPosDelta());
kernel3->addArg(oldDelta);
kernel3->addArg(integration.getStepSize());
kernel4->addArg(totalAtoms);
kernel4->addArg(cc.getPosq());
kernel4->addArg(cc.getVelm());
kernel4->addArg(integration.getPosDelta());
kernel4->addArg(oldDelta);
kernel4->addArg(integration.getStepSize());
if (cc.getUseMixedPrecision()) if (cc.getUseMixedPrecision())
kernel2->addArg(cc.getPosqCorrection()); kernel4->addArg(cc.getPosqCorrection());
if (numPairs > 0) { if (numPairs > 0) {
kernelHardWall->addArg(numPairs); kernelHardWall->addArg(numPairs);
kernelHardWall->addArg(maxPairDistanceBuffer); kernelHardWall->addArg(maxPairDistanceBuffer);
kernelHardWall->addArg(cc.getIntegrationUtilities().getStepSize()); kernelHardWall->addArg(integration.getStepSize());
kernelHardWall->addArg(cc.getPosq()); kernelHardWall->addArg(cc.getPosq());
kernelHardWall->addArg(cc.getVelm()); kernelHardWall->addArg(cc.getVelm());
kernelHardWall->addArg(pairListBuffer); kernelHardWall->addArg(pairListBuffer);
...@@ -5733,78 +5788,50 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons ...@@ -5733,78 +5788,50 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons
if (cc.getUseMixedPrecision()) if (cc.getUseMixedPrecision())
kernelHardWall->addArg(cc.getPosqCorrection()); kernelHardWall->addArg(cc.getPosqCorrection());
} }
kernel3->addArg(numAtoms);
kernel3->addArg(numPairs);
kernel3->addArg(paddedNumAtoms);
kernel3->addArg(cc.getIntegrationUtilities().getStepSize());
kernel3->addArg(cc.getPosq());
kernel3->addArg(cc.getVelm());
kernel3->addArg(cc.getLongForceBuffer());
kernel3->addArg(integration.getPosDelta());
kernel3->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
kernel3->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
if (cc.getUseMixedPrecision())
kernel3->addArg(cc.getPosqCorrection());
} }
/* /*
* Carry out the integration * Carry out the LF-middle integration (c.f. J. Phys. Chem. A 2019, 123, 6056−6079)
*/ */
// Velocity update
// Advance the velocities a half step
kernel1->execute(std::max(numAtoms, numPairs)); kernel1->execute(std::max(numAtoms, numPairs));
integration.applyConstraints(integrator.getConstraintTolerance()); integration.applyVelocityConstraints(integrator.getConstraintTolerance());
// Advance particle positions a full step // Position update
kernel2->execute(numParticles); kernel2->execute(numParticles);
// Apply the thermostat
int numChains = integrator.getNumThermostats();
for(int chain = 0; chain < numChains; ++chain) {
const auto &thermostatChain = integrator.getThermostat(chain);
auto KEs = computeMaskedKineticEnergy(context, thermostatChain, false);
auto scaleFactors = propagateChain(context, thermostatChain, KEs, dt);
scaleVelocities(context, thermostatChain, scaleFactors);
}
// Position update
kernel3->execute(numParticles);
integration.applyConstraints(integrator.getConstraintTolerance());
// Apply constraint forces
kernel4->execute(numAtoms);
// Make sure any Drude-like particles have not wandered too far from home // Make sure any Drude-like particles have not wandered too far from home
if (numPairs > 0) kernelHardWall->execute(numPairs); if (numPairs > 0) kernelHardWall->execute(numPairs);
integration.computeVirtualSites(); integration.computeVirtualSites();
context.calcForcesAndEnergy(true, false);
forcesAreValid = true;
// Update velocities another half step
kernel3->execute(std::max(numAtoms, numPairs));
integration.applyVelocityConstraints(integrator.getConstraintTolerance());
// Update the time and step count.
cc.setTime(cc.getTime()+dt); cc.setTime(cc.getTime()+dt);
cc.setStepCount(cc.getStepCount()+1); cc.setStepCount(cc.getStepCount()+1);
cc.reorderAtoms(); cc.reorderAtoms();
// Reduce UI lag.
#ifdef WIN32
cc.flushQueue();
#endif
} }
double CommonIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) { double CommonIntegrateNoseHooverStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
return cc.getIntegrationUtilities().computeKineticEnergy(0); return cc.getIntegrationUtilities().computeKineticEnergy(0);
} }
void CommonNoseHooverChainKernel::initialize() {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
map<string, string> defines;
int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
defines["WORK_GROUP_SIZE"] = std::to_string(workGroupSize);
defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0}; for(int i=0;i<1;++i) { const real ys = arr[i];";
defines["END_YS_LOOP"] = "}";
ComputeProgram program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[1] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751}; for(int i=0;i<3;++i) { const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[3] = program->createKernel("propagateNoseHooverChain");
defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065}; for(int i=0;i<5;++i) { const real ys = arr[i];";
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
propagateKernels[5] = program->createKernel("propagateNoseHooverChain");
program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
reduceEnergyKernel = program->createKernel("reduceEnergyPair");
computeHeatBathEnergyKernel = program->createKernel("computeHeatBathEnergy");
computeAtomsKineticEnergyKernel = program->createKernel("computeAtomsKineticEnergy");
computePairsKineticEnergyKernel = program->createKernel("computePairsKineticEnergy");
scaleAtomsVelocitiesKernel = program->createKernel("scaleAtomsVelocities");
scalePairsVelocitiesKernel = program->createKernel("scalePairsVelocities");
int energyBufferSize = cc.getEnergyBuffer().getSize();
if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision())
energyBuffer.initialize<mm_double2>(cc, energyBufferSize, "energyBuffer");
else
energyBuffer.initialize<mm_float2>(cc, energyBufferSize, "energyBuffer");
}
std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) { std::pair<double, double> CommonIntegrateNoseHooverStepKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision(); bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
int chainID = nhc.getChainID(); int chainID = nhc.getChainID();
int nAtoms = nhc.getThermostatedAtoms().size(); int nAtoms = nhc.getThermostatedAtoms().size();
...@@ -5813,12 +5840,10 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp ...@@ -5813,12 +5840,10 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp
int numYS = nhc.getNumYoshidaSuzukiTimeSteps(); int numYS = nhc.getNumYoshidaSuzukiTimeSteps();
int numMTS = nhc.getNumMultiTimeSteps(); int numMTS = nhc.getNumMultiTimeSteps();
if (numYS != 1 && numYS != 3 && numYS != 5) { if (numYS != 1 && numYS != 3 && numYS != 5 && numYS != 7) {
throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5."); throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, 5, or 7.");
} }
auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
if (!scaleFactorBuffer.isInitialized() || scaleFactorBuffer.getSize() == 0) { if (!scaleFactorBuffer.isInitialized() || scaleFactorBuffer.getSize() == 0) {
if (useDouble) { if (useDouble) {
std::vector<mm_double2> zeros{{0,0}}; std::vector<mm_double2> zeros{{0,0}};
...@@ -5973,15 +5998,13 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp ...@@ -5973,15 +5998,13 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp
return {0, 0}; return {0, 0};
} }
double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) { double CommonIntegrateNoseHooverStepKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision(); bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
int chainID = nhc.getChainID(); int chainID = nhc.getChainID();
int chainLength = nhc.getChainLength(); int chainLength = nhc.getChainLength();
auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
bool absChainIsValid = chainState.count(2*chainID) != 0 && bool absChainIsValid = chainState.count(2*chainID) != 0 &&
chainState[2*chainID].isInitialized() && chainState[2*chainID].isInitialized() &&
chainState[2*chainID].getSize() == chainLength; chainState[2*chainID].getSize() == chainLength;
...@@ -6058,7 +6081,7 @@ double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, ...@@ -6058,7 +6081,7 @@ double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context,
return *((float*) pinnedBuffer); return *((float*) pinnedBuffer);
} }
std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) { std::pair<double, double> CommonIntegrateNoseHooverStepKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision(); bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
...@@ -6153,7 +6176,7 @@ std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnerg ...@@ -6153,7 +6176,7 @@ std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnerg
return KEs; return KEs;
} }
void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) { void CommonIntegrateNoseHooverStepKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
// For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been // For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been
// called before this kernel. If that ever ceases to be true, some sanity checks are needed here. // called before this kernel. If that ever ceases to be true, some sanity checks are needed here.
...@@ -6184,6 +6207,54 @@ void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const No ...@@ -6184,6 +6207,54 @@ void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const No
} }
} }
void CommonIntegrateNoseHooverStepKernel::createCheckpoint(ContextImpl& context, ostream& stream) const {
int numChains = chainState.size();
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
stream.write((char*) &numChains, sizeof(int));
for (auto& state : chainState){
int chainID = state.first;
int chainLength = state.second.getSize();
stream.write((char*) &chainID, sizeof(int));
stream.write((char*) &chainLength, sizeof(int));
if (useDouble) {
vector<mm_double2> stateVec;
state.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_double2)*chainLength);
}
else {
vector<mm_float2> stateVec;
state.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_float2)*chainLength);
}
}
}
void CommonIntegrateNoseHooverStepKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
int numChains;
bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
stream.read((char*) &numChains, sizeof(int));
chainState.clear();
for (int i = 0; i < numChains; i++) {
int chainID, chainLength;
stream.read((char*) &chainID, sizeof(int));
stream.read((char*) &chainLength, sizeof(int));
if (useDouble) {
chainState[chainID] = ComputeArray();
chainState[chainID].initialize<mm_double2>(cc, chainLength, "chainState" + to_string(chainID));
vector<mm_double2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_double2)*chainLength);
chainState[chainID].upload(stateVec);
}
else {
chainState[chainID] = ComputeArray();
chainState[chainID].initialize<mm_float2>(cc, chainLength, "chainState" + to_string(chainID));
vector<mm_float2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_float2)*chainLength);
chainState[chainID].upload(stateVec);
}
}
}
void CommonIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) { void CommonIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
cc.initializeContexts(); cc.initializeContexts();
cc.setAsCurrent(); cc.setAsCurrent();
......
...@@ -753,25 +753,6 @@ int IntegrationUtilities::prepareRandomNumbers(int numValues) { ...@@ -753,25 +753,6 @@ int IntegrationUtilities::prepareRandomNumbers(int numValues) {
} }
void IntegrationUtilities::createCheckpoint(ostream& stream) { void IntegrationUtilities::createCheckpoint(ostream& stream) {
int numChains = noseHooverChainState.size();
bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
stream.write((char*) &numChains, sizeof(int));
for (auto &chainState: noseHooverChainState){
int chainID = chainState.first;
int chainLength = chainState.second.getSize();
stream.write((char*) &chainID, sizeof(int));
stream.write((char*) &chainLength, sizeof(int));
if (useDouble) {
vector<mm_double2> stateVec;
chainState.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_double2)*chainLength);
}
else {
vector<mm_float2> stateVec;
chainState.second.download(stateVec);
stream.write((char*) stateVec.data(), sizeof(mm_float2)*chainLength);
}
}
if (!random.isInitialized()) if (!random.isInitialized())
return; return;
stream.write((char*) &randomPos, sizeof(int)); stream.write((char*) &randomPos, sizeof(int));
...@@ -784,29 +765,6 @@ void IntegrationUtilities::createCheckpoint(ostream& stream) { ...@@ -784,29 +765,6 @@ void IntegrationUtilities::createCheckpoint(ostream& stream) {
} }
void IntegrationUtilities::loadCheckpoint(istream& stream) { void IntegrationUtilities::loadCheckpoint(istream& stream) {
int numChains;
bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
stream.read((char*) &numChains, sizeof(int));
noseHooverChainState.clear();
for (int i = 0; i < numChains; i++) {
int chainID, chainLength;
stream.read((char*) &chainID, sizeof(int));
stream.read((char*) &chainLength, sizeof(int));
if (useDouble) {
noseHooverChainState[chainID] = ComputeArray();
noseHooverChainState[chainID].initialize<mm_double2>(context, chainLength, "chainState" + to_string(chainID));
vector<mm_double2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_double2)*chainLength);
noseHooverChainState[chainID].upload(stateVec);
}
else {
noseHooverChainState[chainID] = ComputeArray();
noseHooverChainState[chainID].initialize<mm_float2>(context, chainLength, "chainState" + to_string(chainID));
vector<mm_float2> stateVec(chainLength);
stream.read((char*) &stateVec[0], sizeof(mm_float2)*chainLength);
noseHooverChainState[chainID].upload(stateVec);
}
}
if (!random.isInitialized()) if (!random.isInitialized())
return; return;
stream.read((char*) &randomPos, sizeof(int)); stream.read((char*) &randomPos, sizeof(int));
......
...@@ -61,8 +61,8 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI); ...@@ -61,8 +61,8 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
int2 pos = MAP_POS[MAPS[index]]; int2 pos = MAP_POS[MAPS[index]];
int size = pos.y; int size = pos.y;
real delta = 2*PI/size; real delta = 2*PI/size;
int s = (int) (angleA/delta); int s = (int) fmin(angleA/delta, size-1);
int t = (int) (angleB/delta); int t = (int) fmin(angleB/delta, size-1);
float4 c[4]; float4 c[4];
int coeffIndex = pos.x+4*(s+size*t); int coeffIndex = pos.x+4*(s+size*t);
c[0] = COEFF[coeffIndex]; c[0] = COEFF[coeffIndex];
......
...@@ -3,8 +3,8 @@ typedef double TempType; ...@@ -3,8 +3,8 @@ typedef double TempType;
typedef double3 TempType3; typedef double3 TempType3;
typedef double4 TempType4; typedef double4 TempType4;
#define make_TempType3(a...) make_double3(a) #define make_TempType3(...) make_double3(__VA_ARGS__)
#define make_TempType4(a...) make_double4(a) #define make_TempType4(...) make_double4(__VA_ARGS__)
#define convertToTempType3(a) make_double3((a).x, (a).y, (a).z) #define convertToTempType3(a) make_double3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w) #define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w)
...@@ -16,8 +16,8 @@ typedef float TempType; ...@@ -16,8 +16,8 @@ typedef float TempType;
typedef float3 TempType3; typedef float3 TempType3;
typedef float4 TempType4; typedef float4 TempType4;
#define make_TempType3(a...) make_float3(a) #define make_TempType3(...) make_float3(__VA_ARGS__)
#define make_TempType4(a...) make_float4(a) #define make_TempType4(...) make_float4(__VA_ARGS__)
#define convertToTempType3(a) make_float3((a).x, (a).y, (a).z) #define convertToTempType3(a) make_float3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w) #define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w)
#endif #endif
......
...@@ -28,7 +28,7 @@ KERNEL void integrateLangevinMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT v ...@@ -28,7 +28,7 @@ KERNEL void integrateLangevinMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT v
) { ) {
mixed vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
mixed noisescale = paramBuffer[NoiseScale]; mixed noisescale = paramBuffer[NoiseScale];
mixed halfdt = 0.5*dt[0].y; mixed halfdt = 0.5f*dt[0].y;
int index = GLOBAL_ID; int index = GLOBAL_ID;
randomIndex += index; randomIndex += index;
while (index < numAtoms) { while (index < numAtoms) {
......
// Propagates a Nose Hoover chain a full timestep
KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL const mixed2 * RESTRICT energySum, GLOBAL mixed2* RESTRICT scaleFactor, KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL const mixed2 * RESTRICT energySum, GLOBAL mixed2* RESTRICT scaleFactor,
GLOBAL mixed* RESTRICT chainMasses, GLOBAL mixed* RESTRICT chainForces, int chainType, int chainLength, int numMTS, GLOBAL mixed* RESTRICT chainMasses, GLOBAL mixed* RESTRICT chainForces, int chainType, int chainLength, int numMTS,
int numDOFs, float timeStep, mixed kT, float frequency){ int numDOFs, float timeStep, mixed kT, float frequency){
...@@ -15,30 +16,28 @@ KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL c ...@@ -15,30 +16,28 @@ KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL c
for (int mts = 0; mts < numMTS; ++mts) { for (int mts = 0; mts < numMTS; ++mts) {
BEGIN_YS_LOOP BEGIN_YS_LOOP
mixed wdt = ys * timeOverMTS; mixed wdt = ys * timeOverMTS;
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1]; chainData[chainLength-1].y += 0.5f * wdt * chainForces[chainLength-1];
for (int bead = chainLength - 2; bead >= 0; --bead) { for (int bead = chainLength - 2; bead >= 0; --bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y); mixed aa = exp(-0.25f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]); chainData[bead].y = aa * (chainData[bead].y * aa + 0.5f * wdt * chainForces[bead]);
} }
// update particle velocities // update particle velocities
mixed aa = EXP(-0.5f * wdt * chainData[0].y); scale *= (mixed) exp(-wdt * chainData[0].y);;
scale *= aa;
// update the thermostat positions // update the thermostat positions
for (int bead = 0; bead < chainLength; ++bead) { for (int bead = 0; bead < chainLength; ++bead) {
chainData[bead].x += 0.5f * chainData[bead].y * wdt; chainData[bead].x += chainData[bead].y * wdt;
} }
// update the forces // update the forces
chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0]; chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
// update thermostat velocities // update thermostat velocities
for (int bead = 0; bead < chainLength - 1; ++bead) { for (int bead = 0; bead < chainLength - 1; ++bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y); mixed aa = exp(-0.25f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]); chainData[bead].y = aa * (aa * chainData[bead].y + 0.5f * wdt * chainForces[bead]);
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1]; chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
} }
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1]; chainData[chainLength-1].y += 0.5f * wdt * chainForces[chainLength-1];
END_YS_LOOP END_YS_LOOP
} // MTS loop } // MTS loop
if (chainType == 0) { if (chainType == 0) {
scaleFactor[0].x = scale; scaleFactor[0].x = scale;
} else { } else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment