Commit 818dc73b authored by peastman's avatar peastman
Browse files

Merge pull request #84 from peastman/cache

Cache the PTX files for compiled kernels
parents 6144a88b bf34e947
...@@ -73,7 +73,7 @@ ENDIF(${CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT}) ...@@ -73,7 +73,7 @@ ENDIF(${CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT})
# The source is organized into subdirectories, but we handle them all from # The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS. # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert platforms/reference libraries/validate) SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference libraries/validate)
IF(WIN32) IF(WIN32)
SET(OPENMM_SOURCE_SUBDIRS ${OPENMM_SOURCE_SUBDIRS} libraries/pthreads) SET(OPENMM_SOURCE_SUBDIRS ${OPENMM_SOURCE_SUBDIRS} libraries/pthreads)
ADD_CUSTOM_TARGET(PthreadsLibraries ALL) ADD_CUSTOM_TARGET(PthreadsLibraries ALL)
......
/*
100% free public domain implementation of the SHA-1 algorithm
by Dominik Reichl <dominik.reichl@t-online.de>
Web: http://www.dominik-reichl.de/
Version 2.1 - 2012-06-19
- Deconstructor (resetting internal variables) is now only
implemented if SHA1_WIPE_VARIABLES is defined (which is the
default).
- Renamed inclusion guard to contain a GUID.
- Demo application is now using C++/STL objects and functions.
- Unicode build of the demo application now outputs the hashes of both
the ANSI and Unicode representations of strings.
- Various other demo application improvements.
Version 2.0 - 2012-06-14
- Added 'limits.h' include.
- Renamed inclusion guard and macros for compliancy (names beginning
with an underscore are reserved).
Version 1.9 - 2011-11-10
- Added Unicode test vectors.
- Improved support for hashing files using the HashFile method that
are larger than 4 GB.
- Improved file hashing performance (by using a larger buffer).
- Disabled unnecessary compiler warnings.
- Internal variables are now private.
Version 1.8 - 2009-03-16
- Converted project files to Visual Studio 2008 format.
- Added Unicode support for HashFile utility method.
- Added support for hashing files using the HashFile method that are
larger than 2 GB.
- HashFile now returns an error code instead of copying an error
message into the output buffer.
- GetHash now returns an error code and validates the input parameter.
- Added ReportHashStl STL utility method.
- Added REPORT_HEX_SHORT reporting mode.
- Improved Linux compatibility of test program.
Version 1.7 - 2006-12-21
- Fixed buffer underrun warning that appeared when compiling with
Borland C Builder (thanks to Rex Bloom and Tim Gallagher for the
patch).
- Breaking change: ReportHash writes the final hash to the start
of the buffer, i.e. it's not appending it to the string anymore.
- Made some function parameters const.
- Added Visual Studio 2005 project files to demo project.
Version 1.6 - 2005-02-07 (thanks to Howard Kapustein for patches)
- You can set the endianness in your files, no need to modify the
header file of the CSHA1 class anymore.
- Aligned data support.
- Made support/compilation of the utility functions (ReportHash and
HashFile) optional (useful when bytes count, for example in embedded
environments).
Version 1.5 - 2005-01-01
- 64-bit compiler compatibility added.
- Made variable wiping optional (define SHA1_WIPE_VARIABLES).
- Removed unnecessary variable initializations.
- ROL32 improvement for the Microsoft compiler (using _rotl).
Version 1.4 - 2004-07-22
- CSHA1 now compiles fine with GCC 3.3 under Mac OS X (thanks to Larry
Hastings).
Version 1.3 - 2003-08-17
- Fixed a small memory bug and made a buffer array a class member to
ensure correct working when using multiple CSHA1 class instances at
one time.
Version 1.2 - 2002-11-16
- Borlands C++ compiler seems to have problems with string addition
using sprintf. Fixed the bug which caused the digest report function
not to work properly. CSHA1 is now Borland compatible.
Version 1.1 - 2002-10-11
- Removed two unnecessary header file includes and changed BOOL to
bool. Fixed some minor bugs in the web page contents.
Version 1.0 - 2002-06-20
- First official release.
================ Test Vectors ================
SHA1("abc" in ANSI) =
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
SHA1("abc" in Unicode LE) =
9F04F41A 84851416 2050E3D6 8C1A7ABB 441DC2B5
SHA1("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
in ANSI) =
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
SHA1("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
in Unicode LE) =
51D7D876 9AC72C40 9C5B0E3F 69C60ADC 9A039014
SHA1(A million repetitions of "a" in ANSI) =
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
SHA1(A million repetitions of "a" in Unicode LE) =
C4609560 A108A0C6 26AA7F2B 38A65566 739353C5
*/
#ifndef SHA1_H_A545E61D43E9404E8D736869AB3CBFE7
#define SHA1_H_A545E61D43E9404E8D736869AB3CBFE7
#if !defined(SHA1_UTILITY_FUNCTIONS) && !defined(SHA1_NO_UTILITY_FUNCTIONS)
#define SHA1_UTILITY_FUNCTIONS
#endif
#if !defined(SHA1_STL_FUNCTIONS) && !defined(SHA1_NO_STL_FUNCTIONS)
#define SHA1_STL_FUNCTIONS
#if !defined(SHA1_UTILITY_FUNCTIONS)
#error STL functions require SHA1_UTILITY_FUNCTIONS.
#endif
#endif
#include <memory.h>
#include <limits.h>
#ifdef SHA1_UTILITY_FUNCTIONS
#include <stdio.h>
#include <string.h>
#endif
#ifdef SHA1_STL_FUNCTIONS
#include <string>
#endif
#ifdef _MSC_VER
#include <stdlib.h>
#endif
// You can define the endian mode in your files without modifying the SHA-1
// source files. Just #define SHA1_LITTLE_ENDIAN or #define SHA1_BIG_ENDIAN
// in your files, before including the SHA1.h header file. If you don't
// define anything, the class defaults to little endian.
#if !defined(SHA1_LITTLE_ENDIAN) && !defined(SHA1_BIG_ENDIAN)
#define SHA1_LITTLE_ENDIAN
#endif
// If you want variable wiping, #define SHA1_WIPE_VARIABLES, if not,
// #define SHA1_NO_WIPE_VARIABLES. If you don't define anything, it
// defaults to wiping.
#if !defined(SHA1_WIPE_VARIABLES) && !defined(SHA1_NO_WIPE_VARIABLES)
#define SHA1_WIPE_VARIABLES
#endif
#if defined(SHA1_HAS_TCHAR)
#include <tchar.h>
#else
#ifdef _MSC_VER
#include <tchar.h>
#else
#ifndef TCHAR
#define TCHAR char
#endif
#ifndef _T
#define _T(__x) (__x)
#define _tmain main
#define _tprintf printf
#define _getts gets
#define _tcslen strlen
#define _tfopen fopen
#define _tcscpy strcpy
#define _tcscat strcat
#define _sntprintf snprintf
#endif
#endif
#endif
///////////////////////////////////////////////////////////////////////////
// Define variable types
#ifndef UINT_8
#ifdef _MSC_VER // Compiling with Microsoft compiler
#define UINT_8 unsigned __int8
#else // !_MSC_VER
#define UINT_8 unsigned char
#endif // _MSC_VER
#endif
#ifndef UINT_32
#ifdef _MSC_VER // Compiling with Microsoft compiler
#define UINT_32 unsigned __int32
#else // !_MSC_VER
#if (ULONG_MAX == 0xFFFFFFFFUL)
#define UINT_32 unsigned long
#else
#define UINT_32 unsigned int
#endif
#endif // _MSC_VER
#endif // UINT_32
#ifndef INT_64
#ifdef _MSC_VER // Compiling with Microsoft compiler
#define INT_64 __int64
#else // !_MSC_VER
#define INT_64 long long
#endif // _MSC_VER
#endif // INT_64
#ifndef UINT_64
#ifdef _MSC_VER // Compiling with Microsoft compiler
#define UINT_64 unsigned __int64
#else // !_MSC_VER
#define UINT_64 unsigned long long
#endif // _MSC_VER
#endif // UINT_64
///////////////////////////////////////////////////////////////////////////
// Declare SHA-1 workspace
typedef union
{
UINT_8 c[64];
UINT_32 l[16];
} SHA1_WORKSPACE_BLOCK;
class CSHA1
{
public:
#ifdef SHA1_UTILITY_FUNCTIONS
// Different formats for ReportHash(Stl)
enum REPORT_TYPE
{
REPORT_HEX = 0,
REPORT_DIGIT = 1,
REPORT_HEX_SHORT = 2
};
#endif
// Constructor and destructor
CSHA1();
#ifdef SHA1_WIPE_VARIABLES
~CSHA1();
#endif
void Reset();
// Hash in binary data and strings
void Update(const UINT_8* pbData, UINT_32 uLen);
#ifdef SHA1_UTILITY_FUNCTIONS
// Hash in file contents
bool HashFile(const TCHAR* tszFileName);
#endif
// Finalize hash; call it before using ReportHash(Stl)
void Final();
#ifdef SHA1_UTILITY_FUNCTIONS
bool ReportHash(TCHAR* tszReport, REPORT_TYPE rtReportType = REPORT_HEX) const;
#endif
#ifdef SHA1_STL_FUNCTIONS
bool ReportHashStl(std::basic_string<TCHAR>& strOut, REPORT_TYPE rtReportType =
REPORT_HEX) const;
#endif
// Get the raw message digest (20 bytes)
bool GetHash(UINT_8* pbDest20) const;
private:
// Private SHA-1 transformation
void Transform(UINT_32* pState, const UINT_8* pBuffer);
// Member variables
UINT_32 m_state[5];
UINT_32 m_count[2];
UINT_32 m_reserved0[1]; // Memory alignment padding
UINT_8 m_buffer[64];
UINT_8 m_digest[20];
UINT_32 m_reserved1[3]; // Memory alignment padding
UINT_8 m_workspace[64];
SHA1_WORKSPACE_BLOCK* m_block; // SHA1 pointer to the byte array above
};
#endif // SHA1_H_A545E61D43E9404E8D736869AB3CBFE7
/*
100% free public domain implementation of the SHA-1 algorithm
by Dominik Reichl <dominik.reichl@t-online.de>
Web: http://www.dominik-reichl.de/
See header file for version history and test vectors.
*/
// If compiling with MFC, you might want to add #include "StdAfx.h"
#define _CRT_SECURE_NO_WARNINGS
#include "SHA1.h"
#define SHA1_MAX_FILE_BUFFER (32 * 20 * 820)
// Rotate p_val32 by p_nBits bits to the left
#ifndef ROL32
#ifdef _MSC_VER
#define ROL32(p_val32,p_nBits) _rotl(p_val32,p_nBits)
#else
#define ROL32(p_val32,p_nBits) (((p_val32)<<(p_nBits))|((p_val32)>>(32-(p_nBits))))
#endif
#endif
#ifdef SHA1_LITTLE_ENDIAN
#define SHABLK0(i) (m_block->l[i] = \
(ROL32(m_block->l[i],24) & 0xFF00FF00) | (ROL32(m_block->l[i],8) & 0x00FF00FF))
#else
#define SHABLK0(i) (m_block->l[i])
#endif
#define SHABLK(i) (m_block->l[i&15] = ROL32(m_block->l[(i+13)&15] ^ \
m_block->l[(i+8)&15] ^ m_block->l[(i+2)&15] ^ m_block->l[i&15],1))
// SHA-1 rounds
#define S_R0(v,w,x,y,z,i) {z+=((w&(x^y))^y)+SHABLK0(i)+0x5A827999+ROL32(v,5);w=ROL32(w,30);}
#define S_R1(v,w,x,y,z,i) {z+=((w&(x^y))^y)+SHABLK(i)+0x5A827999+ROL32(v,5);w=ROL32(w,30);}
#define S_R2(v,w,x,y,z,i) {z+=(w^x^y)+SHABLK(i)+0x6ED9EBA1+ROL32(v,5);w=ROL32(w,30);}
#define S_R3(v,w,x,y,z,i) {z+=(((w|x)&y)|(w&x))+SHABLK(i)+0x8F1BBCDC+ROL32(v,5);w=ROL32(w,30);}
#define S_R4(v,w,x,y,z,i) {z+=(w^x^y)+SHABLK(i)+0xCA62C1D6+ROL32(v,5);w=ROL32(w,30);}
#pragma warning(push)
// Disable compiler warning 'Conditional expression is constant'
#pragma warning(disable: 4127)
CSHA1::CSHA1()
{
m_block = (SHA1_WORKSPACE_BLOCK*)m_workspace;
Reset();
}
#ifdef SHA1_WIPE_VARIABLES
CSHA1::~CSHA1()
{
Reset();
}
#endif
void CSHA1::Reset()
{
// SHA1 initialization constants
m_state[0] = 0x67452301;
m_state[1] = 0xEFCDAB89;
m_state[2] = 0x98BADCFE;
m_state[3] = 0x10325476;
m_state[4] = 0xC3D2E1F0;
m_count[0] = 0;
m_count[1] = 0;
}
void CSHA1::Transform(UINT_32* pState, const UINT_8* pBuffer)
{
UINT_32 a = pState[0], b = pState[1], c = pState[2], d = pState[3], e = pState[4];
memcpy(m_block, pBuffer, 64);
// 4 rounds of 20 operations each, loop unrolled
S_R0(a,b,c,d,e, 0); S_R0(e,a,b,c,d, 1); S_R0(d,e,a,b,c, 2); S_R0(c,d,e,a,b, 3);
S_R0(b,c,d,e,a, 4); S_R0(a,b,c,d,e, 5); S_R0(e,a,b,c,d, 6); S_R0(d,e,a,b,c, 7);
S_R0(c,d,e,a,b, 8); S_R0(b,c,d,e,a, 9); S_R0(a,b,c,d,e,10); S_R0(e,a,b,c,d,11);
S_R0(d,e,a,b,c,12); S_R0(c,d,e,a,b,13); S_R0(b,c,d,e,a,14); S_R0(a,b,c,d,e,15);
S_R1(e,a,b,c,d,16); S_R1(d,e,a,b,c,17); S_R1(c,d,e,a,b,18); S_R1(b,c,d,e,a,19);
S_R2(a,b,c,d,e,20); S_R2(e,a,b,c,d,21); S_R2(d,e,a,b,c,22); S_R2(c,d,e,a,b,23);
S_R2(b,c,d,e,a,24); S_R2(a,b,c,d,e,25); S_R2(e,a,b,c,d,26); S_R2(d,e,a,b,c,27);
S_R2(c,d,e,a,b,28); S_R2(b,c,d,e,a,29); S_R2(a,b,c,d,e,30); S_R2(e,a,b,c,d,31);
S_R2(d,e,a,b,c,32); S_R2(c,d,e,a,b,33); S_R2(b,c,d,e,a,34); S_R2(a,b,c,d,e,35);
S_R2(e,a,b,c,d,36); S_R2(d,e,a,b,c,37); S_R2(c,d,e,a,b,38); S_R2(b,c,d,e,a,39);
S_R3(a,b,c,d,e,40); S_R3(e,a,b,c,d,41); S_R3(d,e,a,b,c,42); S_R3(c,d,e,a,b,43);
S_R3(b,c,d,e,a,44); S_R3(a,b,c,d,e,45); S_R3(e,a,b,c,d,46); S_R3(d,e,a,b,c,47);
S_R3(c,d,e,a,b,48); S_R3(b,c,d,e,a,49); S_R3(a,b,c,d,e,50); S_R3(e,a,b,c,d,51);
S_R3(d,e,a,b,c,52); S_R3(c,d,e,a,b,53); S_R3(b,c,d,e,a,54); S_R3(a,b,c,d,e,55);
S_R3(e,a,b,c,d,56); S_R3(d,e,a,b,c,57); S_R3(c,d,e,a,b,58); S_R3(b,c,d,e,a,59);
S_R4(a,b,c,d,e,60); S_R4(e,a,b,c,d,61); S_R4(d,e,a,b,c,62); S_R4(c,d,e,a,b,63);
S_R4(b,c,d,e,a,64); S_R4(a,b,c,d,e,65); S_R4(e,a,b,c,d,66); S_R4(d,e,a,b,c,67);
S_R4(c,d,e,a,b,68); S_R4(b,c,d,e,a,69); S_R4(a,b,c,d,e,70); S_R4(e,a,b,c,d,71);
S_R4(d,e,a,b,c,72); S_R4(c,d,e,a,b,73); S_R4(b,c,d,e,a,74); S_R4(a,b,c,d,e,75);
S_R4(e,a,b,c,d,76); S_R4(d,e,a,b,c,77); S_R4(c,d,e,a,b,78); S_R4(b,c,d,e,a,79);
// Add the working vars back into state
pState[0] += a;
pState[1] += b;
pState[2] += c;
pState[3] += d;
pState[4] += e;
// Wipe variables
#ifdef SHA1_WIPE_VARIABLES
a = b = c = d = e = 0;
#endif
}
void CSHA1::Update(const UINT_8* pbData, UINT_32 uLen)
{
UINT_32 j = ((m_count[0] >> 3) & 0x3F);
if((m_count[0] += (uLen << 3)) < (uLen << 3))
++m_count[1]; // Overflow
m_count[1] += (uLen >> 29);
UINT_32 i;
if((j + uLen) > 63)
{
i = 64 - j;
memcpy(&m_buffer[j], pbData, i);
Transform(m_state, m_buffer);
for( ; (i + 63) < uLen; i += 64)
Transform(m_state, &pbData[i]);
j = 0;
}
else i = 0;
if((uLen - i) != 0)
memcpy(&m_buffer[j], &pbData[i], uLen - i);
}
#ifdef SHA1_UTILITY_FUNCTIONS
bool CSHA1::HashFile(const TCHAR* tszFileName)
{
if(tszFileName == NULL) return false;
FILE* fpIn = _tfopen(tszFileName, _T("rb"));
if(fpIn == NULL) return false;
UINT_8* pbData = new UINT_8[SHA1_MAX_FILE_BUFFER];
if(pbData == NULL) { fclose(fpIn); return false; }
bool bSuccess = true;
while(true)
{
const size_t uRead = fread(pbData, 1, SHA1_MAX_FILE_BUFFER, fpIn);
if(uRead > 0)
Update(pbData, static_cast<UINT_32>(uRead));
if(uRead < SHA1_MAX_FILE_BUFFER)
{
if(feof(fpIn) == 0) bSuccess = false;
break;
}
}
fclose(fpIn);
delete[] pbData;
return bSuccess;
}
#endif
void CSHA1::Final()
{
UINT_32 i;
UINT_8 pbFinalCount[8];
for(i = 0; i < 8; ++i)
pbFinalCount[i] = static_cast<UINT_8>((m_count[((i >= 4) ? 0 : 1)] >>
((3 - (i & 3)) * 8) ) & 0xFF); // Endian independent
Update((UINT_8*)"\200", 1);
while((m_count[0] & 504) != 448)
Update((UINT_8*)"\0", 1);
Update(pbFinalCount, 8); // Cause a Transform()
for(i = 0; i < 20; ++i)
m_digest[i] = static_cast<UINT_8>((m_state[i >> 2] >> ((3 -
(i & 3)) * 8)) & 0xFF);
// Wipe variables for security reasons
#ifdef SHA1_WIPE_VARIABLES
memset(m_buffer, 0, 64);
memset(m_state, 0, 20);
memset(m_count, 0, 8);
memset(pbFinalCount, 0, 8);
Transform(m_state, m_buffer);
#endif
}
#ifdef SHA1_UTILITY_FUNCTIONS
bool CSHA1::ReportHash(TCHAR* tszReport, REPORT_TYPE rtReportType) const
{
if(tszReport == NULL) return false;
TCHAR tszTemp[16];
if((rtReportType == REPORT_HEX) || (rtReportType == REPORT_HEX_SHORT))
{
_sntprintf(tszTemp, 15, _T("%02X"), m_digest[0]);
_tcscpy(tszReport, tszTemp);
const TCHAR* lpFmt = ((rtReportType == REPORT_HEX) ? _T(" %02X") : _T("%02X"));
for(size_t i = 1; i < 20; ++i)
{
_sntprintf(tszTemp, 15, lpFmt, m_digest[i]);
_tcscat(tszReport, tszTemp);
}
}
else if(rtReportType == REPORT_DIGIT)
{
_sntprintf(tszTemp, 15, _T("%u"), m_digest[0]);
_tcscpy(tszReport, tszTemp);
for(size_t i = 1; i < 20; ++i)
{
_sntprintf(tszTemp, 15, _T(" %u"), m_digest[i]);
_tcscat(tszReport, tszTemp);
}
}
else return false;
return true;
}
#endif
#ifdef SHA1_STL_FUNCTIONS
bool CSHA1::ReportHashStl(std::basic_string<TCHAR>& strOut, REPORT_TYPE rtReportType) const
{
TCHAR tszOut[84];
const bool bResult = ReportHash(tszOut, rtReportType);
if(bResult) strOut = tszOut;
return bResult;
}
#endif
bool CSHA1::GetHash(UINT_8* pbDest20) const
{
if(pbDest20 == NULL) return false;
memcpy(pbDest20, m_digest, 20);
return true;
}
#pragma warning(pop)
\ No newline at end of file
...@@ -515,7 +515,7 @@ private: ...@@ -515,7 +515,7 @@ private:
int numAtomBlocks; int numAtomBlocks;
int numThreadBlocks; int numThreadBlocks;
bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered; bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered;
std::string compiler, tempDir, gpuArchitecture; std::string compiler, tempDir, cacheDir, gpuArchitecture;
float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat; float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
double4 periodicBoxSize, invPeriodicBoxSize; double4 periodicBoxSize, invPeriodicBoxSize;
std::string defaultOptimizationOptions; std::string defaultOptimizationOptions;
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include "CudaIntegrationUtilities.h" #include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h" #include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h" #include "CudaNonbondedUtilities.h"
#include "SHA1.h"
#include "hilbert.h" #include "hilbert.h"
#include "openmm/OpenMMException.h" #include "openmm/OpenMMException.h"
#include "openmm/Platform.h" #include "openmm/Platform.h"
...@@ -44,6 +45,7 @@ ...@@ -44,6 +45,7 @@
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <fstream> #include <fstream>
#include <iomanip>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <typeinfo> #include <typeinfo>
...@@ -90,10 +92,14 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -90,10 +92,14 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
} }
else else
throw OpenMMException("Illegal value for CudaPrecision: "+precision); throw OpenMMException("Illegal value for CudaPrecision: "+precision);
char* cacheVariable = getenv("OPENMM_CACHE_DIR");
cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
#ifdef WIN32 #ifdef WIN32
this->tempDir = tempDir+"\\"; this->tempDir = tempDir+"\\";
cacheDir = cacheDir+"\\";
#else #else
this->tempDir = tempDir+"/"; this->tempDir = tempDir+"/";
cacheDir = cacheDir+"/";
#endif #endif
contextIndex = platformData.contexts.size(); contextIndex = platformData.contexts.size();
int numDevices; int numDevices;
...@@ -350,6 +356,7 @@ static bool compileInWindows(const string &command) { ...@@ -350,6 +356,7 @@ static bool compileInWindows(const string &command) {
#endif #endif
CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) { CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
string bits = intToString(8*sizeof(void*));
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags)); string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src; stringstream src;
if (!options.empty()) if (!options.empty())
...@@ -397,6 +404,23 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -397,6 +404,23 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src << endl; src << endl;
src << source << endl; src << source << endl;
// See whether we already have PTX for this kernel cached.
CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final();
UINT_8 hash[20];
sha1.GetHash(hash);
stringstream cacheFile;
cacheFile << cacheDir;
cacheFile.flags(ios::hex);
for (int i = 0; i < 20; i++)
cacheFile << setw(2) << setfill('0') << (int) hash[i];
cacheFile << '_' << gpuArchitecture << '_' << bits;
CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module;
// Write out the source to a temporary file. // Write out the source to a temporary file.
stringstream tempFileName; stringstream tempFileName;
...@@ -412,7 +436,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -412,7 +436,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
ofstream out(inputFile.c_str()); ofstream out(inputFile.c_str());
out << src.str(); out << src.str();
out.close(); out.close();
string bits = intToString(8*sizeof(void*));
#ifdef WIN32 #ifdef WIN32
#ifdef _DEBUG #ifdef _DEBUG
string command = "\""+compiler+"\" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile; string command = "\""+compiler+"\" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
...@@ -441,7 +464,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -441,7 +464,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
} }
throw OpenMMException(error.str()); throw OpenMMException(error.str());
} }
CUmodule module;
CUresult result = cuModuleLoad(&module, outputFile.c_str()); CUresult result = cuModuleLoad(&module, outputFile.c_str());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream m; std::stringstream m;
...@@ -449,6 +471,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -449,6 +471,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
throw OpenMMException(m.str()); throw OpenMMException(m.str());
} }
remove(inputFile.c_str()); remove(inputFile.c_str());
if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0)
remove(outputFile.c_str()); remove(outputFile.c_str());
remove(logFile.c_str()); remove(logFile.c_str());
return module; return module;
......
...@@ -388,7 +388,6 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) { ...@@ -388,7 +388,6 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
if (!hasInitializedKernel) { if (!hasInitializedKernel) {
hasInitializedKernel = true; hasInitializedKernel = true;
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::constraints, defines); CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas"); applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
} }
...@@ -396,7 +395,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) { ...@@ -396,7 +395,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
cu.clearBuffer(integration.getPosDelta()); cu.clearBuffer(integration.getPosDelta());
integration.applyConstraints(tol); integration.applyConstraints(tol);
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()}; int numAtoms = cu.getNumAtoms();
void* args[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms()); cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
integration.computeVirtualSites(); integration.computeVirtualSites();
} }
...@@ -4156,8 +4156,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle ...@@ -4156,8 +4156,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
cu.getPlatformData().initializeContexts(system); cu.getPlatformData().initializeContexts(system);
cu.setAsCurrent(); cu.setAsCurrent();
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
kernel1 = cu.getKernel(module, "integrateVerletPart1"); kernel1 = cu.getKernel(module, "integrateVerletPart1");
kernel2 = cu.getKernel(module, "integrateVerletPart2"); kernel2 = cu.getKernel(module, "integrateVerletPart2");
...@@ -4168,6 +4166,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4168,6 +4166,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
double dt = integrator.getStepSize(); double dt = integrator.getStepSize();
if (dt != prevStepSize) { if (dt != prevStepSize) {
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
...@@ -4186,7 +4185,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4186,7 +4185,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// Call the first integration kernel. // Call the first integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4196,7 +4195,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4196,7 +4195,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4223,8 +4222,6 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan ...@@ -4223,8 +4222,6 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
cu.setAsCurrent(); cu.setAsCurrent();
cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
kernel1 = cu.getKernel(module, "integrateLangevinPart1"); kernel1 = cu.getKernel(module, "integrateLangevinPart1");
kernel2 = cu.getKernel(module, "integrateLangevinPart2"); kernel2 = cu.getKernel(module, "integrateLangevinPart2");
...@@ -4236,6 +4233,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4236,6 +4233,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
double temperature = integrator.getTemperature(); double temperature = integrator.getTemperature();
double friction = integrator.getFriction(); double friction = integrator.getFriction();
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
...@@ -4273,7 +4271,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4273,7 +4271,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
// Call the first integration kernel. // Call the first integration kernel.
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4284,7 +4282,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4284,7 +4282,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
// Call the second integration kernel. // Call the second integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(), void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4308,8 +4306,6 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro ...@@ -4308,8 +4306,6 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro
cu.setAsCurrent(); cu.setAsCurrent();
cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, "");
kernel1 = cu.getKernel(module, "integrateBrownianPart1"); kernel1 = cu.getKernel(module, "integrateBrownianPart1");
kernel2 = cu.getKernel(module, "integrateBrownianPart2"); kernel2 = cu.getKernel(module, "integrateBrownianPart2");
...@@ -4320,6 +4316,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4320,6 +4316,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
double temperature = integrator.getTemperature(); double temperature = integrator.getTemperature();
double friction = integrator.getFriction(); double friction = integrator.getFriction();
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
...@@ -4334,7 +4331,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4334,7 +4331,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
// Call the first integration kernel. // Call the first integration kernel.
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {useDouble ? (void*) &tauDt : (void*) &tauDtFloat, void* args1[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &tauDt : (void*) &tauDtFloat,
useDouble ? (void*) &noise : (void*) &noiseFloat, useDouble ? (void*) &noise : (void*) &noiseFloat,
&cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
...@@ -4347,7 +4344,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4347,7 +4344,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
// Call the second integration kernel. // Call the second integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {useDouble ? (void*) &stepSize : (void*) &stepSizeFloat, void* args2[] = {&numAtoms, useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4370,8 +4367,6 @@ void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, con ...@@ -4370,8 +4367,6 @@ void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, con
cu.getPlatformData().initializeContexts(system); cu.getPlatformData().initializeContexts(system);
cu.setAsCurrent(); cu.setAsCurrent();
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
kernel1 = cu.getKernel(module, "integrateVerletPart1"); kernel1 = cu.getKernel(module, "integrateVerletPart1");
kernel2 = cu.getKernel(module, "integrateVerletPart2"); kernel2 = cu.getKernel(module, "integrateVerletPart2");
...@@ -4383,6 +4378,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4383,6 +4378,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
// Select the step size to use. // Select the step size to use.
...@@ -4391,7 +4387,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4391,7 +4387,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
double tol = integrator.getErrorTolerance(); double tol = integrator.getErrorTolerance();
float tolFloat = (float) tol; float tolFloat = (float) tol;
bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision(); bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat, void* argsSelect[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
useDouble ? (void*) &tol : (void*) &tolFloat, useDouble ? (void*) &tol : (void*) &tolFloat,
&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()};
...@@ -4401,7 +4397,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4401,7 +4397,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Call the first integration kernel. // Call the first integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4411,7 +4407,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4411,7 +4407,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4456,8 +4452,6 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c ...@@ -4456,8 +4452,6 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
cu.setAsCurrent(); cu.setAsCurrent();
cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
kernel1 = cu.getKernel(module, "integrateLangevinPart1"); kernel1 = cu.getKernel(module, "integrateLangevinPart1");
kernel2 = cu.getKernel(module, "integrateLangevinPart2"); kernel2 = cu.getKernel(module, "integrateLangevinPart2");
...@@ -4471,6 +4465,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4471,6 +4465,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
// Select the step size to use. // Select the step size to use.
...@@ -4483,7 +4478,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4483,7 +4478,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
double kT = BOLTZ*integrator.getTemperature(); double kT = BOLTZ*integrator.getTemperature();
float kTFloat = (float) kT; float kTFloat = (float) kT;
bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision(); bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat, void* argsSelect[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
useDouble ? (void*) &tol : (void*) &tolFloat, useDouble ? (void*) &tol : (void*) &tolFloat,
useDouble ? (void*) &tau : (void*) &tauFloat, useDouble ? (void*) &tau : (void*) &tauFloat,
useDouble ? (void*) &kT : (void*) &kTFloat, useDouble ? (void*) &kT : (void*) &kTFloat,
...@@ -4495,7 +4490,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4495,7 +4490,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Call the first integration kernel. // Call the first integration kernel.
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4506,7 +4501,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4506,7 +4501,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Call the second integration kernel. // Call the second integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(), void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -5369,7 +5364,6 @@ void CudaApplyAndersenThermostatKernel::initialize(const System& system, const A ...@@ -5369,7 +5364,6 @@ void CudaApplyAndersenThermostatKernel::initialize(const System& system, const A
cu.setAsCurrent(); cu.setAsCurrent();
randomSeed = thermostat.getRandomNumberSeed(); randomSeed = thermostat.getRandomNumberSeed();
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::andersenThermostat, defines); CUmodule module = cu.createModule(CudaKernelSources::andersenThermostat, defines);
kernel = cu.getKernel(module, "applyAndersenThermostat"); kernel = cu.getKernel(module, "applyAndersenThermostat");
cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed); cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed);
...@@ -5391,7 +5385,8 @@ void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) { ...@@ -5391,7 +5385,8 @@ void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) {
float frequency = (float) context.getParameter(AndersenThermostat::CollisionFrequency()); float frequency = (float) context.getParameter(AndersenThermostat::CollisionFrequency());
float kT = (float) (BOLTZ*context.getParameter(AndersenThermostat::Temperature())); float kT = (float) (BOLTZ*context.getParameter(AndersenThermostat::Temperature()));
int randomIndex = cu.getIntegrationUtilities().prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = cu.getIntegrationUtilities().prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args[] = {&frequency, &kT, &cu.getVelm().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), int numAtoms = cu.getNumAtoms();
void* args[] = {&numAtoms, &frequency, &kT, &cu.getVelm().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
&cu.getIntegrationUtilities().getRandom().getDevicePointer(), &randomIndex, &atomGroups->getDevicePointer()}; &cu.getIntegrationUtilities().getRandom().getDevicePointer(), &randomIndex, &atomGroups->getDevicePointer()};
cu.executeKernel(kernel, args, cu.getNumAtoms()); cu.executeKernel(kernel, args, cu.getNumAtoms());
} }
......
...@@ -2,11 +2,11 @@ ...@@ -2,11 +2,11 @@
* Apply the Andersen thermostat to adjust particle velocities. * Apply the Andersen thermostat to adjust particle velocities.
*/ */
extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random, extern "C" __global__ void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
unsigned int randomIndex, const int* __restrict__ atomGroups) { unsigned int randomIndex, const int* __restrict__ atomGroups) {
float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y)); float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
float randomRange = erff(collisionProbability/sqrtf(2.0f)); float randomRange = erff(collisionProbability/sqrtf(2.0f));
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
float4 selectRand = random[randomIndex+atomGroups[index]]; float4 selectRand = random[randomIndex+atomGroups[index]];
float4 velRand = random[randomIndex+index]; float4 velRand = random[randomIndex+index];
......
...@@ -2,16 +2,16 @@ ...@@ -2,16 +2,16 @@
* Perform the first step of Brownian integration. * Perform the first step of Brownian integration.
*/ */
extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force, extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) { mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
randomIndex += blockIdx.x*blockDim.x+threadIdx.x; randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
const mixed fscale = tauDeltaT/(mixed) 0x100000000; const mixed fscale = tauDeltaT/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
if (invMass != 0) { if (invMass != 0) {
posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x; posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y; posDelta[index].y = fscale*invMass*force[index+paddedNumAtoms] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
posDelta[index].z = fscale*invMass*force[index+PADDED_NUM_ATOMS*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z; posDelta[index].z = fscale*invMass*force[index+paddedNumAtoms*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
} }
randomIndex += blockDim.x*gridDim.x; randomIndex += blockDim.x*gridDim.x;
} }
...@@ -21,9 +21,9 @@ extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAm ...@@ -21,9 +21,9 @@ extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAm
* Perform the second step of Brownian integration. * Perform the second step of Brownian integration.
*/ */
extern "C" __global__ void integrateBrownianPart2(mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) { extern "C" __global__ void integrateBrownianPart2(int numAtoms, mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
const mixed oneOverDeltaT = RECIP(deltaT); const mixed oneOverDeltaT = RECIP(deltaT);
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
if (velm[index].w != 0) { if (velm[index].w != 0) {
mixed4 delta = posDelta[index]; mixed4 delta = posDelta[index];
velm[index].x = oneOverDeltaT*delta.x; velm[index].x = oneOverDeltaT*delta.x;
......
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) { extern "C" __global__ void applyPositionDeltas(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index]; real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index]; real4 pos2 = posqCorrection[index];
......
...@@ -4,7 +4,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams}; ...@@ -4,7 +4,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration. * Perform the first step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta, extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) { const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000; mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
...@@ -12,13 +12,13 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con ...@@ -12,13 +12,13 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
mixed stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
randomIndex += index; randomIndex += index;
while (index < NUM_ATOMS) { while (index < numAtoms) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0) { if (velocity.w != 0) {
mixed sqrtInvMass = SQRT(velocity.w); mixed sqrtInvMass = SQRT(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x; velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y; velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+paddedNumAtoms] + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z; velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+paddedNumAtoms*2] + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity; velm[index] = velocity;
posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0); posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
} }
...@@ -31,7 +31,7 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con ...@@ -31,7 +31,7 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
* Perform the second step of Langevin integration. * Perform the second step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) { extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
...@@ -39,7 +39,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real ...@@ -39,7 +39,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y; float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif #endif
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
while (index < NUM_ATOMS) { while (index < numAtoms) {
mixed4 vel = velm[index]; mixed4 vel = velm[index];
if (vel.w != 0) { if (vel.w != 0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real ...@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt, extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) { const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
// Calculate the error. // Calculate the error.
...@@ -87,8 +87,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error ...@@ -87,8 +87,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
mixed err = 0; mixed err = 0;
unsigned int index = threadIdx.x; unsigned int index = threadIdx.x;
const mixed scale = RECIP((mixed) 0x100000000); const mixed scale = RECIP((mixed) 0x100000000);
while (index < NUM_ATOMS) { while (index < numAtoms) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
...@@ -106,7 +106,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error ...@@ -106,7 +106,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
if (blockIdx.x*blockDim.x+threadIdx.x == 0) { if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
// Select the new step size. // Select the new step size.
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3)); mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = SQRT(errorTol/totalError); mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
* Perform the first step of Verlet integration. * Perform the first step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, const real4* __restrict__ posq, extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) { const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
const mixed2 stepSize = dt[0]; const mixed2 stepSize = dt[0];
const mixed dtPos = stepSize.y; const mixed dtPos = stepSize.y;
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y); const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = dtVel/(mixed) 0x100000000; const mixed scale = dtVel/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -19,8 +19,8 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c ...@@ -19,8 +19,8 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
real4 pos = posq[index]; real4 pos = posq[index];
#endif #endif
velocity.x += scale*force[index]*velocity.w; velocity.x += scale*force[index]*velocity.w;
velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w; velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w; velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
pos.x = velocity.x*dtPos; pos.x = velocity.x*dtPos;
pos.y = velocity.y*dtPos; pos.y = velocity.y*dtPos;
pos.z = velocity.z*dtPos; pos.z = velocity.z*dtPos;
...@@ -34,7 +34,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c ...@@ -34,7 +34,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
* Perform the second step of Verlet integration. * Perform the second step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq, extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) { real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
mixed2 stepSize = dt[0]; mixed2 stepSize = dt[0];
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
...@@ -46,7 +46,7 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* ...@@ -46,7 +46,7 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index == 0) if (index == 0)
dt[0].x = stepSize.y; dt[0].x = stepSize.y;
for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -80,14 +80,14 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* ...@@ -80,14 +80,14 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) { extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
// Calculate the error. // Calculate the error.
extern __shared__ mixed error[]; extern __shared__ mixed error[];
mixed err = 0.0f; mixed err = 0.0f;
const mixed scale = RECIP((mixed) 0x100000000); const mixed scale = RECIP((mixed) 0x100000000);
for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
} }
...@@ -102,7 +102,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo ...@@ -102,7 +102,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
__syncthreads(); __syncthreads();
} }
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3)); mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = SQRT(errorTol/totalError); mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment