Merge pull request #84 from peastman/cache

Cache the PTX files for compiled kernels

Merge pull request #84 from peastman/cache
Cache the PTX files for compiled kernels
818dc73b · peastman · 6144a88b · bf34e947 · 818dc73b · 818dc73b
Commit 818dc73b authored Jul 31, 2013 by peastman
11 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,7 @@ ENDIF(${CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT})
 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert platforms/reference libraries/validate)
+SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference libraries/validate)
 IF(WIN32)
    SET(OPENMM_SOURCE_SUBDIRS ${OPENMM_SOURCE_SUBDIRS} libraries/pthreads)
    ADD_CUSTOM_TARGET(PthreadsLibraries ALL)

--- a/libraries/csha1/include/SHA1.h
+++ b/libraries/csha1/include/SHA1.h
+/*
+  100% free public domain implementation of the SHA-1 algorithm
+  by Dominik Reichl <dominik.reichl@t-online.de>
+  Web: http://www.dominik-reichl.de/
+  Version 2.1 - 2012-06-19
+  - Deconstructor (resetting internal variables) is now only
+    implemented if SHA1_WIPE_VARIABLES is defined (which is the
+    default).
+  - Renamed inclusion guard to contain a GUID.
+  - Demo application is now using C++/STL objects and functions.
+  - Unicode build of the demo application now outputs the hashes of both
+    the ANSI and Unicode representations of strings.
+  - Various other demo application improvements.
+  Version 2.0 - 2012-06-14
+  - Added 'limits.h' include.
+  - Renamed inclusion guard and macros for compliancy (names beginning
+    with an underscore are reserved).
+  Version 1.9 - 2011-11-10
+  - Added Unicode test vectors.
+  - Improved support for hashing files using the HashFile method that
+    are larger than 4 GB.
+  - Improved file hashing performance (by using a larger buffer).
+  - Disabled unnecessary compiler warnings.
+  - Internal variables are now private.
+  Version 1.8 - 2009-03-16
+  - Converted project files to Visual Studio 2008 format.
+  - Added Unicode support for HashFile utility method.
+  - Added support for hashing files using the HashFile method that are
+    larger than 2 GB.
+  - HashFile now returns an error code instead of copying an error
+    message into the output buffer.
+  - GetHash now returns an error code and validates the input parameter.
+  - Added ReportHashStl STL utility method.
+  - Added REPORT_HEX_SHORT reporting mode.
+  - Improved Linux compatibility of test program.
+  Version 1.7 - 2006-12-21
+  - Fixed buffer underrun warning that appeared when compiling with
+    Borland C Builder (thanks to Rex Bloom and Tim Gallagher for the
+    patch).
+  - Breaking change: ReportHash writes the final hash to the start
+    of the buffer, i.e. it's not appending it to the string anymore.
+  - Made some function parameters const.
+  - Added Visual Studio 2005 project files to demo project.
+  Version 1.6 - 2005-02-07 (thanks to Howard Kapustein for patches)
+  - You can set the endianness in your files, no need to modify the
+    header file of the CSHA1 class anymore.
+  - Aligned data support.
+  - Made support/compilation of the utility functions (ReportHash and
+    HashFile) optional (useful when bytes count, for example in embedded
+    environments).
+  Version 1.5 - 2005-01-01
+  - 64-bit compiler compatibility added.
+  - Made variable wiping optional (define SHA1_WIPE_VARIABLES).
+  - Removed unnecessary variable initializations.
+  - ROL32 improvement for the Microsoft compiler (using _rotl).
+  Version 1.4 - 2004-07-22
+  - CSHA1 now compiles fine with GCC 3.3 under Mac OS X (thanks to Larry
+    Hastings).
+  Version 1.3 - 2003-08-17
+  - Fixed a small memory bug and made a buffer array a class member to
+    ensure correct working when using multiple CSHA1 class instances at
+    one time.
+  Version 1.2 - 2002-11-16
+  - Borlands C++ compiler seems to have problems with string addition
+    using sprintf. Fixed the bug which caused the digest report function
+    not to work properly. CSHA1 is now Borland compatible.
+  Version 1.1 - 2002-10-11
+  - Removed two unnecessary header file includes and changed BOOL to
+    bool. Fixed some minor bugs in the web page contents.
+  Version 1.0 - 2002-06-20
+  - First official release.
+  ================ Test Vectors ================
+  SHA1("abc" in ANSI) =
+    A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
+  SHA1("abc" in Unicode LE) =
+    9F04F41A 84851416 2050E3D6 8C1A7ABB 441DC2B5
+  SHA1("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+    in ANSI) =
+    84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
+  SHA1("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+    in Unicode LE) =
+    51D7D876 9AC72C40 9C5B0E3F 69C60ADC 9A039014
+  SHA1(A million repetitions of "a" in ANSI) =
+    34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
+  SHA1(A million repetitions of "a" in Unicode LE) =
+    C4609560 A108A0C6 26AA7F2B 38A65566 739353C5
+*/
+#ifndef SHA1_H_A545E61D43E9404E8D736869AB3CBFE7
+#define SHA1_H_A545E61D43E9404E8D736869AB3CBFE7
+#if !defined(SHA1_UTILITY_FUNCTIONS) && !defined(SHA1_NO_UTILITY_FUNCTIONS)
+#define SHA1_UTILITY_FUNCTIONS
+#endif
+#if !defined(SHA1_STL_FUNCTIONS) && !defined(SHA1_NO_STL_FUNCTIONS)
+#define SHA1_STL_FUNCTIONS
+#if !defined(SHA1_UTILITY_FUNCTIONS)
+#error STL functions require SHA1_UTILITY_FUNCTIONS.
+#endif
+#endif
+#include <memory.h>
+#include <limits.h>
+#ifdef SHA1_UTILITY_FUNCTIONS
+#include <stdio.h>
+#include <string.h>
+#endif
+#ifdef SHA1_STL_FUNCTIONS
+#include <string>
+#endif
+#ifdef _MSC_VER
+#include <stdlib.h>
+#endif
+// You can define the endian mode in your files without modifying the SHA-1
+// source files. Just #define SHA1_LITTLE_ENDIAN or #define SHA1_BIG_ENDIAN
+// in your files, before including the SHA1.h header file. If you don't
+// define anything, the class defaults to little endian.
+#if !defined(SHA1_LITTLE_ENDIAN) && !defined(SHA1_BIG_ENDIAN)
+#define SHA1_LITTLE_ENDIAN
+#endif
+// If you want variable wiping, #define SHA1_WIPE_VARIABLES, if not,
+// #define SHA1_NO_WIPE_VARIABLES. If you don't define anything, it
+// defaults to wiping.
+#if !defined(SHA1_WIPE_VARIABLES) && !defined(SHA1_NO_WIPE_VARIABLES)
+#define SHA1_WIPE_VARIABLES
+#endif
+#if defined(SHA1_HAS_TCHAR)
+#include <tchar.h>
+#else
+#ifdef _MSC_VER
+#include <tchar.h>
+#else
+#ifndef TCHAR
+#define TCHAR char
+#endif
+#ifndef _T
+#define _T(__x) (__x)
+#define _tmain main
+#define _tprintf printf
+#define _getts gets
+#define _tcslen strlen
+#define _tfopen fopen
+#define _tcscpy strcpy
+#define _tcscat strcat
+#define _sntprintf snprintf
+#endif
+#endif
+#endif
+///////////////////////////////////////////////////////////////////////////
+// Define variable types
+#ifndef UINT_8
+#ifdef _MSC_VER // Compiling with Microsoft compiler
+#define UINT_8 unsigned __int8
+#else // !_MSC_VER
+#define UINT_8 unsigned char
+#endif // _MSC_VER
+#endif
+#ifndef UINT_32
+#ifdef _MSC_VER // Compiling with Microsoft compiler
+#define UINT_32 unsigned __int32
+#else // !_MSC_VER
+#if (ULONG_MAX == 0xFFFFFFFFUL)
+#define UINT_32 unsigned long
+#else
+#define UINT_32 unsigned int
+#endif
+#endif // _MSC_VER
+#endif // UINT_32
+#ifndef INT_64
+#ifdef _MSC_VER // Compiling with Microsoft compiler
+#define INT_64 __int64
+#else // !_MSC_VER
+#define INT_64 long long
+#endif // _MSC_VER
+#endif // INT_64
+#ifndef UINT_64
+#ifdef _MSC_VER // Compiling with Microsoft compiler
+#define UINT_64 unsigned __int64
+#else // !_MSC_VER
+#define UINT_64 unsigned long long
+#endif // _MSC_VER
+#endif // UINT_64
+///////////////////////////////////////////////////////////////////////////
+// Declare SHA-1 workspace
+typedef union
+{
+	UINT_8 c[64];
+	UINT_32 l[16];
+} SHA1_WORKSPACE_BLOCK;
+class CSHA1
+{
+public:
+#ifdef SHA1_UTILITY_FUNCTIONS
+	// Different formats for ReportHash(Stl)
+	enum REPORT_TYPE
+	{
+		REPORT_HEX = 0,
+		REPORT_DIGIT = 1,
+		REPORT_HEX_SHORT = 2
+	};
+#endif
+	// Constructor and destructor
+	CSHA1();
+#ifdef SHA1_WIPE_VARIABLES
+	~CSHA1();
+#endif
+	void Reset();
+	// Hash in binary data and strings
+	void Update(const UINT_8* pbData, UINT_32 uLen);
+#ifdef SHA1_UTILITY_FUNCTIONS
+	// Hash in file contents
+	bool HashFile(const TCHAR* tszFileName);
+#endif
+	// Finalize hash; call it before using ReportHash(Stl)
+	void Final();
+#ifdef SHA1_UTILITY_FUNCTIONS
+	bool ReportHash(TCHAR* tszReport, REPORT_TYPE rtReportType = REPORT_HEX) const;
+#endif
+#ifdef SHA1_STL_FUNCTIONS
+	bool ReportHashStl(std::basic_string<TCHAR>& strOut, REPORT_TYPE rtReportType =
+		REPORT_HEX) const;
+#endif
+	// Get the raw message digest (20 bytes)
+	bool GetHash(UINT_8* pbDest20) const;
+private:
+	// Private SHA-1 transformation
+	void Transform(UINT_32* pState, const UINT_8* pBuffer);
+	// Member variables
+	UINT_32 m_state[5];
+	UINT_32 m_count[2];
+	UINT_32 m_reserved0[1]; // Memory alignment padding
+	UINT_8 m_buffer[64];
+	UINT_8 m_digest[20];
+	UINT_32 m_reserved1[3]; // Memory alignment padding
+	UINT_8 m_workspace[64];
+	SHA1_WORKSPACE_BLOCK* m_block; // SHA1 pointer to the byte array above
+};
+#endif // SHA1_H_A545E61D43E9404E8D736869AB3CBFE7
--- a/libraries/csha1/src/SHA1.cpp
+++ b/libraries/csha1/src/SHA1.cpp
+/*
+  100% free public domain implementation of the SHA-1 algorithm
+  by Dominik Reichl <dominik.reichl@t-online.de>
+  Web: http://www.dominik-reichl.de/
+  See header file for version history and test vectors.
+*/
+// If compiling with MFC, you might want to add #include "StdAfx.h"
+#define _CRT_SECURE_NO_WARNINGS
+#include "SHA1.h"
+#define SHA1_MAX_FILE_BUFFER (32 * 20 * 820)
+// Rotate p_val32 by p_nBits bits to the left
+#ifndef ROL32
+#ifdef _MSC_VER
+#define ROL32(p_val32,p_nBits) _rotl(p_val32,p_nBits)
+#else
+#define ROL32(p_val32,p_nBits) (((p_val32)<<(p_nBits))|((p_val32)>>(32-(p_nBits))))
+#endif
+#endif
+#ifdef SHA1_LITTLE_ENDIAN
+#define SHABLK0(i) (m_block->l[i] = \
+	(ROL32(m_block->l[i],24) & 0xFF00FF00) | (ROL32(m_block->l[i],8) & 0x00FF00FF))
+#else
+#define SHABLK0(i) (m_block->l[i])
+#endif
+#define SHABLK(i) (m_block->l[i&15] = ROL32(m_block->l[(i+13)&15] ^ \
+	m_block->l[(i+8)&15] ^ m_block->l[(i+2)&15] ^ m_block->l[i&15],1))
+// SHA-1 rounds
+#define S_R0(v,w,x,y,z,i) {z+=((w&(x^y))^y)+SHABLK0(i)+0x5A827999+ROL32(v,5);w=ROL32(w,30);}
+#define S_R1(v,w,x,y,z,i) {z+=((w&(x^y))^y)+SHABLK(i)+0x5A827999+ROL32(v,5);w=ROL32(w,30);}
+#define S_R2(v,w,x,y,z,i) {z+=(w^x^y)+SHABLK(i)+0x6ED9EBA1+ROL32(v,5);w=ROL32(w,30);}
+#define S_R3(v,w,x,y,z,i) {z+=(((w|x)&y)|(w&x))+SHABLK(i)+0x8F1BBCDC+ROL32(v,5);w=ROL32(w,30);}
+#define S_R4(v,w,x,y,z,i) {z+=(w^x^y)+SHABLK(i)+0xCA62C1D6+ROL32(v,5);w=ROL32(w,30);}
+#pragma warning(push)
+// Disable compiler warning 'Conditional expression is constant'
+#pragma warning(disable: 4127)
+CSHA1::CSHA1()
+{
+	m_block = (SHA1_WORKSPACE_BLOCK*)m_workspace;
+	Reset();
+}
+#ifdef SHA1_WIPE_VARIABLES
+CSHA1::~CSHA1()
+{
+	Reset();
+}
+#endif
+void CSHA1::Reset()
+{
+	// SHA1 initialization constants
+	m_state[0] = 0x67452301;
+	m_state[1] = 0xEFCDAB89;
+	m_state[2] = 0x98BADCFE;
+	m_state[3] = 0x10325476;
+	m_state[4] = 0xC3D2E1F0;
+	m_count[0] = 0;
+	m_count[1] = 0;
+}
+void CSHA1::Transform(UINT_32* pState, const UINT_8* pBuffer)
+{
+	UINT_32 a = pState[0], b = pState[1], c = pState[2], d = pState[3], e = pState[4];
+	memcpy(m_block, pBuffer, 64);
+	// 4 rounds of 20 operations each, loop unrolled
+	S_R0(a,b,c,d,e, 0); S_R0(e,a,b,c,d, 1); S_R0(d,e,a,b,c, 2); S_R0(c,d,e,a,b, 3);
+	S_R0(b,c,d,e,a, 4); S_R0(a,b,c,d,e, 5); S_R0(e,a,b,c,d, 6); S_R0(d,e,a,b,c, 7);
+	S_R0(c,d,e,a,b, 8); S_R0(b,c,d,e,a, 9); S_R0(a,b,c,d,e,10); S_R0(e,a,b,c,d,11);
+	S_R0(d,e,a,b,c,12); S_R0(c,d,e,a,b,13); S_R0(b,c,d,e,a,14); S_R0(a,b,c,d,e,15);
+	S_R1(e,a,b,c,d,16); S_R1(d,e,a,b,c,17); S_R1(c,d,e,a,b,18); S_R1(b,c,d,e,a,19);
+	S_R2(a,b,c,d,e,20); S_R2(e,a,b,c,d,21); S_R2(d,e,a,b,c,22); S_R2(c,d,e,a,b,23);
+	S_R2(b,c,d,e,a,24); S_R2(a,b,c,d,e,25); S_R2(e,a,b,c,d,26); S_R2(d,e,a,b,c,27);
+	S_R2(c,d,e,a,b,28); S_R2(b,c,d,e,a,29); S_R2(a,b,c,d,e,30); S_R2(e,a,b,c,d,31);
+	S_R2(d,e,a,b,c,32); S_R2(c,d,e,a,b,33); S_R2(b,c,d,e,a,34); S_R2(a,b,c,d,e,35);
+	S_R2(e,a,b,c,d,36); S_R2(d,e,a,b,c,37); S_R2(c,d,e,a,b,38); S_R2(b,c,d,e,a,39);
+	S_R3(a,b,c,d,e,40); S_R3(e,a,b,c,d,41); S_R3(d,e,a,b,c,42); S_R3(c,d,e,a,b,43);
+	S_R3(b,c,d,e,a,44); S_R3(a,b,c,d,e,45); S_R3(e,a,b,c,d,46); S_R3(d,e,a,b,c,47);
+	S_R3(c,d,e,a,b,48); S_R3(b,c,d,e,a,49); S_R3(a,b,c,d,e,50); S_R3(e,a,b,c,d,51);
+	S_R3(d,e,a,b,c,52); S_R3(c,d,e,a,b,53); S_R3(b,c,d,e,a,54); S_R3(a,b,c,d,e,55);
+	S_R3(e,a,b,c,d,56); S_R3(d,e,a,b,c,57); S_R3(c,d,e,a,b,58); S_R3(b,c,d,e,a,59);
+	S_R4(a,b,c,d,e,60); S_R4(e,a,b,c,d,61); S_R4(d,e,a,b,c,62); S_R4(c,d,e,a,b,63);
+	S_R4(b,c,d,e,a,64); S_R4(a,b,c,d,e,65); S_R4(e,a,b,c,d,66); S_R4(d,e,a,b,c,67);
+	S_R4(c,d,e,a,b,68); S_R4(b,c,d,e,a,69); S_R4(a,b,c,d,e,70); S_R4(e,a,b,c,d,71);
+	S_R4(d,e,a,b,c,72); S_R4(c,d,e,a,b,73); S_R4(b,c,d,e,a,74); S_R4(a,b,c,d,e,75);
+	S_R4(e,a,b,c,d,76); S_R4(d,e,a,b,c,77); S_R4(c,d,e,a,b,78); S_R4(b,c,d,e,a,79);
+	// Add the working vars back into state
+	pState[0] += a;
+	pState[1] += b;
+	pState[2] += c;
+	pState[3] += d;
+	pState[4] += e;
+	// Wipe variables
+#ifdef SHA1_WIPE_VARIABLES
+	a = b = c = d = e = 0;
+#endif
+}
+void CSHA1::Update(const UINT_8* pbData, UINT_32 uLen)
+{
+	UINT_32 j = ((m_count[0] >> 3) & 0x3F);
+	if((m_count[0] += (uLen << 3)) < (uLen << 3))
+		++m_count[1]; // Overflow
+	m_count[1] += (uLen >> 29);
+	UINT_32 i;
+	if((j + uLen) > 63)
+	{
+		i = 64 - j;
+		memcpy(&m_buffer[j], pbData, i);
+		Transform(m_state, m_buffer);
+		for( ; (i + 63) < uLen; i += 64)
+			Transform(m_state, &pbData[i]);
+		j = 0;
+	}
+	else i = 0;
+	if((uLen - i) != 0)
+		memcpy(&m_buffer[j], &pbData[i], uLen - i);
+}
+#ifdef SHA1_UTILITY_FUNCTIONS
+bool CSHA1::HashFile(const TCHAR* tszFileName)
+{
+	if(tszFileName == NULL) return false;
+	FILE* fpIn = _tfopen(tszFileName, _T("rb"));
+	if(fpIn == NULL) return false;
+	UINT_8* pbData = new UINT_8[SHA1_MAX_FILE_BUFFER];
+	if(pbData == NULL) { fclose(fpIn); return false; }
+	bool bSuccess = true;
+	while(true)
+	{
+		const size_t uRead = fread(pbData, 1, SHA1_MAX_FILE_BUFFER, fpIn);
+		if(uRead > 0)
+			Update(pbData, static_cast<UINT_32>(uRead));
+		if(uRead < SHA1_MAX_FILE_BUFFER)
+		{
+			if(feof(fpIn) == 0) bSuccess = false;
+			break;
+		}
+	}
+	fclose(fpIn);
+	delete[] pbData;
+	return bSuccess;
+}
+#endif
+void CSHA1::Final()
+{
+	UINT_32 i;
+	UINT_8 pbFinalCount[8];
+	for(i = 0; i < 8; ++i)
+		pbFinalCount[i] = static_cast<UINT_8>((m_count[((i >= 4) ? 0 : 1)] >>
+			((3 - (i & 3)) * 8) ) & 0xFF); // Endian independent
+	Update((UINT_8*)"\200", 1);
+	while((m_count[0] & 504) != 448)
+		Update((UINT_8*)"\0", 1);
+	Update(pbFinalCount, 8); // Cause a Transform()
+	for(i = 0; i < 20; ++i)
+		m_digest[i] = static_cast<UINT_8>((m_state[i >> 2] >> ((3 -
+			(i & 3)) * 8)) & 0xFF);
+	// Wipe variables for security reasons
+#ifdef SHA1_WIPE_VARIABLES
+	memset(m_buffer, 0, 64);
+	memset(m_state, 0, 20);
+	memset(m_count, 0, 8);
+	memset(pbFinalCount, 0, 8);
+	Transform(m_state, m_buffer);
+#endif
+}
+#ifdef SHA1_UTILITY_FUNCTIONS
+bool CSHA1::ReportHash(TCHAR* tszReport, REPORT_TYPE rtReportType) const
+{
+	if(tszReport == NULL) return false;
+	TCHAR tszTemp[16];
+	if((rtReportType == REPORT_HEX) || (rtReportType == REPORT_HEX_SHORT))
+	{
+		_sntprintf(tszTemp, 15, _T("%02X"), m_digest[0]);
+		_tcscpy(tszReport, tszTemp);
+		const TCHAR* lpFmt = ((rtReportType == REPORT_HEX) ? _T(" %02X") : _T("%02X"));
+		for(size_t i = 1; i < 20; ++i)
+		{
+			_sntprintf(tszTemp, 15, lpFmt, m_digest[i]);
+			_tcscat(tszReport, tszTemp);
+		}
+	}
+	else if(rtReportType == REPORT_DIGIT)
+	{
+		_sntprintf(tszTemp, 15, _T("%u"), m_digest[0]);
+		_tcscpy(tszReport, tszTemp);
+		for(size_t i = 1; i < 20; ++i)
+		{
+			_sntprintf(tszTemp, 15, _T(" %u"), m_digest[i]);
+			_tcscat(tszReport, tszTemp);
+		}
+	}
+	else return false;
+	return true;
+}
+#endif
+#ifdef SHA1_STL_FUNCTIONS
+bool CSHA1::ReportHashStl(std::basic_string<TCHAR>& strOut, REPORT_TYPE rtReportType) const
+{
+	TCHAR tszOut[84];
+	const bool bResult = ReportHash(tszOut, rtReportType);
+	if(bResult) strOut = tszOut;
+	return bResult;
+}
+#endif
+bool CSHA1::GetHash(UINT_8* pbDest20) const
+{
+	if(pbDest20 == NULL) return false;
+	memcpy(pbDest20, m_digest, 20);
+	return true;
+}
+#pragma warning(pop)
\ No newline at end of file
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -515,7 +515,7 @@ private:
    int numAtomBlocks;
    int numThreadBlocks;
    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered;
-    std::string compiler, tempDir, gpuArchitecture;
+    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxSize, invPeriodicBoxSize;
    std::string defaultOptimizationOptions;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -35,6 +35,7 @@
 #include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
+#include "SHA1.h"
 #include "hilbert.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/Platform.h"
@@ -44,6 +45,7 @@
 #include <algorithm>
 #include <cstdlib>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <sstream>
 #include <typeinfo>
@@ -90,10 +92,14 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    }
    else
        throw OpenMMException("Illegal value for CudaPrecision: "+precision);
+    char* cacheVariable = getenv("OPENMM_CACHE_DIR");
+    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
 #ifdef WIN32
    this->tempDir = tempDir+"\\";
+    cacheDir = cacheDir+"\\";
 #else
    this->tempDir = tempDir+"/";
+    cacheDir = cacheDir+"/";
 #endif
    contextIndex = platformData.contexts.size();
    int numDevices;
@@ -350,6 +356,7 @@ static bool compileInWindows(const string &command) {
 #endif
 CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
+    string bits = intToString(8*sizeof(void*));
    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
    stringstream src;
    if (!options.empty())
@@ -397,6 +404,23 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        src << endl;
    src << source << endl;
+    // See whether we already have PTX for this kernel cached.
+    CSHA1 sha1;
+    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
+    sha1.Final();
+    UINT_8 hash[20];
+    sha1.GetHash(hash);
+    stringstream cacheFile;
+    cacheFile << cacheDir;
+    cacheFile.flags(ios::hex);
+    for (int i = 0; i < 20; i++)
+        cacheFile << setw(2) << setfill('0') << (int) hash[i];
+    cacheFile << '_' << gpuArchitecture << '_' << bits;
+    CUmodule module;
+    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
+        return module;
    // Write out the source to a temporary file.
    stringstream tempFileName;
@@ -412,7 +436,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    ofstream out(inputFile.c_str());
    out << src.str();
    out.close();
-    string bits = intToString(8*sizeof(void*));
 #ifdef WIN32
 #ifdef _DEBUG
    string command = "\""+compiler+"\" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
@@ -441,7 +464,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
            }
            throw OpenMMException(error.str());
        }
-        CUmodule module;
        CUresult result = cuModuleLoad(&module, outputFile.c_str());
        if (result != CUDA_SUCCESS) {
            std::stringstream m;
@@ -449,6 +471,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
            throw OpenMMException(m.str());
        }
        remove(inputFile.c_str());
+        if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0)
            remove(outputFile.c_str());
        remove(logFile.c_str());
        return module;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -388,7 +388,6 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
        map<string, string> defines;
-        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
        CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
        applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
    }
@@ -396,7 +395,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
    cu.clearBuffer(integration.getPosDelta());
    integration.applyConstraints(tol);
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    void* args[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
+    int numAtoms = cu.getNumAtoms();
+    void* args[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
    cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
    integration.computeVirtualSites();
 }
@@ -4156,8 +4156,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
    cu.getPlatformData().initializeContexts(system);
    cu.setAsCurrent();
    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
    kernel1 = cu.getKernel(module, "integrateVerletPart1");
    kernel2 = cu.getKernel(module, "integrateVerletPart2");
@@ -4168,6 +4166,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
    double dt = integrator.getStepSize();
    if (dt != prevStepSize) {
        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
@@ -4186,7 +4185,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
    // Call the first integration kernel.
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
+    void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel1, args1, numAtoms);
@@ -4196,7 +4195,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
    // Call the second integration kernel.
-    void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
+    void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4223,8 +4222,6 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
@@ -4236,6 +4233,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
    double stepSize = integrator.getStepSize();
@@ -4273,7 +4271,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
    // Call the first integration kernel.
    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-    void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+    void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
            &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
    cu.executeKernel(kernel1, args1, numAtoms);
@@ -4284,7 +4282,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
    // Call the second integration kernel.
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
+    void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4308,8 +4306,6 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro
    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, "");
    kernel1 = cu.getKernel(module, "integrateBrownianPart1");
    kernel2 = cu.getKernel(module, "integrateBrownianPart2");
@@ -4320,6 +4316,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
    double stepSize = integrator.getStepSize();
@@ -4334,7 +4331,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
    // Call the first integration kernel.
    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-    void* args1[] = {useDouble ? (void*) &tauDt : (void*) &tauDtFloat,
+    void* args1[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &tauDt : (void*) &tauDtFloat,
            useDouble ? (void*) &noise : (void*) &noiseFloat,
            &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
@@ -4347,7 +4344,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
    // Call the second integration kernel.
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    void* args2[] = {useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
+    void* args2[] = {&numAtoms, useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
            &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4370,8 +4367,6 @@ void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, con
    cu.getPlatformData().initializeContexts(system);
    cu.setAsCurrent();
    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
    kernel1 = cu.getKernel(module, "integrateVerletPart1");
    kernel2 = cu.getKernel(module, "integrateVerletPart2");
@@ -4383,6 +4378,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
    // Select the step size to use.
@@ -4391,7 +4387,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
    double tol = integrator.getErrorTolerance();
    float tolFloat = (float) tol;
    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
-    void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
+    void* argsSelect[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
            useDouble ? (void*) &tol : (void*) &tolFloat,
            &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()};
@@ -4401,7 +4397,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
    // Call the first integration kernel.
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
+    void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel1, args1, numAtoms);
@@ -4411,7 +4407,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
    // Call the second integration kernel.
-    void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
+    void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4456,8 +4452,6 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
@@ -4471,6 +4465,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
    // Select the step size to use.
@@ -4483,7 +4478,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
    double kT = BOLTZ*integrator.getTemperature();
    float kTFloat = (float) kT;
    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
-    void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
+    void* argsSelect[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
            useDouble ? (void*) &tol : (void*) &tolFloat,
            useDouble ? (void*) &tau : (void*) &tauFloat,
            useDouble ? (void*) &kT : (void*) &kTFloat,
@@ -4495,7 +4490,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
    // Call the first integration kernel.
    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-    void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+    void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
            &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
    cu.executeKernel(kernel1, args1, numAtoms);
@@ -4506,7 +4501,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
    // Call the second integration kernel.
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
+    void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -5369,7 +5364,6 @@ void CudaApplyAndersenThermostatKernel::initialize(const System& system, const A
    cu.setAsCurrent();
    randomSeed = thermostat.getRandomNumberSeed();
    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
    CUmodule module = cu.createModule(CudaKernelSources::andersenThermostat, defines);
    kernel = cu.getKernel(module, "applyAndersenThermostat");
    cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed);
@@ -5391,7 +5385,8 @@ void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) {
    float frequency = (float) context.getParameter(AndersenThermostat::CollisionFrequency());
    float kT = (float) (BOLTZ*context.getParameter(AndersenThermostat::Temperature()));
    int randomIndex = cu.getIntegrationUtilities().prepareRandomNumbers(cu.getPaddedNumAtoms());
-    void* args[] = {&frequency, &kT, &cu.getVelm().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
+    int numAtoms = cu.getNumAtoms();
+    void* args[] = {&numAtoms, &frequency, &kT, &cu.getVelm().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
            &cu.getIntegrationUtilities().getRandom().getDevicePointer(), &randomIndex, &atomGroups->getDevicePointer()};
    cu.executeKernel(kernel, args, cu.getNumAtoms());
 }

--- a/platforms/cuda/src/kernels/andersenThermostat.cu
+++ b/platforms/cuda/src/kernels/andersenThermostat.cu
@@ -2,11 +2,11 @@
 * Apply the Andersen thermostat to adjust particle velocities.
 */
-extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
+extern "C" __global__ void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
        unsigned int randomIndex, const int* __restrict__ atomGroups) {
    float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
    float randomRange = erff(collisionProbability/sqrtf(2.0f));
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];

--- a/platforms/cuda/src/kernels/brownian.cu
+++ b/platforms/cuda/src/kernels/brownian.cu
@@ -2,16 +2,16 @@
 * Perform the first step of Brownian integration.
 */
-extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
+extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
        mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
    randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
    const mixed fscale = tauDeltaT/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed invMass = velm[index].w;
        if (invMass != 0) {
            posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
-            posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
+            posDelta[index].y = fscale*invMass*force[index+paddedNumAtoms] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
-            posDelta[index].z = fscale*invMass*force[index+PADDED_NUM_ATOMS*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
+            posDelta[index].z = fscale*invMass*force[index+paddedNumAtoms*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
        }
        randomIndex += blockDim.x*gridDim.x;
    }
@@ -21,9 +21,9 @@ extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAm
 * Perform the second step of Brownian integration.
 */
-extern "C" __global__ void integrateBrownianPart2(mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
+extern "C" __global__ void integrateBrownianPart2(int numAtoms, mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
    const mixed oneOverDeltaT = RECIP(deltaT);
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        if (velm[index].w != 0) {
            mixed4 delta = posDelta[index];
            velm[index].x = oneOverDeltaT*delta.x;

--- a/platforms/cuda/src/kernels/constraints.cu
+++ b/platforms/cuda/src/kernels/constraints.cu
-extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
+extern "C" __global__ void applyPositionDeltas(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
 #ifdef USE_MIXED_PRECISION
        real4 pos1 = posq[index];
        real4 pos2 = posqCorrection[index];

--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -4,7 +4,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
 * Perform the first step of Langevin integration.
 */
-extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
+extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
        const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
    mixed vscale = paramBuffer[VelScale];
    mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
@@ -12,13 +12,13 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
    mixed stepSize = dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    randomIndex += index;
-    while (index < NUM_ATOMS) {
+    while (index < numAtoms) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0) {
            mixed sqrtInvMass = SQRT(velocity.w);
            velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
-            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y;
+            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+paddedNumAtoms] + noisescale*sqrtInvMass*random[randomIndex].y;
-            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z;
+            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+paddedNumAtoms*2] + noisescale*sqrtInvMass*random[randomIndex].z;
            velm[index] = velocity;
            posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
        }
@@ -31,7 +31,7 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
 * Perform the second step of Langevin integration.
 */
-extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
+extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
 #if __CUDA_ARCH__ >= 130
    double invStepSize = 1.0/dt[0].y;
 #else
@@ -39,7 +39,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
 #endif
    int index = blockIdx.x*blockDim.x+threadIdx.x;
-    while (index < NUM_ATOMS) {
+    while (index < numAtoms) {
        mixed4 vel = velm[index];
        if (vel.w != 0) {
 #ifdef USE_MIXED_PRECISION
@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
 * Select the step size to use for the next step.
 */
-extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
+extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
        const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
    // Calculate the error.
@@ -87,8 +87,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
    mixed err = 0;
    unsigned int index = threadIdx.x;
    const mixed scale = RECIP((mixed) 0x100000000);
-    while (index < NUM_ATOMS) {
+    while (index < numAtoms) {
-        mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
+        mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
        index += blockDim.x*gridDim.x;
@@ -106,7 +106,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
        // Select the new step size.
-        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed totalError = SQRT(error[0]/(numAtoms*3));
        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -2,13 +2,13 @@
 * Perform the first step of Verlet integration.
 */
-extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, const real4* __restrict__ posq,
+extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
        const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
    const mixed2 stepSize = dt[0];
    const mixed dtPos = stepSize.y;
    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
    const mixed scale = dtVel/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
 #ifdef USE_MIXED_PRECISION
@@ -19,8 +19,8 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
            real4 pos = posq[index];
 #endif
            velocity.x += scale*force[index]*velocity.w;
-            velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
+            velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
-            velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
+            velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
            pos.x = velocity.x*dtPos;
            pos.y = velocity.y*dtPos;
            pos.z = velocity.z*dtPos;
@@ -34,7 +34,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
 * Perform the second step of Verlet integration.
 */
-extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
+extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
    mixed2 stepSize = dt[0];
 #if __CUDA_ARCH__ >= 130
@@ -46,7 +46,7 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index == 0)
        dt[0].x = stepSize.y;
-    for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
 #ifdef USE_MIXED_PRECISION
@@ -80,14 +80,14 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
 * Select the step size to use for the next step.
 */
-extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
+extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
    // Calculate the error.
    extern __shared__ mixed error[];
    mixed err = 0.0f;
    const mixed scale = RECIP((mixed) 0x100000000);
-    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
+        mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
    }
@@ -102,7 +102,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
        __syncthreads();
    }
    if (threadIdx.x == 0) {
-        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed totalError = SQRT(error[0]/(numAtoms*3));
        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)