Fix for isolated ions

41abd9fb · Mark Friedrichs · 761d7e17 · 41abd9fb · 41abd9fb · 41abd9fb
Commit 41abd9fb authored Apr 07, 2011 by Mark Friedrichs
7 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -93,7 +93,7 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
    if( mapIterator == contextToAmoebaDataMap.end() ){
        amoebaCudaData                         = new AmoebaCudaData( cudaPlatformData );
        contextToAmoebaDataMap[&context]       = amoebaCudaData;
-        amoebaCudaData->setLog( stderr );
+        //amoebaCudaData->setLog( stderr );
        amoebaCudaData->setContextImpl( static_cast<void*>(&context) );
    } else {
        amoebaCudaData                         = mapIterator->second;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -86,13 +86,14 @@ amoebaGpuContext amoebaGpuInit( _gpuContext* gpu )
 extern "C"
 void gpuPrintCudaStream( std::string name,
                         unsigned int length, unsigned int subStreams, unsigned int stride,
+                         unsigned int memoryFootprint,
                         void*  pSysStream, void* pDevStream,
                         void*  pSysData,   void* pDevData, FILE* log)
 {
   
-    (void) fprintf( log, "     %-35s [%8u %5u %8u] Stream[%p %p] Data[%16p %16p]\n",
+    (void) fprintf( log, "     %-35s [%8u %5u %8u %8u] Stream[%p %p] Data[%16p %16p]\n",
                    name.c_str(), length, subStreams,
-                    stride, pSysStream, pDevStream, pSysData, pDevData );
+                    stride, memoryFootprint, pSysStream, pDevStream, pSysData, pDevData );
 }

 extern "C"
@@ -102,6 +103,7 @@ void gpuPrintCudaStreamFloat( CUDAStream<float>* cUDAStream, FILE* log )
    if( cUDAStream == NULL )return;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
+                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
 }
@@ -113,6 +115,7 @@ void gpuPrintCudaStreamFloat2( CUDAStream<float2>* cUDAStream, FILE* log )
    if( cUDAStream == NULL )return;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
+                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
 }
@@ -124,6 +127,7 @@ void gpuPrintCudaStreamFloat4( CUDAStream<float4>* cUDAStream, FILE* log )
    if( cUDAStream == NULL )return;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
+                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
 }
@@ -135,6 +139,7 @@ void gpuPrintCudaStreamUnsignedInt( CUDAStream<unsigned int>* cUDAStream, FILE*
    if( cUDAStream == NULL )return;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
+                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
 }
@@ -146,6 +151,7 @@ void gpuPrintCudaStreamInt( CUDAStream<int>* cUDAStream, FILE* log )
    if( cUDAStream == NULL )return;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
+                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
 }
@@ -157,6 +163,7 @@ void gpuPrintCudaStreamInt2( CUDAStream<int2>* cUDAStream, FILE* log )
    if( cUDAStream == NULL )return;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
+                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
 }
@@ -168,6 +175,7 @@ void gpuPrintCudaStreamInt4( CUDAStream<int4>* cUDAStream, FILE* log )
    if( cUDAStream == NULL )return;
    gpuPrintCudaStream( cUDAStream->_name.c_str(),
                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
+                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ),
                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
 }
@@ -1277,15 +1285,6 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )
   
    int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms=%d\n",
-                        methodName.c_str(), paddedNumberOfAtoms ); (void) fflush( amoebaGpu->log );
-    }
-#endif
-
-    // work space
- 
    // parameters
 
    amoebaGpu->psMultipoleParticlesIdsAndAxisType                      = new CUDAStream<int4>(paddedNumberOfAtoms,    1, "MultipoleParticlesIdsAndAxisType");
@@ -1296,9 +1295,11 @@ static void gpuRotationToLabFrameAllocate( amoebaGpuContext amoebaGpu )

    amoebaGpu->psMolecularDipole                                       = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "MolecularDipole");
    amoebaGpu->amoebaSim.pMolecularDipole                              = amoebaGpu->psMolecularDipole->_pDevData;
+    memset( amoebaGpu->psMolecularDipole->_pSysData,      0, sizeof(float)*3*paddedNumberOfAtoms );

    amoebaGpu->psMolecularQuadrupole                                   = new CUDAStream<float>(9*paddedNumberOfAtoms, 1, "MolecularQuadrupole");
    amoebaGpu->amoebaSim.pMolecularQuadrupole                          = amoebaGpu->psMolecularQuadrupole->_pDevData;
+    memset( amoebaGpu->psMolecularQuadrupole->_pSysData,      0, sizeof(float)*9*paddedNumberOfAtoms );
 
    // output
 

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -47,38 +47,10 @@ typedef MapIntFloat::const_iterator MapIntFloatCI;
 struct _amoebaGpuContext {
    
    _gpuContext* gpuContext;
+    cudaAmoebaGmxSimulation amoebaSim;
 
    FILE* log;

-    //bool bOutputBufferPerWarp;
-    //unsigned int paddedNumberOfAtoms;
-    //unsigned int nonbondBlocks;
-    //unsigned int nonbondThreadsPerBlock;
-    //unsigned int nonbondOutputBuffers;
-    //unsigned int threadsPerBlock;
-    //unsigned int fieldReduceThreadsPerBlock;
-    //unsigned int outputBuffers; 
-    unsigned int workUnits; 
-
-    // workspace arrays
-
-    CUDAStream<float>*  psWorkArray_3_1; 
-    CUDAStream<float>*  psWorkArray_3_2; 
-    CUDAStream<float>*  psWorkArray_3_3; 
-    CUDAStream<float>*  psWorkArray_3_4; 
-
-    CUDAStream<float>*  psWorkArray_1_1; 
-    CUDAStream<float>*  psWorkArray_1_2; 
-
-    CUDAStream<unsigned int>*  psWorkUnit; 
-    CUDAStream<int>*  psScalingIndicesIndex; 
-    CUDAStream<int>*  ps_D_ScaleIndices; 
-    CUDAStream<int2>* ps_P_ScaleIndices; 
-    CUDAStream<int2>* ps_M_ScaleIndices; 
-
-    cudaAmoebaGmxSimulation amoebaSim;
-    int maxCovalentDegreeSz;
-
    CUDAStream<int4>*   psAmoebaBondID;
    CUDAStream<float2>* psAmoebaBondParameter;

@@ -116,6 +88,25 @@ struct _amoebaGpuContext {
    CUDAStream<int4>*   psAmoebaTorsionTorsionID3;
    CUDAStream<float4>* psAmoebaTorsionTorsionGrids;

+    unsigned int workUnits; 
+
+    // workspace arrays
+
+    CUDAStream<float>*  psWorkArray_3_1; 
+    CUDAStream<float>*  psWorkArray_3_2; 
+    CUDAStream<float>*  psWorkArray_3_3; 
+    CUDAStream<float>*  psWorkArray_3_4; 
+
+    CUDAStream<float>*  psWorkArray_1_1; 
+    CUDAStream<float>*  psWorkArray_1_2; 
+
+    CUDAStream<unsigned int>*  psWorkUnit; 
+    CUDAStream<int>*  psScalingIndicesIndex; 
+    CUDAStream<int>*  ps_D_ScaleIndices; 
+    CUDAStream<int2>* ps_P_ScaleIndices; 
+    CUDAStream<int2>* ps_M_ScaleIndices; 
+
+    int maxCovalentDegreeSz;
    float solventDielectric;

    // multipole parameters
@@ -126,7 +117,6 @@ struct _amoebaGpuContext {
    // buffer indices used for mapping torques onto forces 

    int maxTorqueBufferIndex;
-    int useNewTorqueMapScheme;
    int torqueMapForce4Delete;
    CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
    CUDAStream<float4>*  psTorqueMapForce4; 

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
@@ -6,7 +6,8 @@
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

-#define AMOEBA_DEBUG
+//#define AMOEBA_DEBUG
+#undef AMOEBA_DEBUG

 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
@@ -480,7 +481,7 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )

        // write results to file

-        if( 1 ){
+        if( 0 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1730,7 +1730,7 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
    }
    LAUNCHERROR("kReduceToBornForcePrefactor");

-#define AMOEBA_DEBUG
+//#define AMOEBA_DEBUG
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

@@ -1748,7 +1748,7 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
        }
        (void) fflush( amoebaGpu->log );
 */
-        if( 1 ){
+        if( 0 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
@@ -1943,7 +1943,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )

    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );

-    if( 1 ){
+    if( 0 ){
        std::vector<int> fileId;
        VectorOfDoubleVectors outputVector;
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
@@ -1956,7 +1956,7 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )

    kCalculateObcGbsaForces2( amoebaGpu->gpuContext );

-    if( 1 ){
+    if( 0 ){
        std::vector<int> fileId;
        VectorOfDoubleVectors outputVector;
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -33,8 +33,8 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

-#define AMOEBA_DEBUG
-//#undef AMOEBA_DEBUG
+//#define AMOEBA_DEBUG
+#undef AMOEBA_DEBUG

 #define GK
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"
@@ -923,7 +923,7 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
    }

 #ifdef AMOEBA_DEBUG
-    if( 1 ){
+    if( 0 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -90,8 +90,8 @@ void kCudaComputeCheckChiral_kernel( void )
    { 
        // skip z-then-x
    
-        int axisType                     = multiPoleParticles[particleIndex].w; 
-        if( axisType != 0 && multiPoleParticles[particleIndex].y >= 0 )
+        int axisType             = multiPoleParticles[particleIndex].w; 
+        if( axisType != 0 && multiPoleParticles[particleIndex].x >= 0 && multiPoleParticles[particleIndex].y >=0 && multiPoleParticles[particleIndex].z >= 0 )
        {
     
            // ---------------------------------------------------------------------------------------
@@ -164,204 +164,213 @@ void kCudaComputeLabFrameMoments_kernel( void )
    
    while( particleIndex < numberOfParticles )
    {
-        float4 coordinatesThisParticle   = particleCoord[particleIndex];
-     
-        int multipoleParticleIndex       = multiPoleParticles[particleIndex].z;
-        float4 coordinatesAxisParticle   = particleCoord[multipoleParticleIndex];
-     
-        vectorZ[0]                       = coordinatesAxisParticle.x - coordinatesThisParticle.x;
-        vectorZ[1]                       = coordinatesAxisParticle.y - coordinatesThisParticle.y;
-        vectorZ[2]                       = coordinatesAxisParticle.z - coordinatesThisParticle.z;
-          
-        multipoleParticleIndex           = multiPoleParticles[particleIndex].x; 
-        coordinatesAxisParticle          = particleCoord[multipoleParticleIndex];
-     
-        vectorX[0]                       = coordinatesAxisParticle.x - coordinatesThisParticle.x;
-        vectorX[1]                       = coordinatesAxisParticle.y - coordinatesThisParticle.y;
-        vectorX[2]                       = coordinatesAxisParticle.z - coordinatesThisParticle.z;
-     
-        int axisType                     = multiPoleParticles[particleIndex].w; 
-          
-        /*
-            z-only
-               (1) norm z
-               (2) select random x
-               (3) x = x - (x.z)z
-               (4) norm x
-    
-            z-then-x
-               (1) norm z
-               (2) norm x (not needed)
-               (3) x = x - (x.z)z
-               (4) norm x
-    
-            bisector
-               (1) norm z
-               (2) norm x 
-               (3) z = x + z
-               (4) norm z
-               (5) x = x - (x.z)z 
-               (6) norm x 
-    
-            z-bisect
-               (1) norm z
-               (2) norm x 
-               (3) norm y 
-               (3) x = x + y
-               (4) norm x
-               (5) x = x - (x.z)z 
-               (6) norm x 
-    
-            3-fold
-               (1) norm z
-               (2) norm x 
-               (3) norm y 
-               (4) z = x + y + z
-               (5) norm z
-               (6) x = x - (x.z)z 
-               (7) norm x 
-    
-        */
-    
-        // branch based on axis type
+
+        if( multiPoleParticles[particleIndex].x >= 0 && multiPoleParticles[particleIndex].z >= 0 )
+        {
+            float4 coordinatesThisParticle   = particleCoord[particleIndex];
         
-        float sum                        = normVector3( vectorZ );
-    
-        if( axisType == 1 ){
+            int multipoleParticleIndex       = multiPoleParticles[particleIndex].z;
+            float4 coordinatesAxisParticle   = particleCoord[multipoleParticleIndex];
+         
+            vectorZ[0]                       = coordinatesAxisParticle.x - coordinatesThisParticle.x;
+            vectorZ[1]                       = coordinatesAxisParticle.y - coordinatesThisParticle.y;
+            vectorZ[2]                       = coordinatesAxisParticle.z - coordinatesThisParticle.z;
+              
+            multipoleParticleIndex           = multiPoleParticles[particleIndex].x; 
+            coordinatesAxisParticle          = particleCoord[multipoleParticleIndex];
+         
+            vectorX[0]                       = coordinatesAxisParticle.x - coordinatesThisParticle.x;
+            vectorX[1]                       = coordinatesAxisParticle.y - coordinatesThisParticle.y;
+            vectorX[2]                       = coordinatesAxisParticle.z - coordinatesThisParticle.z;
+         
+            int axisType                     = multiPoleParticles[particleIndex].w; 
    
-            // bisector
+            /*
+                z-only
+                   (1) norm z
+                   (2) select random x
+                   (3) x = x - (x.z)z
+                   (4) norm x
+        
+                z-then-x
+                   (1) norm z
+                   (2) norm x (not needed)
+                   (3) x = x - (x.z)z
+                   (4) norm x
+        
+                bisector
+                   (1) norm z
+                   (2) norm x 
+                   (3) z = x + z
+                   (4) norm z
+                   (5) x = x - (x.z)z 
+                   (6) norm x 
+        
+                z-bisect
+                   (1) norm z
+                   (2) norm x 
+                   (3) norm y 
+                   (3) x = x + y
+                   (4) norm x
+                   (5) x = x - (x.z)z 
+                   (6) norm x 
+        
+                3-fold
+                   (1) norm z
+                   (2) norm x 
+                   (3) norm y 
+                   (4) z = x + y + z
+                   (5) norm z
+                   (6) x = x - (x.z)z 
+                   (7) norm x 
+        
+            */
+        
+            // branch based on axis type
+             
+            float sum                   = normVector3( vectorZ );
+        
+            if( axisType == 1 ){
+        
+                // bisector
+                
+                sum                     = normVector3( vectorX );
+                
+                vectorZ[0]             += vectorX[0];
+                vectorZ[1]             += vectorX[1];
+                vectorZ[2]             += vectorX[2];
+           
+                sum                     = normVector3( vectorZ );
+        
+            } else if( axisType == 2 || axisType == 3 ){ 
+         
+                // z-bisect
+        
+                multipoleParticleIndex  = multiPoleParticles[particleIndex].y; 
+                if( multipoleParticleIndex >= 0 && multipoleParticleIndex < cSim.atoms ){
+                    coordinatesAxisParticle = particleCoord[multipoleParticleIndex];
+                    vectorY[0]              = coordinatesAxisParticle.x - coordinatesThisParticle.x;
+                    vectorY[1]              = coordinatesAxisParticle.y - coordinatesThisParticle.y;
+                    vectorY[2]              = coordinatesAxisParticle.z - coordinatesThisParticle.z;
            
-            sum                     = normVector3( vectorX );
+                    sum                     = normVector3( vectorY );
+                    sum                     = normVector3( vectorX );
            
-            vectorZ[0]             += vectorX[0];
-            vectorZ[1]             += vectorX[1];
-            vectorZ[2]             += vectorX[2];
-       
-            sum                     = normVector3( vectorZ );
-    
-        } else if( axisType == 2 || axisType == 3 ){ 
-     
-            // z-bisect
-    
-            multipoleParticleIndex  = multiPoleParticles[particleIndex].y; 
-            coordinatesAxisParticle = particleCoord[multipoleParticleIndex];
-            vectorY[0]              = coordinatesAxisParticle.x - coordinatesThisParticle.x;
-            vectorY[1]              = coordinatesAxisParticle.y - coordinatesThisParticle.y;
-            vectorY[2]              = coordinatesAxisParticle.z - coordinatesThisParticle.z;
-    
-            sum                     = normVector3( vectorY );
-            sum                     = normVector3( vectorX );
-    
-            if( axisType == 2 ){
-    
-                vectorX[0]         += vectorY[0];
-                vectorX[1]         += vectorY[1];
-                vectorX[2]         += vectorY[2];
-                sum                 = normVector3( vectorX );
-     
-            } else { 
-     
-                // 3-fold
+                    if( axisType == 2 ){
+            
+                        vectorX[0]         += vectorY[0];
+                        vectorX[1]         += vectorY[1];
+                        vectorX[2]         += vectorY[2];
+                        sum                 = normVector3( vectorX );
+             
+                    } else { 
+             
+                        // 3-fold
+                
+                        vectorZ[0]         += vectorX[0] + vectorY[0];
+                        vectorZ[1]         += vectorX[1] + vectorY[1];
+                        vectorZ[2]         += vectorX[2] + vectorY[2];
+                        sum                 = normVector3( vectorZ );
+                    }
+                }
+         
+            } else if( axisType >= 4 ){ 
        
-                vectorZ[0]         += vectorX[0] + vectorY[0];
-                vectorZ[1]         += vectorX[1] + vectorY[1];
-                vectorZ[2]         += vectorX[2] + vectorY[2];
-                sum                 = normVector3( vectorZ );
+                vectorX[0]             = 0.1f;
+                vectorX[1]             = 0.1f;
+                vectorX[2]             = 0.1f;
            }
-     
-        } else if( axisType >= 4 ){ 
-    
-            vectorX[0]             = 0.1f;
-            vectorX[1]             = 0.1f;
-            vectorX[2]             = 0.1f;
-        }
+            
+            // x = x - (x.z)z
+        
+            float dot         = vectorZ[0]*vectorX[0] + vectorZ[1]*vectorX[1] + vectorZ[2]*vectorX[2];
+                
+            vectorX[0]       -= dot*vectorZ[0];
+            vectorX[1]       -= dot*vectorZ[1];
+            vectorX[2]       -= dot*vectorZ[2];
+             
+            sum               = normVector3( vectorX );
+        
+            vectorY[0]        = (vectorZ[1]*vectorX[2]) - (vectorZ[2]*vectorX[1]);
+            vectorY[1]        = (vectorZ[2]*vectorX[0]) - (vectorZ[0]*vectorX[2]);
+            vectorY[2]        = (vectorZ[0]*vectorX[1]) - (vectorZ[1]*vectorX[0]);
+         
+            // use identity rotation matrix for unrecognized axis types
+        
+            if( axisType < 0 || axisType > 4 ){
        
-        // x = x - (x.z)z
+                vectorX[0] = 1.0f;
+                vectorX[1] = 0.0f;
+                vectorX[2] = 0.0f;
+        
+                vectorY[0] = 0.0f;
+                vectorY[1] = 1.0f;
+                vectorY[2] = 0.0f;
+        
+                vectorZ[0] = 0.0f;
+                vectorZ[1] = 0.0f;
+                vectorZ[2] = 1.0f;
+            }
+        
+            unsigned int offset           = 3*particleIndex;
    
-        float dot         = vectorZ[0]*vectorX[0] + vectorZ[1]*vectorX[1] + vectorZ[2]*vectorX[2];
+            float molDipole[3];
+            molDipole[0]                  = labFrameDipole[offset];
+            molDipole[1]                  = labFrameDipole[offset+1];
+            molDipole[2]                  = labFrameDipole[offset+2];
            
-        vectorX[0]       -= dot*vectorZ[0];
-        vectorX[1]       -= dot*vectorZ[1];
-        vectorX[2]       -= dot*vectorZ[2];
+            // set out-of-range elements to 0.0f
         
-        sum               = normVector3( vectorX );
-    
-        vectorY[0]        = (vectorZ[1]*vectorX[2]) - (vectorZ[2]*vectorX[1]);
-        vectorY[1]        = (vectorZ[2]*vectorX[0]) - (vectorZ[0]*vectorX[2]);
-        vectorY[2]        = (vectorZ[0]*vectorX[1]) - (vectorZ[1]*vectorX[0]);
-     
-        // use identity rotation matrix for unrecognized axis types
+            labFrameDipole[offset]        = molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0];
+            labFrameDipole[offset+1]      = molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1];
+            labFrameDipole[offset+2]      = molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2];
+            
+            // ---------------------------------------------------------------------------------------
+            
+            float mPole[3][3];
+            offset                        = 9*particleIndex;
+            
+            mPole[0][0]                   = labFrameQuadrupole[offset];
+            mPole[0][1]                   = labFrameQuadrupole[offset+1];
+            mPole[0][2]                   = labFrameQuadrupole[offset+2];
+        
+            mPole[1][0]                   = labFrameQuadrupole[offset+3];
+            mPole[1][1]                   = labFrameQuadrupole[offset+4];
+            mPole[1][2]                   = labFrameQuadrupole[offset+5];
+        
+            mPole[2][0]                   = labFrameQuadrupole[offset+6];
+            mPole[2][1]                   = labFrameQuadrupole[offset+7];
+            mPole[2][2]                   = labFrameQuadrupole[offset+8];
+        
+            labFrameQuadrupole[offset+8]  = vectorX[2]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
+            labFrameQuadrupole[offset+8] += vectorY[2]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
+            labFrameQuadrupole[offset+8] += vectorZ[2]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
    
-        if( axisType < 0 || axisType > 4 ){
+            labFrameQuadrupole[offset+4]  = vectorX[1]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
+            labFrameQuadrupole[offset+4] += vectorY[1]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
+            labFrameQuadrupole[offset+4] += vectorZ[1]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);
    
-            vectorX[0] = 1.0f;
-            vectorX[1] = 0.0f;
-            vectorX[2] = 0.0f;
+            labFrameQuadrupole[offset+5]  = vectorX[1]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
+            labFrameQuadrupole[offset+5] += vectorY[1]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
+            labFrameQuadrupole[offset+5] += vectorZ[1]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
    
-            vectorY[0] = 0.0f;
-            vectorY[1] = 1.0f;
-            vectorY[2] = 0.0f;
+            labFrameQuadrupole[offset]    = vectorX[0]*(vectorX[0]*mPole[0][0] + vectorY[0]*mPole[0][1] + vectorZ[0]*mPole[0][2]);
+            labFrameQuadrupole[offset]   += vectorY[0]*(vectorX[0]*mPole[1][0] + vectorY[0]*mPole[1][1] + vectorZ[0]*mPole[1][2]);
+            labFrameQuadrupole[offset]   += vectorZ[0]*(vectorX[0]*mPole[2][0] + vectorY[0]*mPole[2][1] + vectorZ[0]*mPole[2][2]);
    
-            vectorZ[0] = 0.0f;
-            vectorZ[1] = 0.0f;
-            vectorZ[2] = 1.0f;
-        }
+            labFrameQuadrupole[offset+1]  = vectorX[0]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
+            labFrameQuadrupole[offset+1] += vectorY[0]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
+            labFrameQuadrupole[offset+1] += vectorZ[0]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);
    
-        float molDipole[3];
-        molDipole[0]                            = labFrameDipole[particleIndex*3];
-        molDipole[1]                            = labFrameDipole[particleIndex*3+1];
-        molDipole[2]                            = labFrameDipole[particleIndex*3+2];
-        
-        // set out-of-range elements to 0.0f
+            labFrameQuadrupole[offset+2]  = vectorX[0]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
+            labFrameQuadrupole[offset+2] += vectorY[0]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
+            labFrameQuadrupole[offset+2] += vectorZ[0]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
     
-        labFrameDipole[particleIndex*3]         = molDipole[0]*vectorX[0] + molDipole[1]*vectorY[0] + molDipole[2]*vectorZ[0];
-        labFrameDipole[particleIndex*3+1]       = molDipole[0]*vectorX[1] + molDipole[1]*vectorY[1] + molDipole[2]*vectorZ[1];
-        labFrameDipole[particleIndex*3+2]       = molDipole[0]*vectorX[2] + molDipole[1]*vectorY[2] + molDipole[2]*vectorZ[2];
-        
-        // ---------------------------------------------------------------------------------------
-        
-        float mPole[3][3];
-        unsigned int offset           = particleIndex*9;
-        
-        mPole[0][0]                   = labFrameQuadrupole[offset];
-        mPole[0][1]                   = labFrameQuadrupole[offset+1];
-        mPole[0][2]                   = labFrameQuadrupole[offset+2];
+            labFrameQuadrupole[offset+3]  = labFrameQuadrupole[offset+1];
+            labFrameQuadrupole[offset+6]  = labFrameQuadrupole[offset+2];
+            labFrameQuadrupole[offset+7]  = labFrameQuadrupole[offset+5];
    
-        mPole[1][0]                   = labFrameQuadrupole[offset+3];
-        mPole[1][1]                   = labFrameQuadrupole[offset+4];
-        mPole[1][2]                   = labFrameQuadrupole[offset+5];
-    
-        mPole[2][0]                   = labFrameQuadrupole[offset+6];
-        mPole[2][1]                   = labFrameQuadrupole[offset+7];
-        mPole[2][2]                   = labFrameQuadrupole[offset+8];
-    
-        labFrameQuadrupole[offset+8]  = vectorX[2]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
-        labFrameQuadrupole[offset+8] += vectorY[2]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
-        labFrameQuadrupole[offset+8] += vectorZ[2]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
-
-        labFrameQuadrupole[offset+4]  = vectorX[1]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
-        labFrameQuadrupole[offset+4] += vectorY[1]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
-        labFrameQuadrupole[offset+4] += vectorZ[1]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);
-
-        labFrameQuadrupole[offset+5]  = vectorX[1]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
-        labFrameQuadrupole[offset+5] += vectorY[1]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
-        labFrameQuadrupole[offset+5] += vectorZ[1]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
-
-        labFrameQuadrupole[offset]    = vectorX[0]*(vectorX[0]*mPole[0][0] + vectorY[0]*mPole[0][1] + vectorZ[0]*mPole[0][2]);
-        labFrameQuadrupole[offset]   += vectorY[0]*(vectorX[0]*mPole[1][0] + vectorY[0]*mPole[1][1] + vectorZ[0]*mPole[1][2]);
-        labFrameQuadrupole[offset]   += vectorZ[0]*(vectorX[0]*mPole[2][0] + vectorY[0]*mPole[2][1] + vectorZ[0]*mPole[2][2]);
-
-        labFrameQuadrupole[offset+1]  = vectorX[0]*(vectorX[1]*mPole[0][0] + vectorY[1]*mPole[0][1] + vectorZ[1]*mPole[0][2]);
-        labFrameQuadrupole[offset+1] += vectorY[0]*(vectorX[1]*mPole[1][0] + vectorY[1]*mPole[1][1] + vectorZ[1]*mPole[1][2]);
-        labFrameQuadrupole[offset+1] += vectorZ[0]*(vectorX[1]*mPole[2][0] + vectorY[1]*mPole[2][1] + vectorZ[1]*mPole[2][2]);
-
-        labFrameQuadrupole[offset+2]  = vectorX[0]*(vectorX[2]*mPole[0][0] + vectorY[2]*mPole[0][1] + vectorZ[2]*mPole[0][2]);
-        labFrameQuadrupole[offset+2] += vectorY[0]*(vectorX[2]*mPole[1][0] + vectorY[2]*mPole[1][1] + vectorZ[2]*mPole[1][2]);
-        labFrameQuadrupole[offset+2] += vectorZ[0]*(vectorX[2]*mPole[2][0] + vectorY[2]*mPole[2][1] + vectorZ[2]*mPole[2][2]);
- 
-        labFrameQuadrupole[offset+3]  = labFrameQuadrupole[offset+1];
-        labFrameQuadrupole[offset+6]  = labFrameQuadrupole[offset+2];
-        labFrameQuadrupole[offset+7]  = labFrameQuadrupole[offset+5];
+        }

        particleIndex                += gridDim.x*blockDim.x;
    }
@@ -384,28 +393,24 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )

 //#define AMOEBA_DEBUG  
 #ifdef AMOEBA_DEBUG
-    if( 0 && amoebaGpu->log ){
+    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log );
        amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download();
        amoebaGpu->psMolecularDipole->Download();
        gpu->psPosq4->Download();
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            int mIndex = 3*ii;
-             (void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d] x[%16.9e %16.9e %16.9e] dpl[%16.9e %16.9e %16.9e]\nRot[%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n\n", ii,
+             (void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d %6d] x[%16.9e %16.9e %16.9e] %s\n", ii,
                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
+                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
                             gpu->psPosq4->_pSysData[ii].x,
                             gpu->psPosq4->_pSysData[ii].y,
-                             gpu->psPosq4->_pSysData[ii].z,
-                             amoebaGpu->psMolecularDipole->_pSysData[mIndex],
-                             amoebaGpu->psMolecularDipole->_pSysData[mIndex+1],
-                             amoebaGpu->psMolecularDipole->_pSysData[mIndex+2] );
-            if( ii == 30 )ii = gpu->natoms - 30;
+                             gpu->psPosq4->_pSysData[ii].z, (amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w > 1 ? " XXX" : "") );
+            //if( ii == 30 )ii = gpu->natoms - 30;
        }
    }
-//    int64 kernelTime = AmoebaTiming::getTimeOfDay();
-    double kernelTime = 0.0;
 #endif

    // copy molecular moments to lab frame moment arrays
@@ -431,7 +436,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG

    cudaComputeAmoebaLabFrameMoments( amoebaGpu );

-    if( 1 ){
+    if( 0 ){
        gpuContext gpu                       = amoebaGpu->gpuContext;
        std::vector<int> fileId;
        //fileId.push_back( 0 );